git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - fs/btrfs/relocation.c
btrfs: backref: rename and move alloc_backref_node()
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
5d4f98a2
YZ
2/*
3 * Copyright (C) 2009 Oracle. All rights reserved.
5d4f98a2
YZ
4 */
5
6#include <linux/sched.h>
7#include <linux/pagemap.h>
8#include <linux/writeback.h>
9#include <linux/blkdev.h>
10#include <linux/rbtree.h>
5a0e3ad6 11#include <linux/slab.h>
726a3421 12#include <linux/error-injection.h>
5d4f98a2
YZ
13#include "ctree.h"
14#include "disk-io.h"
15#include "transaction.h"
16#include "volumes.h"
17#include "locking.h"
18#include "btrfs_inode.h"
19#include "async-thread.h"
0af3d00b 20#include "free-space-cache.h"
581bb050 21#include "inode-map.h"
62b99540 22#include "qgroup.h"
cdccee99 23#include "print-tree.h"
86736342 24#include "delalloc-space.h"
aac0023c 25#include "block-group.h"
19b546d7 26#include "backref.h"
e9a28dc5 27#include "misc.h"
5d4f98a2 28
0c891389
QW
29/*
30 * Relocation overview
31 *
32 * [What does relocation do]
33 *
34 * The objective of relocation is to relocate all extents of the target block
35 * group to other block groups.
 36 * This is used by resize (shrink only), profile conversion, space
 37 * compaction, and the balance routine to spread chunks over devices.
38 *
39 * Before | After
40 * ------------------------------------------------------------------
41 * BG A: 10 data extents | BG A: deleted
42 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
 43 * BG C: 1 extent  | BG C: 3 data extents (1 old + 2 relocated)
44 *
45 * [How does relocation work]
46 *
47 * 1. Mark the target block group read-only
48 * New extents won't be allocated from the target block group.
49 *
50 * 2.1 Record each extent in the target block group
51 * To build a proper map of extents to be relocated.
52 *
53 * 2.2 Build data reloc tree and reloc trees
54 * Data reloc tree will contain an inode, recording all newly relocated
55 * data extents.
56 * There will be only one data reloc tree for one data block group.
57 *
58 * Reloc tree will be a special snapshot of its source tree, containing
59 * relocated tree blocks.
60 * Each tree referring to a tree block in target block group will get its
61 * reloc tree built.
62 *
63 * 2.3 Swap source tree with its corresponding reloc tree
64 * Each involved tree only refers to new extents after swap.
65 *
66 * 3. Cleanup reloc trees and data reloc tree.
67 * As old extents in the target block group are still referenced by reloc
68 * trees, we need to clean them up before really freeing the target block
69 * group.
70 *
71 * The main complexity is in steps 2.2 and 2.3.
72 *
73 * The entry point of relocation is relocate_block_group() function.
74 */
75
2a979612 76#define RELOCATION_RESERVED_NODES 256
5d4f98a2
YZ
77/*
78 * map address of tree root to tree
79 */
80struct mapping_node {
e9a28dc5
QW
81 struct {
82 struct rb_node rb_node;
83 u64 bytenr;
 84 }; /* Use rb_simple_node for search/insert */
5d4f98a2
YZ
85 void *data;
86};
87
88struct mapping_tree {
89 struct rb_root rb_root;
90 spinlock_t lock;
91};
92
93/*
 94 * represents a tree block to be processed
95 */
96struct tree_block {
e9a28dc5
QW
97 struct {
98 struct rb_node rb_node;
99 u64 bytenr;
100 }; /* Use rb_simple_node for search/insert */
5d4f98a2
YZ
101 struct btrfs_key key;
102 unsigned int level:8;
103 unsigned int key_ready:1;
104};
105
0257bb82
YZ
106#define MAX_EXTENTS 128
107
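/*
 * A batch of up to MAX_EXTENTS data extents that are relocated together;
 * boundary[] records the start offset of each extent inside the
 * [start, end] range so the original extent boundaries are preserved when
 * the data is written out again.
 */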
108struct file_extent_cluster {
109 u64 start;
110 u64 end;
111 u64 boundary[MAX_EXTENTS];
112 unsigned int nr;
113};
114
5d4f98a2
YZ
115struct reloc_control {
116 /* block group to relocate */
32da5386 117 struct btrfs_block_group *block_group;
5d4f98a2
YZ
118 /* extent tree */
119 struct btrfs_root *extent_root;
120 /* inode for moving data */
121 struct inode *data_inode;
3fd0a558
YZ
122
123 struct btrfs_block_rsv *block_rsv;
124
a26195a5 125 struct btrfs_backref_cache backref_cache;
3fd0a558
YZ
126
127 struct file_extent_cluster cluster;
5d4f98a2
YZ
128 /* tree blocks have been processed */
129 struct extent_io_tree processed_blocks;
130 /* map start of tree root to corresponding reloc tree */
131 struct mapping_tree reloc_root_tree;
132 /* list of reloc trees */
133 struct list_head reloc_roots;
d2311e69
QW
134 /* list of subvolume trees that get relocated */
135 struct list_head dirty_subvol_roots;
3fd0a558
YZ
136 /* size of metadata reservation for merging reloc trees */
137 u64 merging_rsv_size;
138 /* size of relocated tree nodes */
139 u64 nodes_relocated;
0647bf56
WS
 140 /* reserved size for block group relocation */
141 u64 reserved_bytes;
3fd0a558 142
5d4f98a2
YZ
143 u64 search_start;
144 u64 extents_found;
3fd0a558 145
3fd0a558
YZ
146 unsigned int stage:8;
147 unsigned int create_reloc_tree:1;
148 unsigned int merge_reloc_tree:1;
5d4f98a2 149 unsigned int found_file_extent:1;
5d4f98a2
YZ
150};
151
152/* stages of data relocation */
153#define MOVE_DATA_EXTENTS 0
154#define UPDATE_DATA_PTRS 1
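/*
 * In the MOVE_DATA_EXTENTS stage the data of every extent in the target
 * block group is copied into rc->data_inode; in the UPDATE_DATA_PTRS stage
 * the file extent items of the referencing trees are rewritten to point at
 * the new copies (see replace_file_extents()).
 */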
155
a26195a5
QW
156static void remove_backref_node(struct btrfs_backref_cache *cache,
157 struct btrfs_backref_node *node);
9569cc20
QW
158
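/*
 * Mark a backref node as processed.  Tree leaves and any block that sits
 * inside the target block group are also recorded in rc->processed_blocks
 * so they are not visited again during this relocation pass.
 */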
159static void mark_block_processed(struct reloc_control *rc,
a26195a5 160 struct btrfs_backref_node *node)
9569cc20
QW
161{
162 u32 blocksize;
163
164 if (node->level == 0 ||
165 in_range(node->bytenr, rc->block_group->start,
166 rc->block_group->length)) {
167 blocksize = rc->extent_root->fs_info->nodesize;
168 set_extent_bits(&rc->processed_blocks, node->bytenr,
169 node->bytenr + blocksize - 1, EXTENT_DIRTY);
170 }
171 node->processed = 1;
172}
173
5d4f98a2
YZ
174
175static void mapping_tree_init(struct mapping_tree *tree)
176{
6bef4d31 177 tree->rb_root = RB_ROOT;
5d4f98a2
YZ
178 spin_lock_init(&tree->lock);
179}
180
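/*
 * Drop every node still tracked by the backref cache (detached nodes and
 * leaves) and assert that all of its internal lists are empty afterwards.
 */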
a26195a5 181static void backref_cache_cleanup(struct btrfs_backref_cache *cache)
3fd0a558 182{
a26195a5 183 struct btrfs_backref_node *node;
3fd0a558
YZ
184 int i;
185
186 while (!list_empty(&cache->detached)) {
187 node = list_entry(cache->detached.next,
a26195a5 188 struct btrfs_backref_node, list);
3fd0a558
YZ
189 remove_backref_node(cache, node);
190 }
191
192 while (!list_empty(&cache->leaves)) {
193 node = list_entry(cache->leaves.next,
a26195a5 194 struct btrfs_backref_node, lower);
3fd0a558
YZ
195 remove_backref_node(cache, node);
196 }
197
198 cache->last_trans = 0;
199
200 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
f4907095 201 ASSERT(list_empty(&cache->pending[i]));
84780289
QW
202 ASSERT(list_empty(&cache->pending_edge));
203 ASSERT(list_empty(&cache->useless_node));
f4907095
LB
204 ASSERT(list_empty(&cache->changed));
205 ASSERT(list_empty(&cache->detached));
206 ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
207 ASSERT(!cache->nr_nodes);
208 ASSERT(!cache->nr_edges);
3fd0a558
YZ
209}
210
a26195a5
QW
211static void free_backref_node(struct btrfs_backref_cache *cache,
212 struct btrfs_backref_node *node)
3fd0a558
YZ
213{
214 if (node) {
215 cache->nr_nodes--;
00246528 216 btrfs_put_root(node->root);
3fd0a558
YZ
217 kfree(node);
218 }
219}
220
a26195a5
QW
221static struct btrfs_backref_edge *alloc_backref_edge(
222 struct btrfs_backref_cache *cache)
3fd0a558 223{
a26195a5 224 struct btrfs_backref_edge *edge;
3fd0a558
YZ
225
226 edge = kzalloc(sizeof(*edge), GFP_NOFS);
227 if (edge)
228 cache->nr_edges++;
229 return edge;
5d4f98a2
YZ
230}
231
2a979612
QW
232#define LINK_LOWER (1 << 0)
233#define LINK_UPPER (1 << 1)
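/*
 * Connect @edge between @lower and @upper.  @link_which selects which of
 * the two per-node edge lists the edge is added to right away; the other
 * side is hooked up later (see finish_upper_links()).
 */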
a26195a5
QW
234static void link_backref_edge(struct btrfs_backref_edge *edge,
235 struct btrfs_backref_node *lower,
236 struct btrfs_backref_node *upper,
2a979612
QW
237 int link_which)
238{
239 ASSERT(upper && lower && upper->level == lower->level + 1);
240 edge->node[LOWER] = lower;
241 edge->node[UPPER] = upper;
242 if (link_which & LINK_LOWER)
243 list_add_tail(&edge->list[LOWER], &lower->upper);
244 if (link_which & LINK_UPPER)
245 list_add_tail(&edge->list[UPPER], &upper->lower);
246}
247
a26195a5
QW
248static void free_backref_edge(struct btrfs_backref_cache *cache,
249 struct btrfs_backref_edge *edge)
5d4f98a2 250{
3fd0a558
YZ
251 if (edge) {
252 cache->nr_edges--;
253 kfree(edge);
254 }
5d4f98a2
YZ
255}
256
48a3b636 257static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
43c04fb1
JM
258{
259
260 struct btrfs_fs_info *fs_info = NULL;
a26195a5
QW
261 struct btrfs_backref_node *bnode = rb_entry(rb_node,
262 struct btrfs_backref_node, rb_node);
43c04fb1
JM
263 if (bnode->root)
264 fs_info = bnode->root->fs_info;
5d163e0e
JM
265 btrfs_panic(fs_info, errno,
266 "Inconsistency in backref cache found at offset %llu",
267 bytenr);
43c04fb1
JM
268}
269
5d4f98a2
YZ
270/*
 271 * walk up backref nodes until reaching the node that presents the tree root
272 */
a26195a5
QW
273static struct btrfs_backref_node *walk_up_backref(
274 struct btrfs_backref_node *node,
275 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 276{
a26195a5 277 struct btrfs_backref_edge *edge;
5d4f98a2
YZ
278 int idx = *index;
279
280 while (!list_empty(&node->upper)) {
281 edge = list_entry(node->upper.next,
a26195a5 282 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
283 edges[idx++] = edge;
284 node = edge->node[UPPER];
285 }
3fd0a558 286 BUG_ON(node->detached);
5d4f98a2
YZ
287 *index = idx;
288 return node;
289}
290
291/*
292 * walk down backref nodes to find start of next reference path
293 */
a26195a5
QW
294static struct btrfs_backref_node *walk_down_backref(
295 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 296{
a26195a5
QW
297 struct btrfs_backref_edge *edge;
298 struct btrfs_backref_node *lower;
5d4f98a2
YZ
299 int idx = *index;
300
301 while (idx > 0) {
302 edge = edges[idx - 1];
303 lower = edge->node[LOWER];
304 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
305 idx--;
306 continue;
307 }
308 edge = list_entry(edge->list[LOWER].next,
a26195a5 309 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
310 edges[idx - 1] = edge;
311 *index = idx;
312 return edge->node[UPPER];
313 }
314 *index = 0;
315 return NULL;
316}
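/*
 * The two helpers above are meant to be used as a pair to visit every
 * reference path of a node, roughly (illustrative sketch, 'edges' sized
 * BTRFS_MAX_LEVEL - 1):
 *
 *	index = 0;
 *	next = node;
 *	while (1) {
 *		upper = walk_up_backref(next, edges, &index);
 *		... handle the tree root node 'upper' ...
 *		next = walk_down_backref(edges, &index);
 *		if (!next || next->level <= node->level)
 *			break;
 *	}
 */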
317
a26195a5 318static void unlock_node_buffer(struct btrfs_backref_node *node)
3fd0a558
YZ
319{
320 if (node->locked) {
321 btrfs_tree_unlock(node->eb);
322 node->locked = 0;
323 }
324}
325
a26195a5 326static void drop_node_buffer(struct btrfs_backref_node *node)
5d4f98a2
YZ
327{
328 if (node->eb) {
3fd0a558 329 unlock_node_buffer(node);
5d4f98a2
YZ
330 free_extent_buffer(node->eb);
331 node->eb = NULL;
332 }
333}
334
a26195a5
QW
335static void drop_backref_node(struct btrfs_backref_cache *tree,
336 struct btrfs_backref_node *node)
5d4f98a2 337{
5d4f98a2
YZ
338 BUG_ON(!list_empty(&node->upper));
339
340 drop_node_buffer(node);
3fd0a558 341 list_del(&node->list);
5d4f98a2 342 list_del(&node->lower);
3fd0a558
YZ
343 if (!RB_EMPTY_NODE(&node->rb_node))
344 rb_erase(&node->rb_node, &tree->rb_root);
345 free_backref_node(tree, node);
5d4f98a2
YZ
346}
347
348/*
349 * remove a backref node from the backref cache
350 */
a26195a5
QW
351static void remove_backref_node(struct btrfs_backref_cache *cache,
352 struct btrfs_backref_node *node)
5d4f98a2 353{
a26195a5
QW
354 struct btrfs_backref_node *upper;
355 struct btrfs_backref_edge *edge;
5d4f98a2
YZ
356
357 if (!node)
358 return;
359
3fd0a558 360 BUG_ON(!node->lowest && !node->detached);
5d4f98a2 361 while (!list_empty(&node->upper)) {
a26195a5 362 edge = list_entry(node->upper.next, struct btrfs_backref_edge,
5d4f98a2
YZ
363 list[LOWER]);
364 upper = edge->node[UPPER];
365 list_del(&edge->list[LOWER]);
366 list_del(&edge->list[UPPER]);
3fd0a558
YZ
367 free_backref_edge(cache, edge);
368
369 if (RB_EMPTY_NODE(&upper->rb_node)) {
370 BUG_ON(!list_empty(&node->upper));
371 drop_backref_node(cache, node);
372 node = upper;
373 node->lowest = 1;
374 continue;
375 }
5d4f98a2 376 /*
3fd0a558 377 * add the node to leaf node list if no other
5d4f98a2
YZ
 378 * child block is cached.
379 */
380 if (list_empty(&upper->lower)) {
3fd0a558 381 list_add_tail(&upper->lower, &cache->leaves);
5d4f98a2
YZ
382 upper->lowest = 1;
383 }
384 }
3fd0a558 385
5d4f98a2
YZ
386 drop_backref_node(cache, node);
387}
388
a26195a5
QW
389static void update_backref_node(struct btrfs_backref_cache *cache,
390 struct btrfs_backref_node *node, u64 bytenr)
3fd0a558
YZ
391{
392 struct rb_node *rb_node;
393 rb_erase(&node->rb_node, &cache->rb_root);
394 node->bytenr = bytenr;
e9a28dc5 395 rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
43c04fb1
JM
396 if (rb_node)
397 backref_tree_panic(rb_node, -EEXIST, bytenr);
3fd0a558
YZ
398}
399
400/*
401 * update backref cache after a transaction commit
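 *
 * Returns 0 if the cache is already in sync with @trans, 1 if stale nodes
 * had to be re-keyed to their new bytenr (or dropped) for the new
 * transaction.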
402 */
403static int update_backref_cache(struct btrfs_trans_handle *trans,
a26195a5 404 struct btrfs_backref_cache *cache)
3fd0a558 405{
a26195a5 406 struct btrfs_backref_node *node;
3fd0a558
YZ
407 int level = 0;
408
409 if (cache->last_trans == 0) {
410 cache->last_trans = trans->transid;
411 return 0;
412 }
413
414 if (cache->last_trans == trans->transid)
415 return 0;
416
417 /*
 418 * detached nodes are used to avoid unnecessary backref
 419 * lookups. A transaction commit changes the extent tree,
 420 * so the detached nodes are no longer useful.
421 */
422 while (!list_empty(&cache->detached)) {
423 node = list_entry(cache->detached.next,
a26195a5 424 struct btrfs_backref_node, list);
3fd0a558
YZ
425 remove_backref_node(cache, node);
426 }
427
428 while (!list_empty(&cache->changed)) {
429 node = list_entry(cache->changed.next,
a26195a5 430 struct btrfs_backref_node, list);
3fd0a558
YZ
431 list_del_init(&node->list);
432 BUG_ON(node->pending);
433 update_backref_node(cache, node, node->new_bytenr);
434 }
435
436 /*
437 * some nodes can be left in the pending list if there were
 438 * errors while processing the pending nodes.
439 */
440 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
441 list_for_each_entry(node, &cache->pending[level], list) {
442 BUG_ON(!node->pending);
443 if (node->bytenr == node->new_bytenr)
444 continue;
445 update_backref_node(cache, node, node->new_bytenr);
446 }
447 }
448
449 cache->last_trans = 0;
450 return 1;
451}
452
6282675e
QW
453static bool reloc_root_is_dead(struct btrfs_root *root)
454{
455 /*
456 * Pair with set_bit/clear_bit in clean_dirty_subvols and
457 * btrfs_update_reloc_root. We need to see the updated bit before
458 * trying to access reloc_root
459 */
460 smp_rmb();
461 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
462 return true;
463 return false;
464}
465
466/*
 467 * Check if this subvolume tree has a valid reloc tree.
468 *
469 * Reloc tree after swap is considered dead, thus not considered as valid.
470 * This is enough for most callers, as they don't distinguish dead reloc root
471 * from no reloc root. But should_ignore_root() below is a special case.
472 */
473static bool have_reloc_root(struct btrfs_root *root)
474{
475 if (reloc_root_is_dead(root))
476 return false;
477 if (!root->reloc_root)
478 return false;
479 return true;
480}
f2a97a9d 481
3fd0a558
YZ
482static int should_ignore_root(struct btrfs_root *root)
483{
484 struct btrfs_root *reloc_root;
485
27cdeb70 486 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
3fd0a558
YZ
487 return 0;
488
6282675e
QW
489 /* This root has been merged with its reloc tree, we can ignore it */
490 if (reloc_root_is_dead(root))
491 return 1;
492
3fd0a558
YZ
493 reloc_root = root->reloc_root;
494 if (!reloc_root)
495 return 0;
496
4d4225fc
JB
497 if (btrfs_header_generation(reloc_root->commit_root) ==
498 root->fs_info->running_transaction->transid)
3fd0a558
YZ
499 return 0;
500 /*
 501 * if there is a reloc tree and it was created in a previous
 502 * transaction, backref lookup can find the reloc tree, so the
 503 * backref node for the fs tree root is useless for
 504 * relocation.
505 */
506 return 1;
507}
5d4f98a2
YZ
508/*
509 * find reloc tree by address of tree root
510 */
2433bea5 511struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
5d4f98a2 512{
2433bea5 513 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
514 struct rb_node *rb_node;
515 struct mapping_node *node;
516 struct btrfs_root *root = NULL;
517
2433bea5 518 ASSERT(rc);
5d4f98a2 519 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5 520 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
5d4f98a2
YZ
521 if (rb_node) {
522 node = rb_entry(rb_node, struct mapping_node, rb_node);
523 root = (struct btrfs_root *)node->data;
524 }
525 spin_unlock(&rc->reloc_root_tree.lock);
00246528 526 return btrfs_grab_root(root);
5d4f98a2
YZ
527}
528
5d4f98a2
YZ
529static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
530 u64 root_objectid)
531{
532 struct btrfs_key key;
533
534 key.objectid = root_objectid;
535 key.type = BTRFS_ROOT_ITEM_KEY;
e1922118 536 key.offset = (u64)-1;
5d4f98a2 537
bc44d7c4 538 return btrfs_get_fs_root(fs_info, &key, false);
5d4f98a2
YZ
539}
540
4007ea87
QW
541/*
542 * Handle direct tree backref
543 *
544 * Direct tree backref means, the backref item shows its parent bytenr
545 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
546 *
547 * @ref_key: The converted backref key.
548 * For keyed backref, it's the item key.
549 * For inlined backref, objectid is the bytenr,
550 * type is btrfs_inline_ref_type, offset is
551 * btrfs_inline_ref_offset.
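 *
 * Example (illustrative): a shared tree block at bytenr X whose parent node
 * sits at bytenr P carries the backref key (X, BTRFS_SHARED_BLOCK_REF_KEY, P),
 * so ref_key->offset is the parent bytenr looked up in the cache rb tree below.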
552 */
a26195a5 553static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
4007ea87 554 struct btrfs_key *ref_key,
a26195a5 555 struct btrfs_backref_node *cur)
4007ea87 556{
a26195a5
QW
557 struct btrfs_backref_edge *edge;
558 struct btrfs_backref_node *upper;
4007ea87
QW
559 struct rb_node *rb_node;
560
561 ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
562
563 /* Only reloc root uses backref pointing to itself */
564 if (ref_key->objectid == ref_key->offset) {
565 struct btrfs_root *root;
566
567 cur->is_reloc_root = 1;
568 /* Only reloc backref cache cares about a specific root */
569 if (cache->is_reloc) {
570 root = find_reloc_root(cache->fs_info, cur->bytenr);
571 if (WARN_ON(!root))
572 return -ENOENT;
573 cur->root = root;
574 } else {
575 /*
576 * For generic purpose backref cache, reloc root node
577 * is useless.
578 */
579 list_add(&cur->list, &cache->useless_node);
580 }
581 return 0;
582 }
583
584 edge = alloc_backref_edge(cache);
585 if (!edge)
586 return -ENOMEM;
587
e9a28dc5 588 rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
4007ea87
QW
589 if (!rb_node) {
590 /* Parent node not yet cached */
b1818dab 591 upper = btrfs_backref_alloc_node(cache, ref_key->offset,
0304f2d8 592 cur->level + 1);
4007ea87
QW
593 if (!upper) {
594 free_backref_edge(cache, edge);
595 return -ENOMEM;
596 }
4007ea87
QW
597
598 /*
 599 * Backrefs for the upper level block aren't cached, add the
600 * block to pending list
601 */
602 list_add_tail(&edge->list[UPPER], &cache->pending_edge);
603 } else {
604 /* Parent node already cached */
a26195a5 605 upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
4007ea87
QW
606 ASSERT(upper->checked);
607 INIT_LIST_HEAD(&edge->list[UPPER]);
608 }
2a979612 609 link_backref_edge(edge, cur, upper, LINK_LOWER);
4007ea87
QW
610 return 0;
611}
612
4d81ea8b
QW
613/*
614 * Handle indirect tree backref
615 *
616 * Indirect tree backref means, we only know which tree the node belongs to.
617 * We still need to do a tree search to find out the parents. This is for
618 * TREE_BLOCK_REF backref (keyed or inlined).
619 *
620 * @ref_key: The same as @ref_key in handle_direct_tree_backref()
621 * @tree_key: The first key of this tree block.
 622 * @path: A clean (released) path, to avoid allocating a path every time
 623 * the function gets called.
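 *
 * Example (illustrative): a tree block at bytenr X owned by subvolume 257
 * carries the backref key (X, BTRFS_TREE_BLOCK_REF_KEY, 257); only the owning
 * root is known, so the parent bytenr is found by searching that root with
 * @tree_key down to level cur->level + 1.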
624 */
a26195a5 625static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
4d81ea8b
QW
626 struct btrfs_path *path,
627 struct btrfs_key *ref_key,
628 struct btrfs_key *tree_key,
a26195a5 629 struct btrfs_backref_node *cur)
4d81ea8b
QW
630{
631 struct btrfs_fs_info *fs_info = cache->fs_info;
a26195a5
QW
632 struct btrfs_backref_node *upper;
633 struct btrfs_backref_node *lower;
634 struct btrfs_backref_edge *edge;
4d81ea8b
QW
635 struct extent_buffer *eb;
636 struct btrfs_root *root;
637 struct rb_node *rb_node;
638 int level;
639 bool need_check = true;
640 int ret;
641
642 root = read_fs_root(fs_info, ref_key->offset);
643 if (IS_ERR(root))
644 return PTR_ERR(root);
645 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
646 cur->cowonly = 1;
647
648 if (btrfs_root_level(&root->root_item) == cur->level) {
649 /* Tree root */
650 ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
651 if (should_ignore_root(root)) {
652 btrfs_put_root(root);
653 list_add(&cur->list, &cache->useless_node);
654 } else {
655 cur->root = root;
656 }
657 return 0;
658 }
659
660 level = cur->level + 1;
661
662 /* Search the tree to find parent blocks referring to the block */
663 path->search_commit_root = 1;
664 path->skip_locking = 1;
665 path->lowest_level = level;
666 ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
667 path->lowest_level = 0;
668 if (ret < 0) {
669 btrfs_put_root(root);
670 return ret;
671 }
672 if (ret > 0 && path->slots[level] > 0)
673 path->slots[level]--;
674
675 eb = path->nodes[level];
676 if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
677 btrfs_err(fs_info,
678"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
679 cur->bytenr, level - 1, root->root_key.objectid,
680 tree_key->objectid, tree_key->type, tree_key->offset);
681 btrfs_put_root(root);
682 ret = -ENOENT;
683 goto out;
684 }
685 lower = cur;
686
687 /* Add all nodes and edges in the path */
688 for (; level < BTRFS_MAX_LEVEL; level++) {
689 if (!path->nodes[level]) {
690 ASSERT(btrfs_root_bytenr(&root->root_item) ==
691 lower->bytenr);
692 if (should_ignore_root(root)) {
693 btrfs_put_root(root);
694 list_add(&lower->list, &cache->useless_node);
695 } else {
696 lower->root = root;
697 }
698 break;
699 }
700
701 edge = alloc_backref_edge(cache);
702 if (!edge) {
703 btrfs_put_root(root);
704 ret = -ENOMEM;
705 goto out;
706 }
707
708 eb = path->nodes[level];
e9a28dc5 709 rb_node = rb_simple_search(&cache->rb_root, eb->start);
4d81ea8b 710 if (!rb_node) {
b1818dab
QW
711 upper = btrfs_backref_alloc_node(cache, eb->start,
712 lower->level + 1);
4d81ea8b
QW
713 if (!upper) {
714 btrfs_put_root(root);
715 free_backref_edge(cache, edge);
716 ret = -ENOMEM;
717 goto out;
718 }
4d81ea8b 719 upper->owner = btrfs_header_owner(eb);
4d81ea8b
QW
720 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
721 upper->cowonly = 1;
722
723 /*
724 * If we know the block isn't shared we can avoid
725 * checking its backrefs.
726 */
727 if (btrfs_block_can_be_shared(root, eb))
728 upper->checked = 0;
729 else
730 upper->checked = 1;
731
732 /*
733 * Add the block to pending list if we need to check its
734 * backrefs, we only do this once while walking up a
735 * tree as we will catch anything else later on.
736 */
737 if (!upper->checked && need_check) {
738 need_check = false;
739 list_add_tail(&edge->list[UPPER],
740 &cache->pending_edge);
741 } else {
742 if (upper->checked)
743 need_check = true;
744 INIT_LIST_HEAD(&edge->list[UPPER]);
745 }
746 } else {
a26195a5
QW
747 upper = rb_entry(rb_node, struct btrfs_backref_node,
748 rb_node);
4d81ea8b
QW
749 ASSERT(upper->checked);
750 INIT_LIST_HEAD(&edge->list[UPPER]);
751 if (!upper->owner)
752 upper->owner = btrfs_header_owner(eb);
753 }
2a979612 754 link_backref_edge(edge, lower, upper, LINK_LOWER);
4d81ea8b
QW
755
756 if (rb_node) {
757 btrfs_put_root(root);
758 break;
759 }
760 lower = upper;
761 upper = NULL;
762 }
763out:
764 btrfs_release_path(path);
765 return ret;
766}
767
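/*
 * Add the tree block at cur->bytenr to the backref cache: iterate all of its
 * backref items and hand direct (SHARED_BLOCK_REF) and indirect
 * (TREE_BLOCK_REF) references to the two helpers above.  Newly found parent
 * blocks are queued on cache->pending_edge for the caller to process.
 */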
a26195a5 768static int handle_one_tree_block(struct btrfs_backref_cache *cache,
e7d571c7
QW
769 struct btrfs_path *path,
770 struct btrfs_backref_iter *iter,
771 struct btrfs_key *node_key,
a26195a5 772 struct btrfs_backref_node *cur)
5d4f98a2 773{
e7d571c7 774 struct btrfs_fs_info *fs_info = cache->fs_info;
a26195a5
QW
775 struct btrfs_backref_edge *edge;
776 struct btrfs_backref_node *exist;
5d4f98a2 777 int ret;
5d4f98a2 778
71f572a9 779 ret = btrfs_backref_iter_start(iter, cur->bytenr);
e7d571c7
QW
780 if (ret < 0)
781 return ret;
71f572a9
QW
782 /*
783 * We skip the first btrfs_tree_block_info, as we don't use the key
784 * stored in it, but fetch it from the tree block
785 */
786 if (btrfs_backref_has_tree_block_info(iter)) {
787 ret = btrfs_backref_iter_next(iter);
e7d571c7 788 if (ret < 0)
71f572a9 789 goto out;
71f572a9
QW
790 /* No extra backref? This means the tree block is corrupted */
791 if (ret > 0) {
e7d571c7 792 ret = -EUCLEAN;
71f572a9
QW
793 goto out;
794 }
795 }
5d4f98a2
YZ
796 WARN_ON(cur->checked);
797 if (!list_empty(&cur->upper)) {
798 /*
70f23fd6 799 * the backref was added previously when processing
5d4f98a2
YZ
800 * backref of type BTRFS_TREE_BLOCK_REF_KEY
801 */
75bfb9af 802 ASSERT(list_is_singular(&cur->upper));
a26195a5 803 edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
5d4f98a2 804 list[LOWER]);
75bfb9af 805 ASSERT(list_empty(&edge->list[UPPER]));
5d4f98a2
YZ
806 exist = edge->node[UPPER];
807 /*
 808 * add the upper level block to the pending list if we need to
 809 * check its backrefs
810 */
811 if (!exist->checked)
84780289 812 list_add_tail(&edge->list[UPPER], &cache->pending_edge);
5d4f98a2
YZ
813 } else {
814 exist = NULL;
815 }
816
71f572a9
QW
817 for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
818 struct extent_buffer *eb;
819 struct btrfs_key key;
820 int type;
5d4f98a2 821
71f572a9
QW
822 cond_resched();
823 eb = btrfs_backref_get_eb(iter);
5d4f98a2 824
71f572a9
QW
825 key.objectid = iter->bytenr;
826 if (btrfs_backref_iter_is_inline_ref(iter)) {
827 struct btrfs_extent_inline_ref *iref;
5d4f98a2 828
5d4f98a2 829 /* update key for inline back ref */
71f572a9
QW
830 iref = (struct btrfs_extent_inline_ref *)
831 ((unsigned long)iter->cur_ptr);
3de28d57
LB
832 type = btrfs_get_extent_inline_ref_type(eb, iref,
833 BTRFS_REF_TYPE_BLOCK);
834 if (type == BTRFS_REF_TYPE_INVALID) {
e7d571c7 835 ret = -EUCLEAN;
3de28d57
LB
836 goto out;
837 }
838 key.type = type;
5d4f98a2 839 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
71f572a9
QW
840 } else {
841 key.type = iter->cur_key.type;
842 key.offset = iter->cur_key.offset;
5d4f98a2
YZ
843 }
844
fa6ac715
QW
845 /*
846 * Parent node found and matches current inline ref, no need to
847 * rebuild this node for this inline ref.
848 */
5d4f98a2
YZ
849 if (exist &&
850 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
851 exist->owner == key.offset) ||
852 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
853 exist->bytenr == key.offset))) {
854 exist = NULL;
71f572a9 855 continue;
5d4f98a2
YZ
856 }
857
fa6ac715 858 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
5d4f98a2 859 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
4007ea87 860 ret = handle_direct_tree_backref(cache, &key, cur);
e7d571c7 861 if (ret < 0)
5d4f98a2 862 goto out;
71f572a9 863 continue;
6d8ff4e4 864 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
e7d571c7
QW
865 ret = -EINVAL;
866 btrfs_print_v0_err(fs_info);
867 btrfs_handle_fs_error(fs_info, ret, NULL);
ba3c2b19 868 goto out;
5d4f98a2 869 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
71f572a9 870 continue;
5d4f98a2
YZ
871 }
872
fa6ac715
QW
873 /*
874 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
875 * means the root objectid. We need to search the tree to get
876 * its parent bytenr.
877 */
4d81ea8b
QW
878 ret = handle_indirect_tree_backref(cache, path, &key, node_key,
879 cur);
e7d571c7 880 if (ret < 0)
5d4f98a2 881 goto out;
5d4f98a2 882 }
71f572a9 883 ret = 0;
5d4f98a2
YZ
884 cur->checked = 1;
885 WARN_ON(exist);
e7d571c7
QW
886out:
887 btrfs_backref_iter_release(iter);
888 return ret;
889}
890
1f872924
QW
891/*
 892 * In handle_one_tree_block(), we have only linked the lower node to the edge,
893 * but the upper node hasn't been linked to the edge.
a26195a5
QW
894 * This means we can only iterate through btrfs_backref_node::upper to reach
895 * parent edges, but not through btrfs_backref_node::lower to reach children
896 * edges.
1f872924 897 *
a26195a5
QW
 898 * This function will link btrfs_backref_node::lower to the related edges,
 899 * so that the backref cache can be iterated bi-directionally.
1f872924
QW
900 *
901 * Also, this will add the nodes to backref cache for the next run.
902 */
a26195a5
QW
903static int finish_upper_links(struct btrfs_backref_cache *cache,
904 struct btrfs_backref_node *start)
1f872924
QW
905{
906 struct list_head *useless_node = &cache->useless_node;
a26195a5 907 struct btrfs_backref_edge *edge;
1f872924
QW
908 struct rb_node *rb_node;
909 LIST_HEAD(pending_edge);
910
911 ASSERT(start->checked);
912
913 /* Insert this node to cache if it's not COW-only */
914 if (!start->cowonly) {
e9a28dc5
QW
915 rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
916 &start->rb_node);
1f872924
QW
917 if (rb_node)
918 backref_tree_panic(rb_node, -EEXIST, start->bytenr);
919 list_add_tail(&start->lower, &cache->leaves);
920 }
921
922 /*
923 * Use breadth first search to iterate all related edges.
924 *
925 * The starting points are all the edges of this node
926 */
927 list_for_each_entry(edge, &start->upper, list[LOWER])
928 list_add_tail(&edge->list[UPPER], &pending_edge);
929
930 while (!list_empty(&pending_edge)) {
a26195a5
QW
931 struct btrfs_backref_node *upper;
932 struct btrfs_backref_node *lower;
1f872924
QW
933 struct rb_node *rb_node;
934
a26195a5
QW
935 edge = list_first_entry(&pending_edge,
936 struct btrfs_backref_edge, list[UPPER]);
1f872924
QW
937 list_del_init(&edge->list[UPPER]);
938 upper = edge->node[UPPER];
939 lower = edge->node[LOWER];
940
941 /* Parent is detached, no need to keep any edges */
942 if (upper->detached) {
943 list_del(&edge->list[LOWER]);
944 free_backref_edge(cache, edge);
945
946 /* Lower node is orphan, queue for cleanup */
947 if (list_empty(&lower->upper))
948 list_add(&lower->list, useless_node);
949 continue;
950 }
951
952 /*
953 * All new nodes added in current build_backref_tree() haven't
954 * been linked to the cache rb tree.
955 * So if we have upper->rb_node populated, this means a cache
956 * hit. We only need to link the edge, as @upper and all its
957 * parent have already been linked.
958 */
959 if (!RB_EMPTY_NODE(&upper->rb_node)) {
960 if (upper->lowest) {
961 list_del_init(&upper->lower);
962 upper->lowest = 0;
963 }
964
965 list_add_tail(&edge->list[UPPER], &upper->lower);
966 continue;
967 }
968
969 /* Sanity check, we shouldn't have any unchecked nodes */
970 if (!upper->checked) {
971 ASSERT(0);
972 return -EUCLEAN;
973 }
974
975 /* Sanity check, COW-only node has non-COW-only parent */
976 if (start->cowonly != upper->cowonly) {
977 ASSERT(0);
978 return -EUCLEAN;
979 }
980
981 /* Only cache non-COW-only (subvolume trees) tree blocks */
982 if (!upper->cowonly) {
e9a28dc5
QW
983 rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
984 &upper->rb_node);
1f872924
QW
985 if (rb_node) {
986 backref_tree_panic(rb_node, -EEXIST,
987 upper->bytenr);
988 return -EUCLEAN;
989 }
990 }
991
992 list_add_tail(&edge->list[UPPER], &upper->lower);
993
994 /*
995 * Also queue all the parent edges of this uncached node to
996 * finish the upper linkage
997 */
998 list_for_each_entry(edge, &upper->upper, list[LOWER])
999 list_add_tail(&edge->list[UPPER], &pending_edge);
1000 }
1001 return 0;
1002}
1003
29db137b
QW
1004/*
1005 * For useless nodes, do two major clean ups:
1006 *
1007 * - Cleanup the children edges and nodes
1008 * If child node is also orphan (no parent) during cleanup, then the child
1009 * node will also be cleaned up.
1010 *
1011 * - Freeing up leaves (level 0), keeps nodes detached
1012 * For nodes, the node is still cached as "detached"
1013 *
1014 * Return false if @node is not in the @useless_nodes list.
1015 * Return true if @node is in the @useless_nodes list.
1016 */
1017static bool handle_useless_nodes(struct reloc_control *rc,
a26195a5 1018 struct btrfs_backref_node *node)
29db137b 1019{
a26195a5 1020 struct btrfs_backref_cache *cache = &rc->backref_cache;
29db137b
QW
1021 struct list_head *useless_node = &cache->useless_node;
1022 bool ret = false;
1023
1024 while (!list_empty(useless_node)) {
a26195a5 1025 struct btrfs_backref_node *cur;
29db137b 1026
a26195a5 1027 cur = list_first_entry(useless_node, struct btrfs_backref_node,
29db137b
QW
1028 list);
1029 list_del_init(&cur->list);
1030
1031 /* Only tree root nodes can be added to @useless_nodes */
1032 ASSERT(list_empty(&cur->upper));
1033
1034 if (cur == node)
1035 ret = true;
1036
1037 /* The node is the lowest node */
1038 if (cur->lowest) {
1039 list_del_init(&cur->lower);
1040 cur->lowest = 0;
1041 }
1042
1043 /* Cleanup the lower edges */
1044 while (!list_empty(&cur->lower)) {
a26195a5
QW
1045 struct btrfs_backref_edge *edge;
1046 struct btrfs_backref_node *lower;
29db137b
QW
1047
1048 edge = list_entry(cur->lower.next,
a26195a5 1049 struct btrfs_backref_edge, list[UPPER]);
29db137b
QW
1050 list_del(&edge->list[UPPER]);
1051 list_del(&edge->list[LOWER]);
1052 lower = edge->node[LOWER];
1053 free_backref_edge(cache, edge);
1054
1055 /* Child node is also orphan, queue for cleanup */
1056 if (list_empty(&lower->upper))
1057 list_add(&lower->list, useless_node);
1058 }
1059 /* Mark this block processed for relocation */
1060 mark_block_processed(rc, cur);
1061
1062 /*
1063 * Backref nodes for tree leaves are deleted from the cache.
1064 * Backref nodes for upper level tree blocks are left in the
1065 * cache to avoid unnecessary backref lookup.
1066 */
1067 if (cur->level > 0) {
1068 list_add(&cur->list, &cache->detached);
1069 cur->detached = 1;
1070 } else {
1071 rb_erase(&cur->rb_node, &cache->rb_root);
1072 free_backref_node(cache, cur);
1073 }
1074 }
1075 return ret;
1076}
1077
e7d571c7
QW
1078/*
1079 * Build backref tree for a given tree block. Root of the backref tree
 1080 * corresponds to the tree block, leaves of the backref tree correspond to roots of
1081 * b-trees that reference the tree block.
1082 *
 1083 * The basic idea of this function is to check backrefs of a given block to find
1084 * upper level blocks that reference the block, and then check backrefs of
1085 * these upper level blocks recursively. The recursion stops when tree root is
1086 * reached or backrefs for the block is cached.
1087 *
1088 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
1089 * all upper level blocks that directly/indirectly reference the block are also
1090 * cached.
1091 */
a26195a5 1092static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
e7d571c7
QW
1093 struct reloc_control *rc, struct btrfs_key *node_key,
1094 int level, u64 bytenr)
1095{
1096 struct btrfs_backref_iter *iter;
a26195a5 1097 struct btrfs_backref_cache *cache = &rc->backref_cache;
e7d571c7
QW
1098 /* For searching parent of TREE_BLOCK_REF */
1099 struct btrfs_path *path;
a26195a5
QW
1100 struct btrfs_backref_node *cur;
1101 struct btrfs_backref_node *upper;
1102 struct btrfs_backref_node *lower;
1103 struct btrfs_backref_node *node = NULL;
1104 struct btrfs_backref_edge *edge;
e7d571c7
QW
1105 int ret;
1106 int err = 0;
5d4f98a2 1107
e7d571c7
QW
1108 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
1109 if (!iter)
1110 return ERR_PTR(-ENOMEM);
1111 path = btrfs_alloc_path();
1112 if (!path) {
1113 err = -ENOMEM;
1114 goto out;
1115 }
1116
b1818dab 1117 node = btrfs_backref_alloc_node(cache, bytenr, level);
e7d571c7
QW
1118 if (!node) {
1119 err = -ENOMEM;
1120 goto out;
5d4f98a2
YZ
1121 }
1122
e7d571c7
QW
1123 node->lowest = 1;
1124 cur = node;
1125
1126 /* Breadth-first search to build backref cache */
1127 do {
1128 ret = handle_one_tree_block(cache, path, iter, node_key, cur);
1129 if (ret < 0) {
1130 err = ret;
1131 goto out;
1132 }
1133 edge = list_first_entry_or_null(&cache->pending_edge,
a26195a5 1134 struct btrfs_backref_edge, list[UPPER]);
e7d571c7
QW
1135 /*
1136 * The pending list isn't empty, take the first block to
1137 * process
1138 */
1139 if (edge) {
1140 list_del_init(&edge->list[UPPER]);
1141 cur = edge->node[UPPER];
1142 }
1143 } while (edge);
1144
1f872924
QW
1145 /* Finish the upper linkage of newly added edges/nodes */
1146 ret = finish_upper_links(cache, node);
1147 if (ret < 0) {
1148 err = ret;
1149 goto out;
3fd0a558 1150 }
5d4f98a2 1151
29db137b
QW
1152 if (handle_useless_nodes(rc, node))
1153 node = NULL;
5d4f98a2 1154out:
71f572a9
QW
1155 btrfs_backref_iter_free(iter);
1156 btrfs_free_path(path);
5d4f98a2 1157 if (err) {
84780289
QW
1158 while (!list_empty(&cache->useless_node)) {
1159 lower = list_first_entry(&cache->useless_node,
a26195a5 1160 struct btrfs_backref_node, list);
75bfb9af 1161 list_del_init(&lower->list);
3fd0a558 1162 }
84780289
QW
1163 while (!list_empty(&cache->pending_edge)) {
1164 edge = list_first_entry(&cache->pending_edge,
a26195a5 1165 struct btrfs_backref_edge, list[UPPER]);
75bfb9af 1166 list_del(&edge->list[UPPER]);
3fd0a558 1167 list_del(&edge->list[LOWER]);
75bfb9af 1168 lower = edge->node[LOWER];
5d4f98a2 1169 upper = edge->node[UPPER];
3fd0a558 1170 free_backref_edge(cache, edge);
75bfb9af
JB
1171
1172 /*
1173 * Lower is no longer linked to any upper backref nodes
1174 * and isn't in the cache, we can free it ourselves.
1175 */
1176 if (list_empty(&lower->upper) &&
1177 RB_EMPTY_NODE(&lower->rb_node))
84780289 1178 list_add(&lower->list, &cache->useless_node);
75bfb9af
JB
1179
1180 if (!RB_EMPTY_NODE(&upper->rb_node))
1181 continue;
1182
01327610 1183 /* Add this guy's upper edges to the list to process */
75bfb9af 1184 list_for_each_entry(edge, &upper->upper, list[LOWER])
84780289
QW
1185 list_add_tail(&edge->list[UPPER],
1186 &cache->pending_edge);
75bfb9af 1187 if (list_empty(&upper->upper))
84780289 1188 list_add(&upper->list, &cache->useless_node);
75bfb9af
JB
1189 }
1190
84780289
QW
1191 while (!list_empty(&cache->useless_node)) {
1192 lower = list_first_entry(&cache->useless_node,
a26195a5 1193 struct btrfs_backref_node, list);
75bfb9af 1194 list_del_init(&lower->list);
0fd8c3da
LB
1195 if (lower == node)
1196 node = NULL;
75bfb9af 1197 free_backref_node(cache, lower);
5d4f98a2 1198 }
0fd8c3da 1199
8e19c973 1200 remove_backref_node(cache, node);
84780289
QW
1201 ASSERT(list_empty(&cache->useless_node) &&
1202 list_empty(&cache->pending_edge));
5d4f98a2
YZ
1203 return ERR_PTR(err);
1204 }
75bfb9af 1205 ASSERT(!node || !node->detached);
84780289
QW
1206 ASSERT(list_empty(&cache->useless_node) &&
1207 list_empty(&cache->pending_edge));
5d4f98a2
YZ
1208 return node;
1209}
1210
3fd0a558
YZ
1211/*
1212 * helper to add backref node for the newly created snapshot.
 1213 * the backref node is created by cloning the backref node that
 1214 * corresponds to the root of the source tree
1215 */
1216static int clone_backref_node(struct btrfs_trans_handle *trans,
1217 struct reloc_control *rc,
1218 struct btrfs_root *src,
1219 struct btrfs_root *dest)
1220{
1221 struct btrfs_root *reloc_root = src->reloc_root;
a26195a5
QW
1222 struct btrfs_backref_cache *cache = &rc->backref_cache;
1223 struct btrfs_backref_node *node = NULL;
1224 struct btrfs_backref_node *new_node;
1225 struct btrfs_backref_edge *edge;
1226 struct btrfs_backref_edge *new_edge;
3fd0a558
YZ
1227 struct rb_node *rb_node;
1228
1229 if (cache->last_trans > 0)
1230 update_backref_cache(trans, cache);
1231
e9a28dc5 1232 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
3fd0a558 1233 if (rb_node) {
a26195a5 1234 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
3fd0a558
YZ
1235 if (node->detached)
1236 node = NULL;
1237 else
1238 BUG_ON(node->new_bytenr != reloc_root->node->start);
1239 }
1240
1241 if (!node) {
e9a28dc5
QW
1242 rb_node = rb_simple_search(&cache->rb_root,
1243 reloc_root->commit_root->start);
3fd0a558 1244 if (rb_node) {
a26195a5 1245 node = rb_entry(rb_node, struct btrfs_backref_node,
3fd0a558
YZ
1246 rb_node);
1247 BUG_ON(node->detached);
1248 }
1249 }
1250
1251 if (!node)
1252 return 0;
1253
b1818dab
QW
1254 new_node = btrfs_backref_alloc_node(cache, dest->node->start,
1255 node->level);
3fd0a558
YZ
1256 if (!new_node)
1257 return -ENOMEM;
1258
3fd0a558 1259 new_node->lowest = node->lowest;
6848ad64 1260 new_node->checked = 1;
00246528 1261 new_node->root = btrfs_grab_root(dest);
0b530bc5 1262 ASSERT(new_node->root);
3fd0a558
YZ
1263
1264 if (!node->lowest) {
1265 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1266 new_edge = alloc_backref_edge(cache);
1267 if (!new_edge)
1268 goto fail;
1269
2a979612
QW
1270 link_backref_edge(new_edge, edge->node[LOWER], new_node,
1271 LINK_UPPER);
3fd0a558 1272 }
76b9e23d
MX
1273 } else {
1274 list_add_tail(&new_node->lower, &cache->leaves);
3fd0a558
YZ
1275 }
1276
e9a28dc5
QW
1277 rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
1278 &new_node->rb_node);
43c04fb1
JM
1279 if (rb_node)
1280 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
3fd0a558
YZ
1281
1282 if (!new_node->lowest) {
1283 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1284 list_add_tail(&new_edge->list[LOWER],
1285 &new_edge->node[LOWER]->upper);
1286 }
1287 }
1288 return 0;
1289fail:
1290 while (!list_empty(&new_node->lower)) {
1291 new_edge = list_entry(new_node->lower.next,
a26195a5 1292 struct btrfs_backref_edge, list[UPPER]);
3fd0a558
YZ
1293 list_del(&new_edge->list[UPPER]);
1294 free_backref_edge(cache, new_edge);
1295 }
1296 free_backref_node(cache, new_node);
1297 return -ENOMEM;
1298}
1299
5d4f98a2
YZ
1300/*
1301 * helper to add 'address of tree root -> reloc tree' mapping
1302 */
ffd7b339 1303static int __must_check __add_reloc_root(struct btrfs_root *root)
5d4f98a2 1304{
0b246afa 1305 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1306 struct rb_node *rb_node;
1307 struct mapping_node *node;
0b246afa 1308 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
1309
1310 node = kmalloc(sizeof(*node), GFP_NOFS);
ffd7b339
JM
1311 if (!node)
1312 return -ENOMEM;
5d4f98a2 1313
ea287ab1 1314 node->bytenr = root->commit_root->start;
5d4f98a2
YZ
1315 node->data = root;
1316
1317 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
1318 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
1319 node->bytenr, &node->rb_node);
5d4f98a2 1320 spin_unlock(&rc->reloc_root_tree.lock);
ffd7b339 1321 if (rb_node) {
0b246afa 1322 btrfs_panic(fs_info, -EEXIST,
5d163e0e
JM
1323 "Duplicate root found for start=%llu while inserting into relocation tree",
1324 node->bytenr);
ffd7b339 1325 }
5d4f98a2
YZ
1326
1327 list_add_tail(&root->root_list, &rc->reloc_roots);
1328 return 0;
1329}
1330
1331/*
c974c464 1332 * helper to delete the 'address of tree root -> reloc tree'
5d4f98a2
YZ
1333 * mapping
1334 */
c974c464 1335static void __del_reloc_root(struct btrfs_root *root)
5d4f98a2 1336{
0b246afa 1337 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1338 struct rb_node *rb_node;
1339 struct mapping_node *node = NULL;
0b246afa 1340 struct reloc_control *rc = fs_info->reloc_ctl;
f44deb74 1341 bool put_ref = false;
5d4f98a2 1342
65c6e82b 1343 if (rc && root->node) {
389305b2 1344 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
1345 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
1346 root->commit_root->start);
389305b2
QW
1347 if (rb_node) {
1348 node = rb_entry(rb_node, struct mapping_node, rb_node);
1349 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
ea287ab1 1350 RB_CLEAR_NODE(&node->rb_node);
389305b2
QW
1351 }
1352 spin_unlock(&rc->reloc_root_tree.lock);
1353 if (!node)
1354 return;
1355 BUG_ON((struct btrfs_root *)node->data != root);
5d4f98a2 1356 }
5d4f98a2 1357
f44deb74
JB
1358 /*
1359 * We only put the reloc root here if it's on the list. There's a lot
1360 * of places where the pattern is to splice the rc->reloc_roots, process
1361 * the reloc roots, and then add the reloc root back onto
1362 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
1363 * list we don't want the reference being dropped, because the guy
1364 * messing with the list is in charge of the reference.
1365 */
0b246afa 1366 spin_lock(&fs_info->trans_lock);
f44deb74
JB
1367 if (!list_empty(&root->root_list)) {
1368 put_ref = true;
1369 list_del_init(&root->root_list);
1370 }
0b246afa 1371 spin_unlock(&fs_info->trans_lock);
f44deb74
JB
1372 if (put_ref)
1373 btrfs_put_root(root);
c974c464
WS
1374 kfree(node);
1375}
1376
1377/*
1378 * helper to update the 'address of tree root -> reloc tree'
1379 * mapping
1380 */
ea287ab1 1381static int __update_reloc_root(struct btrfs_root *root)
c974c464 1382{
0b246afa 1383 struct btrfs_fs_info *fs_info = root->fs_info;
c974c464
WS
1384 struct rb_node *rb_node;
1385 struct mapping_node *node = NULL;
0b246afa 1386 struct reloc_control *rc = fs_info->reloc_ctl;
c974c464
WS
1387
1388 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
1389 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
1390 root->commit_root->start);
c974c464
WS
1391 if (rb_node) {
1392 node = rb_entry(rb_node, struct mapping_node, rb_node);
1393 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
5d4f98a2 1394 }
c974c464
WS
1395 spin_unlock(&rc->reloc_root_tree.lock);
1396
1397 if (!node)
1398 return 0;
1399 BUG_ON((struct btrfs_root *)node->data != root);
1400
1401 spin_lock(&rc->reloc_root_tree.lock);
ea287ab1 1402 node->bytenr = root->node->start;
e9a28dc5
QW
1403 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
1404 node->bytenr, &node->rb_node);
c974c464
WS
1405 spin_unlock(&rc->reloc_root_tree.lock);
1406 if (rb_node)
1407 backref_tree_panic(rb_node, -EEXIST, node->bytenr);
5d4f98a2
YZ
1408 return 0;
1409}
1410
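/*
 * Create a reloc root item and tree: COW @root's commit root (when building
 * the reloc tree for the subvolume itself) or its current root node (when
 * cloning for a freshly created snapshot) into the BTRFS_TREE_RELOC_OBJECTID
 * tree, insert the root item and read the new root back.
 */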
3fd0a558
YZ
1411static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1412 struct btrfs_root *root, u64 objectid)
5d4f98a2 1413{
0b246afa 1414 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1415 struct btrfs_root *reloc_root;
1416 struct extent_buffer *eb;
1417 struct btrfs_root_item *root_item;
1418 struct btrfs_key root_key;
1419 int ret;
1420
5d4f98a2
YZ
1421 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1422 BUG_ON(!root_item);
1423
1424 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1425 root_key.type = BTRFS_ROOT_ITEM_KEY;
3fd0a558 1426 root_key.offset = objectid;
5d4f98a2 1427
3fd0a558 1428 if (root->root_key.objectid == objectid) {
054570a1
FM
1429 u64 commit_root_gen;
1430
3fd0a558
YZ
1431 /* called by btrfs_init_reloc_root */
1432 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1433 BTRFS_TREE_RELOC_OBJECTID);
1434 BUG_ON(ret);
054570a1
FM
1435 /*
1436 * Set the last_snapshot field to the generation of the commit
1437 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1438 * correctly (returns true) when the relocation root is created
1439 * either inside the critical section of a transaction commit
1440 * (through transaction.c:qgroup_account_snapshot()) and when
1441 * it's created before the transaction commit is started.
1442 */
1443 commit_root_gen = btrfs_header_generation(root->commit_root);
1444 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
3fd0a558
YZ
1445 } else {
1446 /*
1447 * called by btrfs_reloc_post_snapshot_hook.
1448 * the source tree is a reloc tree, all tree blocks
1449 * modified after it was created have RELOC flag
1450 * set in their headers. so it's OK to not update
1451 * the 'last_snapshot'.
1452 */
1453 ret = btrfs_copy_root(trans, root, root->node, &eb,
1454 BTRFS_TREE_RELOC_OBJECTID);
1455 BUG_ON(ret);
1456 }
5d4f98a2 1457
5d4f98a2 1458 memcpy(root_item, &root->root_item, sizeof(*root_item));
5d4f98a2
YZ
1459 btrfs_set_root_bytenr(root_item, eb->start);
1460 btrfs_set_root_level(root_item, btrfs_header_level(eb));
1461 btrfs_set_root_generation(root_item, trans->transid);
3fd0a558
YZ
1462
1463 if (root->root_key.objectid == objectid) {
1464 btrfs_set_root_refs(root_item, 0);
1465 memset(&root_item->drop_progress, 0,
1466 sizeof(struct btrfs_disk_key));
1467 root_item->drop_level = 0;
1468 }
5d4f98a2
YZ
1469
1470 btrfs_tree_unlock(eb);
1471 free_extent_buffer(eb);
1472
0b246afa 1473 ret = btrfs_insert_root(trans, fs_info->tree_root,
5d4f98a2
YZ
1474 &root_key, root_item);
1475 BUG_ON(ret);
1476 kfree(root_item);
1477
3dbf1738 1478 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
5d4f98a2 1479 BUG_ON(IS_ERR(reloc_root));
3dbf1738 1480 set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
5d4f98a2 1481 reloc_root->last_trans = trans->transid;
3fd0a558
YZ
1482 return reloc_root;
1483}
1484
1485/*
1486 * create reloc tree for a given fs tree. reloc tree is just a
1487 * snapshot of the fs tree with special root objectid.
f44deb74
JB
1488 *
1489 * The reloc_root comes out of here with two references, one for
1490 * root->reloc_root, and another for being on the rc->reloc_roots list.
3fd0a558
YZ
1491 */
1492int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1493 struct btrfs_root *root)
1494{
0b246afa 1495 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 1496 struct btrfs_root *reloc_root;
0b246afa 1497 struct reloc_control *rc = fs_info->reloc_ctl;
20dd2cbf 1498 struct btrfs_block_rsv *rsv;
3fd0a558 1499 int clear_rsv = 0;
ffd7b339 1500 int ret;
3fd0a558 1501
aec7db3b 1502 if (!rc)
2abc726a
JB
1503 return 0;
1504
1fac4a54
QW
1505 /*
1506 * The subvolume has reloc tree but the swap is finished, no need to
1507 * create/update the dead reloc tree
1508 */
6282675e 1509 if (reloc_root_is_dead(root))
1fac4a54
QW
1510 return 0;
1511
aec7db3b
JB
1512 /*
1513 * This is subtle but important. We do not do
1514 * record_root_in_transaction for reloc roots, instead we record their
1515 * corresponding fs root, and then here we update the last trans for the
1516 * reloc root. This means that we have to do this for the entire life
1517 * of the reloc root, regardless of which stage of the relocation we are
1518 * in.
1519 */
3fd0a558
YZ
1520 if (root->reloc_root) {
1521 reloc_root = root->reloc_root;
1522 reloc_root->last_trans = trans->transid;
1523 return 0;
1524 }
1525
aec7db3b
JB
1526 /*
1527 * We are merging reloc roots, we do not need new reloc trees. Also
1528 * reloc trees never need their own reloc tree.
1529 */
1530 if (!rc->create_reloc_tree ||
1531 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1532 return 0;
1533
20dd2cbf
MX
1534 if (!trans->reloc_reserved) {
1535 rsv = trans->block_rsv;
3fd0a558
YZ
1536 trans->block_rsv = rc->block_rsv;
1537 clear_rsv = 1;
1538 }
1539 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1540 if (clear_rsv)
20dd2cbf 1541 trans->block_rsv = rsv;
5d4f98a2 1542
ffd7b339
JM
1543 ret = __add_reloc_root(reloc_root);
1544 BUG_ON(ret < 0);
f44deb74 1545 root->reloc_root = btrfs_grab_root(reloc_root);
5d4f98a2
YZ
1546 return 0;
1547}
1548
1549/*
1550 * update root item of reloc tree
1551 */
1552int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1553 struct btrfs_root *root)
1554{
0b246afa 1555 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1556 struct btrfs_root *reloc_root;
1557 struct btrfs_root_item *root_item;
5d4f98a2
YZ
1558 int ret;
1559
6282675e 1560 if (!have_reloc_root(root))
7585717f 1561 goto out;
5d4f98a2
YZ
1562
1563 reloc_root = root->reloc_root;
1564 root_item = &reloc_root->root_item;
1565
f44deb74
JB
1566 /*
1567 * We are probably ok here, but __del_reloc_root() will drop its ref of
1568 * the root. We have the ref for root->reloc_root, but just in case
1569 * hold it while we update the reloc root.
1570 */
1571 btrfs_grab_root(reloc_root);
1572
d2311e69 1573 /* root->reloc_root will stay until current relocation finished */
0b246afa 1574 if (fs_info->reloc_ctl->merge_reloc_tree &&
3fd0a558 1575 btrfs_root_refs(root_item) == 0) {
d2311e69 1576 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
6282675e
QW
1577 /*
1578 * Mark the tree as dead before we change reloc_root so
1579 * have_reloc_root will not touch it from now on.
1580 */
1581 smp_wmb();
c974c464 1582 __del_reloc_root(reloc_root);
5d4f98a2
YZ
1583 }
1584
5d4f98a2 1585 if (reloc_root->commit_root != reloc_root->node) {
ea287ab1 1586 __update_reloc_root(reloc_root);
5d4f98a2
YZ
1587 btrfs_set_root_node(root_item, reloc_root->node);
1588 free_extent_buffer(reloc_root->commit_root);
1589 reloc_root->commit_root = btrfs_root_node(reloc_root);
1590 }
1591
0b246afa 1592 ret = btrfs_update_root(trans, fs_info->tree_root,
5d4f98a2
YZ
1593 &reloc_root->root_key, root_item);
1594 BUG_ON(ret);
f44deb74 1595 btrfs_put_root(reloc_root);
7585717f 1596out:
5d4f98a2
YZ
1597 return 0;
1598}
1599
1600/*
1601 * helper to find first cached inode with inode number >= objectid
1602 * in a subvolume
1603 */
1604static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1605{
1606 struct rb_node *node;
1607 struct rb_node *prev;
1608 struct btrfs_inode *entry;
1609 struct inode *inode;
1610
1611 spin_lock(&root->inode_lock);
1612again:
1613 node = root->inode_tree.rb_node;
1614 prev = NULL;
1615 while (node) {
1616 prev = node;
1617 entry = rb_entry(node, struct btrfs_inode, rb_node);
1618
4a0cc7ca 1619 if (objectid < btrfs_ino(entry))
5d4f98a2 1620 node = node->rb_left;
4a0cc7ca 1621 else if (objectid > btrfs_ino(entry))
5d4f98a2
YZ
1622 node = node->rb_right;
1623 else
1624 break;
1625 }
1626 if (!node) {
1627 while (prev) {
1628 entry = rb_entry(prev, struct btrfs_inode, rb_node);
4a0cc7ca 1629 if (objectid <= btrfs_ino(entry)) {
5d4f98a2
YZ
1630 node = prev;
1631 break;
1632 }
1633 prev = rb_next(prev);
1634 }
1635 }
1636 while (node) {
1637 entry = rb_entry(node, struct btrfs_inode, rb_node);
1638 inode = igrab(&entry->vfs_inode);
1639 if (inode) {
1640 spin_unlock(&root->inode_lock);
1641 return inode;
1642 }
1643
4a0cc7ca 1644 objectid = btrfs_ino(entry) + 1;
5d4f98a2
YZ
1645 if (cond_resched_lock(&root->inode_lock))
1646 goto again;
1647
1648 node = rb_next(node);
1649 }
1650 spin_unlock(&root->inode_lock);
1651 return NULL;
1652}
1653
5d4f98a2
YZ
1654/*
1655 * get new location of data
1656 */
1657static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1658 u64 bytenr, u64 num_bytes)
1659{
1660 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1661 struct btrfs_path *path;
1662 struct btrfs_file_extent_item *fi;
1663 struct extent_buffer *leaf;
1664 int ret;
1665
1666 path = btrfs_alloc_path();
1667 if (!path)
1668 return -ENOMEM;
1669
1670 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
f85b7379
DS
1671 ret = btrfs_lookup_file_extent(NULL, root, path,
1672 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
5d4f98a2
YZ
1673 if (ret < 0)
1674 goto out;
1675 if (ret > 0) {
1676 ret = -ENOENT;
1677 goto out;
1678 }
1679
1680 leaf = path->nodes[0];
1681 fi = btrfs_item_ptr(leaf, path->slots[0],
1682 struct btrfs_file_extent_item);
1683
1684 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1685 btrfs_file_extent_compression(leaf, fi) ||
1686 btrfs_file_extent_encryption(leaf, fi) ||
1687 btrfs_file_extent_other_encoding(leaf, fi));
1688
1689 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
83d4cfd4 1690 ret = -EINVAL;
5d4f98a2
YZ
1691 goto out;
1692 }
1693
3fd0a558 1694 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5d4f98a2
YZ
1695 ret = 0;
1696out:
1697 btrfs_free_path(path);
1698 return ret;
1699}
1700
1701/*
1702 * update file extent items in the tree leaf to point to
1703 * the new locations.
1704 */
3fd0a558
YZ
1705static noinline_for_stack
1706int replace_file_extents(struct btrfs_trans_handle *trans,
1707 struct reloc_control *rc,
1708 struct btrfs_root *root,
1709 struct extent_buffer *leaf)
5d4f98a2 1710{
0b246afa 1711 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1712 struct btrfs_key key;
1713 struct btrfs_file_extent_item *fi;
1714 struct inode *inode = NULL;
5d4f98a2
YZ
1715 u64 parent;
1716 u64 bytenr;
3fd0a558 1717 u64 new_bytenr = 0;
5d4f98a2
YZ
1718 u64 num_bytes;
1719 u64 end;
1720 u32 nritems;
1721 u32 i;
83d4cfd4 1722 int ret = 0;
5d4f98a2
YZ
1723 int first = 1;
1724 int dirty = 0;
1725
1726 if (rc->stage != UPDATE_DATA_PTRS)
1727 return 0;
1728
1729 /* reloc trees always use full backref */
1730 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1731 parent = leaf->start;
1732 else
1733 parent = 0;
1734
1735 nritems = btrfs_header_nritems(leaf);
1736 for (i = 0; i < nritems; i++) {
82fa113f
QW
1737 struct btrfs_ref ref = { 0 };
1738
5d4f98a2
YZ
1739 cond_resched();
1740 btrfs_item_key_to_cpu(leaf, &key, i);
1741 if (key.type != BTRFS_EXTENT_DATA_KEY)
1742 continue;
1743 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1744 if (btrfs_file_extent_type(leaf, fi) ==
1745 BTRFS_FILE_EXTENT_INLINE)
1746 continue;
1747 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1748 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1749 if (bytenr == 0)
1750 continue;
9569cc20
QW
1751 if (!in_range(bytenr, rc->block_group->start,
1752 rc->block_group->length))
5d4f98a2
YZ
1753 continue;
1754
1755 /*
 1756 * if we are modifying a block in the fs tree, wait for readpage
1757 * to complete and drop the extent cache
1758 */
1759 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5d4f98a2
YZ
1760 if (first) {
1761 inode = find_next_inode(root, key.objectid);
5d4f98a2 1762 first = 0;
4a0cc7ca 1763 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
3fd0a558 1764 btrfs_add_delayed_iput(inode);
5d4f98a2 1765 inode = find_next_inode(root, key.objectid);
5d4f98a2 1766 }
4a0cc7ca 1767 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
5d4f98a2
YZ
1768 end = key.offset +
1769 btrfs_file_extent_num_bytes(leaf, fi);
1770 WARN_ON(!IS_ALIGNED(key.offset,
0b246afa
JM
1771 fs_info->sectorsize));
1772 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
1773 end--;
1774 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1775 key.offset, end);
5d4f98a2
YZ
1776 if (!ret)
1777 continue;
1778
dcdbc059
NB
1779 btrfs_drop_extent_cache(BTRFS_I(inode),
1780 key.offset, end, 1);
5d4f98a2 1781 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1782 key.offset, end);
5d4f98a2
YZ
1783 }
1784 }
1785
1786 ret = get_new_location(rc->data_inode, &new_bytenr,
1787 bytenr, num_bytes);
83d4cfd4
JB
1788 if (ret) {
1789 /*
1790 * Don't have to abort since we've not changed anything
1791 * in the file extent yet.
1792 */
1793 break;
3fd0a558 1794 }
5d4f98a2
YZ
1795
1796 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1797 dirty = 1;
1798
1799 key.offset -= btrfs_file_extent_offset(leaf, fi);
82fa113f
QW
1800 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1801 num_bytes, parent);
1802 ref.real_root = root->root_key.objectid;
1803 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1804 key.objectid, key.offset);
1805 ret = btrfs_inc_extent_ref(trans, &ref);
83d4cfd4 1806 if (ret) {
66642832 1807 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1808 break;
1809 }
5d4f98a2 1810
ffd4bb2a
QW
1811 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1812 num_bytes, parent);
1813 ref.real_root = root->root_key.objectid;
1814 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1815 key.objectid, key.offset);
1816 ret = btrfs_free_extent(trans, &ref);
83d4cfd4 1817 if (ret) {
66642832 1818 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1819 break;
1820 }
5d4f98a2
YZ
1821 }
1822 if (dirty)
1823 btrfs_mark_buffer_dirty(leaf);
3fd0a558
YZ
1824 if (inode)
1825 btrfs_add_delayed_iput(inode);
83d4cfd4 1826 return ret;
5d4f98a2
YZ
1827}
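
/*
 * A condensed sketch of the per-item bookkeeping done by
 * replace_file_extents() above.  The callbacks are hypothetical stand-ins
 * for the btrfs_init_generic_ref()/btrfs_inc_extent_ref()/
 * btrfs_free_extent() sequence; the point is the ordering: the reference
 * on the relocated copy is taken before the reference on the old copy is
 * dropped, so the data is never left unreferenced in the extent tree.
 */
static int swap_one_data_pointer(u64 old_bytenr, u64 new_bytenr,
				 int (*add_data_ref)(u64 bytenr),
				 int (*drop_data_ref)(u64 bytenr))
{
	int ret;

	ret = add_data_ref(new_bytenr);		/* pin the relocated copy first */
	if (ret)
		return ret;
	return drop_data_ref(old_bytenr);	/* then release the old copy */
}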
1828
1829static noinline_for_stack
1830int memcmp_node_keys(struct extent_buffer *eb, int slot,
1831 struct btrfs_path *path, int level)
1832{
1833 struct btrfs_disk_key key1;
1834 struct btrfs_disk_key key2;
1835 btrfs_node_key(eb, &key1, slot);
1836 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1837 return memcmp(&key1, &key2, sizeof(key1));
1838}
1839
1840/*
1841 * try to replace tree blocks in fs tree with the new blocks
 1842 * in reloc tree. tree blocks that haven't been modified since the
 1843 * reloc tree was created can be replaced.
1844 *
1845 * if a block was replaced, level of the block + 1 is returned.
1846 * if no block got replaced, 0 is returned. if there are other
1847 * errors, a negative error number is returned.
1848 */
3fd0a558 1849static noinline_for_stack
3d0174f7 1850int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
3fd0a558
YZ
1851 struct btrfs_root *dest, struct btrfs_root *src,
1852 struct btrfs_path *path, struct btrfs_key *next_key,
1853 int lowest_level, int max_level)
5d4f98a2 1854{
0b246afa 1855 struct btrfs_fs_info *fs_info = dest->fs_info;
5d4f98a2
YZ
1856 struct extent_buffer *eb;
1857 struct extent_buffer *parent;
82fa113f 1858 struct btrfs_ref ref = { 0 };
5d4f98a2
YZ
1859 struct btrfs_key key;
1860 u64 old_bytenr;
1861 u64 new_bytenr;
1862 u64 old_ptr_gen;
1863 u64 new_ptr_gen;
1864 u64 last_snapshot;
1865 u32 blocksize;
3fd0a558 1866 int cow = 0;
5d4f98a2
YZ
1867 int level;
1868 int ret;
1869 int slot;
1870
1871 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1872 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
5d4f98a2
YZ
1873
1874 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
3fd0a558 1875again:
5d4f98a2
YZ
1876 slot = path->slots[lowest_level];
1877 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1878
1879 eb = btrfs_lock_root_node(dest);
8bead258 1880 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
1881 level = btrfs_header_level(eb);
1882
1883 if (level < lowest_level) {
1884 btrfs_tree_unlock(eb);
1885 free_extent_buffer(eb);
1886 return 0;
1887 }
1888
3fd0a558
YZ
1889 if (cow) {
1890 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1891 BUG_ON(ret);
1892 }
8bead258 1893 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
1894
1895 if (next_key) {
1896 next_key->objectid = (u64)-1;
1897 next_key->type = (u8)-1;
1898 next_key->offset = (u64)-1;
1899 }
1900
1901 parent = eb;
1902 while (1) {
581c1760
QW
1903 struct btrfs_key first_key;
1904
5d4f98a2
YZ
1905 level = btrfs_header_level(parent);
1906 BUG_ON(level < lowest_level);
1907
1908 ret = btrfs_bin_search(parent, &key, level, &slot);
cbca7d59
FM
1909 if (ret < 0)
1910 break;
5d4f98a2
YZ
1911 if (ret && slot > 0)
1912 slot--;
1913
1914 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1915 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1916
1917 old_bytenr = btrfs_node_blockptr(parent, slot);
0b246afa 1918 blocksize = fs_info->nodesize;
5d4f98a2 1919 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
17515f1b 1920 btrfs_node_key_to_cpu(parent, &first_key, slot);
5d4f98a2
YZ
1921
1922 if (level <= max_level) {
1923 eb = path->nodes[level];
1924 new_bytenr = btrfs_node_blockptr(eb,
1925 path->slots[level]);
1926 new_ptr_gen = btrfs_node_ptr_generation(eb,
1927 path->slots[level]);
1928 } else {
1929 new_bytenr = 0;
1930 new_ptr_gen = 0;
1931 }
1932
fae7f21c 1933 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
5d4f98a2
YZ
1934 ret = level;
1935 break;
1936 }
1937
1938 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1939 memcmp_node_keys(parent, slot, path, level)) {
3fd0a558 1940 if (level <= lowest_level) {
5d4f98a2
YZ
1941 ret = 0;
1942 break;
1943 }
1944
581c1760
QW
1945 eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1946 level - 1, &first_key);
64c043de
LB
1947 if (IS_ERR(eb)) {
1948 ret = PTR_ERR(eb);
264813ac 1949 break;
64c043de
LB
1950 } else if (!extent_buffer_uptodate(eb)) {
1951 ret = -EIO;
416bc658 1952 free_extent_buffer(eb);
379cde74 1953 break;
416bc658 1954 }
5d4f98a2 1955 btrfs_tree_lock(eb);
3fd0a558
YZ
1956 if (cow) {
1957 ret = btrfs_cow_block(trans, dest, eb, parent,
1958 slot, &eb);
1959 BUG_ON(ret);
5d4f98a2 1960 }
8bead258 1961 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
1962
1963 btrfs_tree_unlock(parent);
1964 free_extent_buffer(parent);
1965
1966 parent = eb;
1967 continue;
1968 }
1969
3fd0a558
YZ
1970 if (!cow) {
1971 btrfs_tree_unlock(parent);
1972 free_extent_buffer(parent);
1973 cow = 1;
1974 goto again;
1975 }
1976
5d4f98a2
YZ
1977 btrfs_node_key_to_cpu(path->nodes[level], &key,
1978 path->slots[level]);
b3b4aa74 1979 btrfs_release_path(path);
5d4f98a2
YZ
1980
1981 path->lowest_level = level;
1982 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1983 path->lowest_level = 0;
1984 BUG_ON(ret);
1985
824d8dff
QW
1986 /*
 1987 * Inform qgroup to trace both subtrees.
1988 *
1989 * We must trace both trees.
1990 * 1) Tree reloc subtree
1991 * If not traced, we will leak data numbers
1992 * 2) Fs subtree
1993 * If not traced, we will double count old data
f616f5cd
QW
1994 *
1995 * We don't scan the subtree right now, but only record
1996 * the swapped tree blocks.
1997 * The real subtree rescan is delayed until we have new
1998 * CoW on the subtree root node before transaction commit.
824d8dff 1999 */
370a11b8
QW
2000 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
2001 rc->block_group, parent, slot,
2002 path->nodes[level], path->slots[level],
2003 last_snapshot);
2004 if (ret < 0)
2005 break;
5d4f98a2
YZ
2006 /*
2007 * swap blocks in fs tree and reloc tree.
2008 */
2009 btrfs_set_node_blockptr(parent, slot, new_bytenr);
2010 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
2011 btrfs_mark_buffer_dirty(parent);
2012
2013 btrfs_set_node_blockptr(path->nodes[level],
2014 path->slots[level], old_bytenr);
2015 btrfs_set_node_ptr_generation(path->nodes[level],
2016 path->slots[level], old_ptr_gen);
2017 btrfs_mark_buffer_dirty(path->nodes[level]);
2018
82fa113f
QW
2019 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
2020 blocksize, path->nodes[level]->start);
2021 ref.skip_qgroup = true;
2022 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2023 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2 2024 BUG_ON(ret);
82fa113f
QW
2025 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
2026 blocksize, 0);
2027 ref.skip_qgroup = true;
2028 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2029 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
2030 BUG_ON(ret);
2031
ffd4bb2a
QW
2032 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
2033 blocksize, path->nodes[level]->start);
2034 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2035 ref.skip_qgroup = true;
2036 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
2037 BUG_ON(ret);
2038
ffd4bb2a
QW
2039 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
2040 blocksize, 0);
2041 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2042 ref.skip_qgroup = true;
2043 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
2044 BUG_ON(ret);
2045
2046 btrfs_unlock_up_safe(path, 0);
2047
2048 ret = level;
2049 break;
2050 }
2051 btrfs_tree_unlock(parent);
2052 free_extent_buffer(parent);
2053 return ret;
2054}
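
/*
 * A sketch of the pointer swap at the heart of replace_path(), with
 * hypothetical callbacks standing in for btrfs_set_node_blockptr(),
 * btrfs_inc_extent_ref() and btrfs_free_extent().  After the swap each
 * parent references the other tree's child, so two tree references are
 * added and two are dropped to keep the counts balanced.
 */
struct swap_ctx {
	void (*set_ptr)(int tree, u64 child);	/* tree 0 = fs, 1 = reloc */
	int (*add_tree_ref)(u64 child, int tree);
	int (*drop_tree_ref)(u64 child, int tree);
};

static int swap_tree_pointers(struct swap_ctx *ctx, u64 fs_child,
			      u64 reloc_child)
{
	ctx->set_ptr(0, reloc_child);		/* fs tree adopts the reloc child */
	ctx->set_ptr(1, fs_child);		/* reloc tree adopts the fs child */

	if (ctx->add_tree_ref(fs_child, 1) ||	/* reloc tree's new reference */
	    ctx->add_tree_ref(reloc_child, 0) ||	/* fs tree's new reference */
	    ctx->drop_tree_ref(reloc_child, 1) ||	/* reloc tree's old reference */
	    ctx->drop_tree_ref(fs_child, 0))	/* fs tree's old reference */
		return -1;
	return 0;
}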
2055
2056/*
2057 * helper to find next relocated block in reloc tree
2058 */
2059static noinline_for_stack
2060int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2061 int *level)
2062{
2063 struct extent_buffer *eb;
2064 int i;
2065 u64 last_snapshot;
2066 u32 nritems;
2067
2068 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2069
2070 for (i = 0; i < *level; i++) {
2071 free_extent_buffer(path->nodes[i]);
2072 path->nodes[i] = NULL;
2073 }
2074
2075 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
2076 eb = path->nodes[i];
2077 nritems = btrfs_header_nritems(eb);
2078 while (path->slots[i] + 1 < nritems) {
2079 path->slots[i]++;
2080 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2081 last_snapshot)
2082 continue;
2083
2084 *level = i;
2085 return 0;
2086 }
2087 free_extent_buffer(path->nodes[i]);
2088 path->nodes[i] = NULL;
2089 }
2090 return 1;
2091}
2092
2093/*
2094 * walk down reloc tree to find relocated block of lowest level
2095 */
2096static noinline_for_stack
2097int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2098 int *level)
2099{
2ff7e61e 2100 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
2101 struct extent_buffer *eb = NULL;
2102 int i;
2103 u64 bytenr;
2104 u64 ptr_gen = 0;
2105 u64 last_snapshot;
5d4f98a2
YZ
2106 u32 nritems;
2107
2108 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2109
2110 for (i = *level; i > 0; i--) {
581c1760
QW
2111 struct btrfs_key first_key;
2112
5d4f98a2
YZ
2113 eb = path->nodes[i];
2114 nritems = btrfs_header_nritems(eb);
2115 while (path->slots[i] < nritems) {
2116 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2117 if (ptr_gen > last_snapshot)
2118 break;
2119 path->slots[i]++;
2120 }
2121 if (path->slots[i] >= nritems) {
2122 if (i == *level)
2123 break;
2124 *level = i + 1;
2125 return 0;
2126 }
2127 if (i == 1) {
2128 *level = i;
2129 return 0;
2130 }
2131
2132 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
581c1760
QW
2133 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2134 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2135 &first_key);
64c043de
LB
2136 if (IS_ERR(eb)) {
2137 return PTR_ERR(eb);
2138 } else if (!extent_buffer_uptodate(eb)) {
416bc658
JB
2139 free_extent_buffer(eb);
2140 return -EIO;
2141 }
5d4f98a2
YZ
2142 BUG_ON(btrfs_header_level(eb) != i - 1);
2143 path->nodes[i - 1] = eb;
2144 path->slots[i - 1] = 0;
2145 }
2146 return 1;
2147}
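
/*
 * Both walkers above rely on the same filter, extracted here as a tiny
 * predicate (the name is illustrative, not from relocation.c): a slot is
 * only worth visiting when the pointed-to block changed after the reloc
 * tree was created, i.e. its pointer generation is newer than the source
 * tree's last_snapshot.
 */
static int slot_was_relocated(u64 ptr_generation, u64 last_snapshot)
{
	return ptr_generation > last_snapshot;
}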
2148
2149/*
 2150 * invalidate extent cache for file extents whose key is in the range of
2151 * [min_key, max_key)
2152 */
2153static int invalidate_extent_cache(struct btrfs_root *root,
2154 struct btrfs_key *min_key,
2155 struct btrfs_key *max_key)
2156{
0b246afa 2157 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
2158 struct inode *inode = NULL;
2159 u64 objectid;
2160 u64 start, end;
33345d01 2161 u64 ino;
5d4f98a2
YZ
2162
2163 objectid = min_key->objectid;
2164 while (1) {
2165 cond_resched();
2166 iput(inode);
2167
2168 if (objectid > max_key->objectid)
2169 break;
2170
2171 inode = find_next_inode(root, objectid);
2172 if (!inode)
2173 break;
4a0cc7ca 2174 ino = btrfs_ino(BTRFS_I(inode));
5d4f98a2 2175
33345d01 2176 if (ino > max_key->objectid) {
5d4f98a2
YZ
2177 iput(inode);
2178 break;
2179 }
2180
33345d01 2181 objectid = ino + 1;
5d4f98a2
YZ
2182 if (!S_ISREG(inode->i_mode))
2183 continue;
2184
33345d01 2185 if (unlikely(min_key->objectid == ino)) {
5d4f98a2
YZ
2186 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2187 continue;
2188 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2189 start = 0;
2190 else {
2191 start = min_key->offset;
0b246afa 2192 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
5d4f98a2
YZ
2193 }
2194 } else {
2195 start = 0;
2196 }
2197
33345d01 2198 if (unlikely(max_key->objectid == ino)) {
5d4f98a2
YZ
2199 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2200 continue;
2201 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2202 end = (u64)-1;
2203 } else {
2204 if (max_key->offset == 0)
2205 continue;
2206 end = max_key->offset;
0b246afa 2207 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
2208 end--;
2209 }
2210 } else {
2211 end = (u64)-1;
2212 }
2213
2214 /* the lock_extent waits for readpage to complete */
d0082371 2215 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
dcdbc059 2216 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
d0082371 2217 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
5d4f98a2
YZ
2218 }
2219 return 0;
2220}
2221
2222static int find_next_key(struct btrfs_path *path, int level,
2223 struct btrfs_key *key)
2224
2225{
2226 while (level < BTRFS_MAX_LEVEL) {
2227 if (!path->nodes[level])
2228 break;
2229 if (path->slots[level] + 1 <
2230 btrfs_header_nritems(path->nodes[level])) {
2231 btrfs_node_key_to_cpu(path->nodes[level], key,
2232 path->slots[level] + 1);
2233 return 0;
2234 }
2235 level++;
2236 }
2237 return 1;
2238}
2239
d2311e69
QW
2240/*
2241 * Insert current subvolume into reloc_control::dirty_subvol_roots
2242 */
2243static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2244 struct reloc_control *rc,
2245 struct btrfs_root *root)
2246{
2247 struct btrfs_root *reloc_root = root->reloc_root;
2248 struct btrfs_root_item *reloc_root_item;
2249
2250 /* @root must be a subvolume tree root with a valid reloc tree */
2251 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2252 ASSERT(reloc_root);
2253
2254 reloc_root_item = &reloc_root->root_item;
2255 memset(&reloc_root_item->drop_progress, 0,
2256 sizeof(reloc_root_item->drop_progress));
2257 reloc_root_item->drop_level = 0;
2258 btrfs_set_root_refs(reloc_root_item, 0);
2259 btrfs_update_reloc_root(trans, root);
2260
2261 if (list_empty(&root->reloc_dirty_list)) {
00246528 2262 btrfs_grab_root(root);
d2311e69
QW
2263 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2264 }
2265}
2266
2267static int clean_dirty_subvols(struct reloc_control *rc)
2268{
2269 struct btrfs_root *root;
2270 struct btrfs_root *next;
2271 int ret = 0;
30d40577 2272 int ret2;
d2311e69
QW
2273
2274 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2275 reloc_dirty_list) {
30d40577
QW
2276 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2277 /* Merged subvolume, cleanup its reloc root */
2278 struct btrfs_root *reloc_root = root->reloc_root;
d2311e69 2279
30d40577
QW
2280 list_del_init(&root->reloc_dirty_list);
2281 root->reloc_root = NULL;
6282675e
QW
2282 /*
2283 * Need barrier to ensure clear_bit() only happens after
2284 * root->reloc_root = NULL. Pairs with have_reloc_root.
2285 */
2286 smp_wmb();
1fac4a54 2287 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
f28de8d8 2288 if (reloc_root) {
f44deb74
JB
2289 /*
2290 * btrfs_drop_snapshot drops our ref we hold for
2291 * ->reloc_root. If it fails however we must
2292 * drop the ref ourselves.
2293 */
f28de8d8 2294 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
f44deb74
JB
2295 if (ret2 < 0) {
2296 btrfs_put_root(reloc_root);
2297 if (!ret)
2298 ret = ret2;
2299 }
f28de8d8 2300 }
00246528 2301 btrfs_put_root(root);
30d40577
QW
2302 } else {
2303 /* Orphan reloc tree, just clean it up */
0078a9f9 2304 ret2 = btrfs_drop_snapshot(root, 0, 1);
f44deb74
JB
2305 if (ret2 < 0) {
2306 btrfs_put_root(root);
2307 if (!ret)
2308 ret = ret2;
2309 }
d2311e69 2310 }
d2311e69
QW
2311 }
2312 return ret;
2313}
2314
5d4f98a2
YZ
2315/*
2316 * merge the relocated tree blocks in reloc tree with corresponding
2317 * fs tree.
2318 */
2319static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2320 struct btrfs_root *root)
2321{
0b246afa 2322 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
2323 struct btrfs_key key;
2324 struct btrfs_key next_key;
9e6a0c52 2325 struct btrfs_trans_handle *trans = NULL;
5d4f98a2
YZ
2326 struct btrfs_root *reloc_root;
2327 struct btrfs_root_item *root_item;
2328 struct btrfs_path *path;
3fd0a558 2329 struct extent_buffer *leaf;
5d4f98a2
YZ
2330 int level;
2331 int max_level;
2332 int replaced = 0;
2333 int ret;
2334 int err = 0;
3fd0a558 2335 u32 min_reserved;
5d4f98a2
YZ
2336
2337 path = btrfs_alloc_path();
2338 if (!path)
2339 return -ENOMEM;
e4058b54 2340 path->reada = READA_FORWARD;
5d4f98a2
YZ
2341
2342 reloc_root = root->reloc_root;
2343 root_item = &reloc_root->root_item;
2344
2345 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2346 level = btrfs_root_level(root_item);
67439dad 2347 atomic_inc(&reloc_root->node->refs);
5d4f98a2
YZ
2348 path->nodes[level] = reloc_root->node;
2349 path->slots[level] = 0;
2350 } else {
2351 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2352
2353 level = root_item->drop_level;
2354 BUG_ON(level == 0);
2355 path->lowest_level = level;
2356 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
33c66f43 2357 path->lowest_level = 0;
5d4f98a2
YZ
2358 if (ret < 0) {
2359 btrfs_free_path(path);
2360 return ret;
2361 }
2362
2363 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2364 path->slots[level]);
2365 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2366
2367 btrfs_unlock_up_safe(path, 0);
2368 }
2369
0b246afa 2370 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
3fd0a558 2371 memset(&next_key, 0, sizeof(next_key));
5d4f98a2 2372
3fd0a558 2373 while (1) {
08e007d2
MX
2374 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2375 BTRFS_RESERVE_FLUSH_ALL);
3fd0a558 2376 if (ret) {
9e6a0c52
JB
2377 err = ret;
2378 goto out;
5d4f98a2 2379 }
9e6a0c52
JB
2380 trans = btrfs_start_transaction(root, 0);
2381 if (IS_ERR(trans)) {
2382 err = PTR_ERR(trans);
2383 trans = NULL;
2384 goto out;
2385 }
2abc726a
JB
2386
2387 /*
2388 * At this point we no longer have a reloc_control, so we can't
2389 * depend on btrfs_init_reloc_root to update our last_trans.
2390 *
2391 * But that's ok, we started the trans handle on our
2392 * corresponding fs_root, which means it's been added to the
2393 * dirty list. At commit time we'll still call
2394 * btrfs_update_reloc_root() and update our root item
2395 * appropriately.
2396 */
2397 reloc_root->last_trans = trans->transid;
9e6a0c52 2398 trans->block_rsv = rc->block_rsv;
5d4f98a2 2399
5d4f98a2 2400 replaced = 0;
5d4f98a2
YZ
2401 max_level = level;
2402
2403 ret = walk_down_reloc_tree(reloc_root, path, &level);
2404 if (ret < 0) {
2405 err = ret;
2406 goto out;
2407 }
2408 if (ret > 0)
2409 break;
2410
2411 if (!find_next_key(path, level, &key) &&
2412 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2413 ret = 0;
5d4f98a2 2414 } else {
3d0174f7 2415 ret = replace_path(trans, rc, root, reloc_root, path,
3fd0a558 2416 &next_key, level, max_level);
5d4f98a2
YZ
2417 }
2418 if (ret < 0) {
2419 err = ret;
2420 goto out;
2421 }
2422
2423 if (ret > 0) {
2424 level = ret;
2425 btrfs_node_key_to_cpu(path->nodes[level], &key,
2426 path->slots[level]);
2427 replaced = 1;
5d4f98a2
YZ
2428 }
2429
2430 ret = walk_up_reloc_tree(reloc_root, path, &level);
2431 if (ret > 0)
2432 break;
2433
2434 BUG_ON(level == 0);
2435 /*
2436 * save the merging progress in the drop_progress.
2437 * this is OK since root refs == 1 in this case.
2438 */
2439 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2440 path->slots[level]);
2441 root_item->drop_level = level;
2442
3a45bb20 2443 btrfs_end_transaction_throttle(trans);
9e6a0c52 2444 trans = NULL;
5d4f98a2 2445
2ff7e61e 2446 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
2447
2448 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2449 invalidate_extent_cache(root, &key, &next_key);
2450 }
2451
2452 /*
 2453 * handle the case where only one block in the fs tree needs to be
 2454 * relocated and that block is the tree root.
2455 */
2456 leaf = btrfs_lock_root_node(root);
2457 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2458 btrfs_tree_unlock(leaf);
2459 free_extent_buffer(leaf);
2460 if (ret < 0)
2461 err = ret;
2462out:
2463 btrfs_free_path(path);
2464
d2311e69
QW
2465 if (err == 0)
2466 insert_dirty_subvol(trans, rc, root);
5d4f98a2 2467
9e6a0c52 2468 if (trans)
3a45bb20 2469 btrfs_end_transaction_throttle(trans);
5d4f98a2 2470
2ff7e61e 2471 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 2472
5d4f98a2
YZ
2473 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2474 invalidate_extent_cache(root, &key, &next_key);
2475
2476 return err;
2477}
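
/*
 * The per-iteration reservation used by merge_reloc_root() above, written
 * out as a standalone helper (illustrative only): one COW chain touches at
 * most BTRFS_MAX_LEVEL - 1 nodes, and both the fs tree and the reloc tree
 * may need one, hence the factor of two.  With the default 16KiB nodesize
 * and BTRFS_MAX_LEVEL == 8 this is 16384 * 7 * 2 = 229376 bytes.
 */
static u64 merge_min_reserved(u32 nodesize, int max_level)
{
	return (u64)nodesize * (max_level - 1) * 2;
}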
2478
3fd0a558
YZ
2479static noinline_for_stack
2480int prepare_to_merge(struct reloc_control *rc, int err)
5d4f98a2 2481{
3fd0a558 2482 struct btrfs_root *root = rc->extent_root;
0b246afa 2483 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 2484 struct btrfs_root *reloc_root;
3fd0a558
YZ
2485 struct btrfs_trans_handle *trans;
2486 LIST_HEAD(reloc_roots);
2487 u64 num_bytes = 0;
2488 int ret;
3fd0a558 2489
0b246afa
JM
2490 mutex_lock(&fs_info->reloc_mutex);
2491 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
3fd0a558 2492 rc->merging_rsv_size += rc->nodes_relocated * 2;
0b246afa 2493 mutex_unlock(&fs_info->reloc_mutex);
7585717f 2494
3fd0a558
YZ
2495again:
2496 if (!err) {
2497 num_bytes = rc->merging_rsv_size;
08e007d2
MX
2498 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2499 BTRFS_RESERVE_FLUSH_ALL);
3fd0a558
YZ
2500 if (ret)
2501 err = ret;
2502 }
2503
7a7eaa40 2504 trans = btrfs_join_transaction(rc->extent_root);
3612b495
TI
2505 if (IS_ERR(trans)) {
2506 if (!err)
2ff7e61e 2507 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 2508 num_bytes, NULL);
3612b495
TI
2509 return PTR_ERR(trans);
2510 }
3fd0a558
YZ
2511
2512 if (!err) {
2513 if (num_bytes != rc->merging_rsv_size) {
3a45bb20 2514 btrfs_end_transaction(trans);
2ff7e61e 2515 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 2516 num_bytes, NULL);
3fd0a558
YZ
2517 goto again;
2518 }
2519 }
5d4f98a2 2520
3fd0a558
YZ
2521 rc->merge_reloc_tree = 1;
2522
2523 while (!list_empty(&rc->reloc_roots)) {
2524 reloc_root = list_entry(rc->reloc_roots.next,
2525 struct btrfs_root, root_list);
2526 list_del_init(&reloc_root->root_list);
5d4f98a2 2527
0b246afa 2528 root = read_fs_root(fs_info, reloc_root->root_key.offset);
5d4f98a2
YZ
2529 BUG_ON(IS_ERR(root));
2530 BUG_ON(root->reloc_root != reloc_root);
2531
3fd0a558
YZ
2532 /*
2533 * set reference count to 1, so btrfs_recover_relocation
 2534 * knows it should resume merging
2535 */
2536 if (!err)
2537 btrfs_set_root_refs(&reloc_root->root_item, 1);
5d4f98a2 2538 btrfs_update_reloc_root(trans, root);
5d4f98a2 2539
3fd0a558 2540 list_add(&reloc_root->root_list, &reloc_roots);
00246528 2541 btrfs_put_root(root);
3fd0a558 2542 }
5d4f98a2 2543
3fd0a558 2544 list_splice(&reloc_roots, &rc->reloc_roots);
5d4f98a2 2545
3fd0a558 2546 if (!err)
3a45bb20 2547 btrfs_commit_transaction(trans);
3fd0a558 2548 else
3a45bb20 2549 btrfs_end_transaction(trans);
3fd0a558 2550 return err;
5d4f98a2
YZ
2551}
2552
aca1bba6
LB
2553static noinline_for_stack
2554void free_reloc_roots(struct list_head *list)
2555{
2556 struct btrfs_root *reloc_root;
2557
2558 while (!list_empty(list)) {
2559 reloc_root = list_entry(list->next, struct btrfs_root,
2560 root_list);
bb166d72 2561 __del_reloc_root(reloc_root);
aca1bba6
LB
2562 }
2563}
2564
3fd0a558 2565static noinline_for_stack
94404e82 2566void merge_reloc_roots(struct reloc_control *rc)
5d4f98a2 2567{
0b246afa 2568 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2 2569 struct btrfs_root *root;
3fd0a558
YZ
2570 struct btrfs_root *reloc_root;
2571 LIST_HEAD(reloc_roots);
2572 int found = 0;
aca1bba6 2573 int ret = 0;
3fd0a558
YZ
2574again:
2575 root = rc->extent_root;
7585717f
CM
2576
2577 /*
2578 * this serializes us with btrfs_record_root_in_transaction,
2579 * we have to make sure nobody is in the middle of
2580 * adding their roots to the list while we are
2581 * doing this splice
2582 */
0b246afa 2583 mutex_lock(&fs_info->reloc_mutex);
3fd0a558 2584 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 2585 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2 2586
3fd0a558
YZ
2587 while (!list_empty(&reloc_roots)) {
2588 found = 1;
2589 reloc_root = list_entry(reloc_roots.next,
2590 struct btrfs_root, root_list);
5d4f98a2 2591
3fd0a558 2592 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
0b246afa 2593 root = read_fs_root(fs_info,
3fd0a558
YZ
2594 reloc_root->root_key.offset);
2595 BUG_ON(IS_ERR(root));
2596 BUG_ON(root->reloc_root != reloc_root);
5d4f98a2 2597
3fd0a558 2598 ret = merge_reloc_root(rc, root);
00246528 2599 btrfs_put_root(root);
b37b39cd 2600 if (ret) {
25e293c2
WS
2601 if (list_empty(&reloc_root->root_list))
2602 list_add_tail(&reloc_root->root_list,
2603 &reloc_roots);
aca1bba6 2604 goto out;
b37b39cd 2605 }
3fd0a558
YZ
2606 } else {
2607 list_del_init(&reloc_root->root_list);
30d40577
QW
2608 /* Don't forget to queue this reloc root for cleanup */
2609 list_add_tail(&reloc_root->reloc_dirty_list,
2610 &rc->dirty_subvol_roots);
3fd0a558 2611 }
5d4f98a2
YZ
2612 }
2613
3fd0a558
YZ
2614 if (found) {
2615 found = 0;
2616 goto again;
2617 }
aca1bba6
LB
2618out:
2619 if (ret) {
0b246afa 2620 btrfs_handle_fs_error(fs_info, ret, NULL);
aca1bba6
LB
2621 if (!list_empty(&reloc_roots))
2622 free_reloc_roots(&reloc_roots);
467bb1d2
WS
2623
2624 /* new reloc root may be added */
0b246afa 2625 mutex_lock(&fs_info->reloc_mutex);
467bb1d2 2626 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 2627 mutex_unlock(&fs_info->reloc_mutex);
467bb1d2
WS
2628 if (!list_empty(&reloc_roots))
2629 free_reloc_roots(&reloc_roots);
aca1bba6
LB
2630 }
2631
7b7b7431
JB
2632 /*
2633 * We used to have
2634 *
2635 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2636 *
2637 * here, but it's wrong. If we fail to start the transaction in
2638 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2639 * have actually been removed from the reloc_root_tree rb tree. This is
2640 * fine because we're bailing here, and we hold a reference on the root
2641 * for the list that holds it, so these roots will be cleaned up when we
2642 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
2643 * will be cleaned up on unmount.
2644 *
2645 * The remaining nodes will be cleaned up by free_reloc_control.
2646 */
5d4f98a2
YZ
2647}
2648
2649static void free_block_list(struct rb_root *blocks)
2650{
2651 struct tree_block *block;
2652 struct rb_node *rb_node;
2653 while ((rb_node = rb_first(blocks))) {
2654 block = rb_entry(rb_node, struct tree_block, rb_node);
2655 rb_erase(rb_node, blocks);
2656 kfree(block);
2657 }
2658}
2659
2660static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2661 struct btrfs_root *reloc_root)
2662{
0b246afa 2663 struct btrfs_fs_info *fs_info = reloc_root->fs_info;
5d4f98a2 2664 struct btrfs_root *root;
442b1ac5 2665 int ret;
5d4f98a2
YZ
2666
2667 if (reloc_root->last_trans == trans->transid)
2668 return 0;
2669
0b246afa 2670 root = read_fs_root(fs_info, reloc_root->root_key.offset);
5d4f98a2
YZ
2671 BUG_ON(IS_ERR(root));
2672 BUG_ON(root->reloc_root != reloc_root);
442b1ac5 2673 ret = btrfs_record_root_in_trans(trans, root);
00246528 2674 btrfs_put_root(root);
5d4f98a2 2675
442b1ac5 2676 return ret;
5d4f98a2
YZ
2677}
2678
3fd0a558
YZ
2679static noinline_for_stack
2680struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2681 struct reloc_control *rc,
a26195a5
QW
2682 struct btrfs_backref_node *node,
2683 struct btrfs_backref_edge *edges[])
5d4f98a2 2684{
a26195a5 2685 struct btrfs_backref_node *next;
5d4f98a2 2686 struct btrfs_root *root;
3fd0a558
YZ
2687 int index = 0;
2688
5d4f98a2
YZ
2689 next = node;
2690 while (1) {
2691 cond_resched();
2692 next = walk_up_backref(next, edges, &index);
2693 root = next->root;
3fd0a558 2694 BUG_ON(!root);
27cdeb70 2695 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
5d4f98a2
YZ
2696
2697 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2698 record_reloc_root_in_trans(trans, root);
2699 break;
2700 }
2701
3fd0a558
YZ
2702 btrfs_record_root_in_trans(trans, root);
2703 root = root->reloc_root;
2704
2705 if (next->new_bytenr != root->node->start) {
2706 BUG_ON(next->new_bytenr);
2707 BUG_ON(!list_empty(&next->list));
2708 next->new_bytenr = root->node->start;
00246528
JB
2709 btrfs_put_root(next->root);
2710 next->root = btrfs_grab_root(root);
0b530bc5 2711 ASSERT(next->root);
3fd0a558
YZ
2712 list_add_tail(&next->list,
2713 &rc->backref_cache.changed);
9569cc20 2714 mark_block_processed(rc, next);
5d4f98a2
YZ
2715 break;
2716 }
2717
3fd0a558 2718 WARN_ON(1);
5d4f98a2
YZ
2719 root = NULL;
2720 next = walk_down_backref(edges, &index);
2721 if (!next || next->level <= node->level)
2722 break;
2723 }
3fd0a558
YZ
2724 if (!root)
2725 return NULL;
5d4f98a2 2726
3fd0a558
YZ
2727 next = node;
2728 /* setup backref node path for btrfs_reloc_cow_block */
2729 while (1) {
2730 rc->backref_cache.path[next->level] = next;
2731 if (--index < 0)
2732 break;
2733 next = edges[index]->node[UPPER];
5d4f98a2 2734 }
5d4f98a2
YZ
2735 return root;
2736}
2737
3fd0a558
YZ
2738/*
2739 * select a tree root for relocation. return NULL if the block
2740 * is reference counted. we should use do_relocation() in this
2741 * case. return a tree root pointer if the block isn't reference
 2742 * counted. return -ENOENT if the block is the root of a reloc tree.
2743 */
5d4f98a2 2744static noinline_for_stack
a26195a5 2745struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
5d4f98a2 2746{
a26195a5 2747 struct btrfs_backref_node *next;
3fd0a558
YZ
2748 struct btrfs_root *root;
2749 struct btrfs_root *fs_root = NULL;
a26195a5 2750 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2751 int index = 0;
2752
2753 next = node;
2754 while (1) {
2755 cond_resched();
2756 next = walk_up_backref(next, edges, &index);
2757 root = next->root;
2758 BUG_ON(!root);
2759
25985edc 2760 /* no other choice for a non-reference counted tree */
27cdeb70 2761 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
3fd0a558
YZ
2762 return root;
2763
2764 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2765 fs_root = root;
2766
2767 if (next != node)
2768 return NULL;
2769
2770 next = walk_down_backref(edges, &index);
2771 if (!next || next->level <= node->level)
2772 break;
2773 }
2774
2775 if (!fs_root)
2776 return ERR_PTR(-ENOENT);
2777 return fs_root;
5d4f98a2
YZ
2778}
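
/*
 * The three-way decision made by select_one_root(), reduced to a sketch
 * with hypothetical boolean inputs (is_ref_counted mirrors the
 * BTRFS_ROOT_REF_COWS test, is_reloc_root mirrors the
 * BTRFS_TREE_RELOC_OBJECTID test) and assuming the kernel's errno values:
 * a non reference counted owner is used directly, a shared block is left
 * to do_relocation(), and a block owned only by reloc trees is reported
 * as -ENOENT.
 */
static int classify_block_owner(int is_ref_counted, int is_reloc_root)
{
	if (!is_ref_counted)
		return 1;		/* relocate through the owning root */
	if (is_reloc_root)
		return -ENOENT;		/* root block of a reloc tree */
	return 0;			/* shared: handled by do_relocation() */
}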
2779
2780static noinline_for_stack
3fd0a558 2781u64 calcu_metadata_size(struct reloc_control *rc,
a26195a5 2782 struct btrfs_backref_node *node, int reserve)
5d4f98a2 2783{
0b246afa 2784 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5
QW
2785 struct btrfs_backref_node *next = node;
2786 struct btrfs_backref_edge *edge;
2787 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2788 u64 num_bytes = 0;
2789 int index = 0;
2790
2791 BUG_ON(reserve && node->processed);
2792
2793 while (next) {
2794 cond_resched();
2795 while (1) {
2796 if (next->processed && (reserve || next != node))
2797 break;
2798
0b246afa 2799 num_bytes += fs_info->nodesize;
3fd0a558
YZ
2800
2801 if (list_empty(&next->upper))
2802 break;
2803
2804 edge = list_entry(next->upper.next,
a26195a5 2805 struct btrfs_backref_edge, list[LOWER]);
3fd0a558
YZ
2806 edges[index++] = edge;
2807 next = edge->node[UPPER];
2808 }
2809 next = walk_down_backref(edges, &index);
2810 }
2811 return num_bytes;
5d4f98a2
YZ
2812}
2813
3fd0a558
YZ
2814static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2815 struct reloc_control *rc,
a26195a5 2816 struct btrfs_backref_node *node)
5d4f98a2 2817{
3fd0a558 2818 struct btrfs_root *root = rc->extent_root;
da17066c 2819 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558
YZ
2820 u64 num_bytes;
2821 int ret;
0647bf56 2822 u64 tmp;
3fd0a558
YZ
2823
2824 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
5d4f98a2 2825
3fd0a558 2826 trans->block_rsv = rc->block_rsv;
0647bf56 2827 rc->reserved_bytes += num_bytes;
8ca17f0f
JB
2828
2829 /*
2830 * We are under a transaction here so we can only do limited flushing.
2831 * If we get an enospc just kick back -EAGAIN so we know to drop the
2832 * transaction and try to refill when we can flush all the things.
2833 */
0647bf56 2834 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
8ca17f0f 2835 BTRFS_RESERVE_FLUSH_LIMIT);
3fd0a558 2836 if (ret) {
da17066c 2837 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
8ca17f0f
JB
2838 while (tmp <= rc->reserved_bytes)
2839 tmp <<= 1;
2840 /*
 2841 * only one thread can access block_rsv at this point,
 2842 * so we don't need to hold a lock to protect block_rsv.
 2843 * we expand the reservation size here to allow enough
52042d8e 2844 * space for relocation and we will return earlier in
8ca17f0f
JB
2845 * enospc case.
2846 */
da17066c
JM
2847 rc->block_rsv->size = tmp + fs_info->nodesize *
2848 RELOCATION_RESERVED_NODES;
8ca17f0f 2849 return -EAGAIN;
5d4f98a2 2850 }
3fd0a558 2851
3fd0a558
YZ
2852 return 0;
2853}
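
/*
 * A standalone model of the block_rsv growth performed above on -ENOSPC
 * (the helper name is illustrative; RELOCATION_RESERVED_NODES is 256).
 * The base batch doubles until it covers what has already been reserved
 * and one extra batch is added on top, so repeated -EAGAIN retries ask for
 * exponentially more space instead of creeping up linearly.
 */
static u64 grow_reloc_rsv(u64 reserved_bytes, u32 nodesize)
{
	u64 batch = (u64)nodesize * 256;	/* RELOCATION_RESERVED_NODES */

	while (batch <= reserved_bytes)
		batch <<= 1;
	return batch + (u64)nodesize * 256;
}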
2854
5d4f98a2
YZ
2855/*
2856 * relocate a block tree, and then update pointers in upper level
2857 * blocks that reference the block to point to the new location.
2858 *
2859 * if called by link_to_upper, the block has already been relocated.
2860 * in that case this function just updates pointers.
2861 */
2862static int do_relocation(struct btrfs_trans_handle *trans,
3fd0a558 2863 struct reloc_control *rc,
a26195a5 2864 struct btrfs_backref_node *node,
5d4f98a2
YZ
2865 struct btrfs_key *key,
2866 struct btrfs_path *path, int lowest)
2867{
2ff7e61e 2868 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5
QW
2869 struct btrfs_backref_node *upper;
2870 struct btrfs_backref_edge *edge;
2871 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
2872 struct btrfs_root *root;
2873 struct extent_buffer *eb;
2874 u32 blocksize;
2875 u64 bytenr;
2876 u64 generation;
5d4f98a2
YZ
2877 int slot;
2878 int ret;
2879 int err = 0;
2880
2881 BUG_ON(lowest && node->eb);
2882
2883 path->lowest_level = node->level + 1;
3fd0a558 2884 rc->backref_cache.path[node->level] = node;
5d4f98a2 2885 list_for_each_entry(edge, &node->upper, list[LOWER]) {
581c1760 2886 struct btrfs_key first_key;
82fa113f 2887 struct btrfs_ref ref = { 0 };
581c1760 2888
5d4f98a2 2889 cond_resched();
5d4f98a2
YZ
2890
2891 upper = edge->node[UPPER];
dc4103f9 2892 root = select_reloc_root(trans, rc, upper, edges);
3fd0a558
YZ
2893 BUG_ON(!root);
2894
2895 if (upper->eb && !upper->locked) {
2896 if (!lowest) {
2897 ret = btrfs_bin_search(upper->eb, key,
2898 upper->level, &slot);
cbca7d59
FM
2899 if (ret < 0) {
2900 err = ret;
2901 goto next;
2902 }
3fd0a558
YZ
2903 BUG_ON(ret);
2904 bytenr = btrfs_node_blockptr(upper->eb, slot);
2905 if (node->eb->start == bytenr)
2906 goto next;
2907 }
5d4f98a2 2908 drop_node_buffer(upper);
3fd0a558 2909 }
5d4f98a2
YZ
2910
2911 if (!upper->eb) {
2912 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3561b9db
LB
2913 if (ret) {
2914 if (ret < 0)
2915 err = ret;
2916 else
2917 err = -ENOENT;
2918
2919 btrfs_release_path(path);
5d4f98a2
YZ
2920 break;
2921 }
5d4f98a2 2922
3fd0a558
YZ
2923 if (!upper->eb) {
2924 upper->eb = path->nodes[upper->level];
2925 path->nodes[upper->level] = NULL;
2926 } else {
2927 BUG_ON(upper->eb != path->nodes[upper->level]);
2928 }
5d4f98a2 2929
3fd0a558
YZ
2930 upper->locked = 1;
2931 path->locks[upper->level] = 0;
5d4f98a2 2932
3fd0a558 2933 slot = path->slots[upper->level];
b3b4aa74 2934 btrfs_release_path(path);
5d4f98a2
YZ
2935 } else {
2936 ret = btrfs_bin_search(upper->eb, key, upper->level,
2937 &slot);
cbca7d59
FM
2938 if (ret < 0) {
2939 err = ret;
2940 goto next;
2941 }
5d4f98a2
YZ
2942 BUG_ON(ret);
2943 }
2944
2945 bytenr = btrfs_node_blockptr(upper->eb, slot);
3fd0a558 2946 if (lowest) {
4547f4d8
LB
2947 if (bytenr != node->bytenr) {
2948 btrfs_err(root->fs_info,
2949 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2950 bytenr, node->bytenr, slot,
2951 upper->eb->start);
2952 err = -EIO;
2953 goto next;
2954 }
5d4f98a2 2955 } else {
3fd0a558
YZ
2956 if (node->eb->start == bytenr)
2957 goto next;
5d4f98a2
YZ
2958 }
2959
da17066c 2960 blocksize = root->fs_info->nodesize;
5d4f98a2 2961 generation = btrfs_node_ptr_generation(upper->eb, slot);
581c1760
QW
2962 btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2963 eb = read_tree_block(fs_info, bytenr, generation,
2964 upper->level - 1, &first_key);
64c043de
LB
2965 if (IS_ERR(eb)) {
2966 err = PTR_ERR(eb);
2967 goto next;
2968 } else if (!extent_buffer_uptodate(eb)) {
416bc658 2969 free_extent_buffer(eb);
97d9a8a4
TI
2970 err = -EIO;
2971 goto next;
2972 }
5d4f98a2 2973 btrfs_tree_lock(eb);
8bead258 2974 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
2975
2976 if (!node->eb) {
2977 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2978 slot, &eb);
3fd0a558
YZ
2979 btrfs_tree_unlock(eb);
2980 free_extent_buffer(eb);
5d4f98a2
YZ
2981 if (ret < 0) {
2982 err = ret;
3fd0a558 2983 goto next;
5d4f98a2 2984 }
3fd0a558 2985 BUG_ON(node->eb != eb);
5d4f98a2
YZ
2986 } else {
2987 btrfs_set_node_blockptr(upper->eb, slot,
2988 node->eb->start);
2989 btrfs_set_node_ptr_generation(upper->eb, slot,
2990 trans->transid);
2991 btrfs_mark_buffer_dirty(upper->eb);
2992
82fa113f
QW
2993 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2994 node->eb->start, blocksize,
2995 upper->eb->start);
2996 ref.real_root = root->root_key.objectid;
2997 btrfs_init_tree_ref(&ref, node->level,
2998 btrfs_header_owner(upper->eb));
2999 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
3000 BUG_ON(ret);
3001
3002 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
3003 BUG_ON(ret);
5d4f98a2 3004 }
3fd0a558
YZ
3005next:
3006 if (!upper->pending)
3007 drop_node_buffer(upper);
3008 else
3009 unlock_node_buffer(upper);
3010 if (err)
3011 break;
5d4f98a2 3012 }
3fd0a558
YZ
3013
3014 if (!err && node->pending) {
3015 drop_node_buffer(node);
3016 list_move_tail(&node->list, &rc->backref_cache.changed);
3017 node->pending = 0;
3018 }
3019
5d4f98a2 3020 path->lowest_level = 0;
3fd0a558 3021 BUG_ON(err == -ENOSPC);
5d4f98a2
YZ
3022 return err;
3023}
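
/*
 * The two modes of do_relocation(), sketched with hypothetical callbacks:
 * a block that has not been relocated yet is moved by COWing it below its
 * parent (the btrfs_cow_block() branch), while a block that already has a
 * relocated copy only needs the parent slot re-pointed, a reference added
 * for the new copy and the old subtree dropped.
 */
struct relocation_ops {
	int (*cow_block_in_place)(u64 old_bytenr);	/* btrfs_cow_block() path */
	int (*repoint_parent_slot)(u64 new_bytenr);	/* set blockptr + inc ref */
	int (*drop_old_subtree)(u64 old_bytenr);	/* btrfs_drop_subtree() path */
};

static int relocate_under_parent(struct relocation_ops *ops,
				 int already_relocated,
				 u64 old_bytenr, u64 new_bytenr)
{
	int ret;

	if (!already_relocated)
		return ops->cow_block_in_place(old_bytenr);

	ret = ops->repoint_parent_slot(new_bytenr);
	if (ret)
		return ret;
	return ops->drop_old_subtree(old_bytenr);
}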
3024
3025static int link_to_upper(struct btrfs_trans_handle *trans,
3fd0a558 3026 struct reloc_control *rc,
a26195a5 3027 struct btrfs_backref_node *node,
5d4f98a2
YZ
3028 struct btrfs_path *path)
3029{
3030 struct btrfs_key key;
5d4f98a2
YZ
3031
3032 btrfs_node_key_to_cpu(node->eb, &key, 0);
3fd0a558 3033 return do_relocation(trans, rc, node, &key, path, 0);
5d4f98a2
YZ
3034}
3035
3036static int finish_pending_nodes(struct btrfs_trans_handle *trans,
3fd0a558
YZ
3037 struct reloc_control *rc,
3038 struct btrfs_path *path, int err)
5d4f98a2 3039{
3fd0a558 3040 LIST_HEAD(list);
a26195a5
QW
3041 struct btrfs_backref_cache *cache = &rc->backref_cache;
3042 struct btrfs_backref_node *node;
5d4f98a2
YZ
3043 int level;
3044 int ret;
5d4f98a2
YZ
3045
3046 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3047 while (!list_empty(&cache->pending[level])) {
3048 node = list_entry(cache->pending[level].next,
a26195a5 3049 struct btrfs_backref_node, list);
3fd0a558
YZ
3050 list_move_tail(&node->list, &list);
3051 BUG_ON(!node->pending);
5d4f98a2 3052
3fd0a558
YZ
3053 if (!err) {
3054 ret = link_to_upper(trans, rc, node, path);
3055 if (ret < 0)
3056 err = ret;
3057 }
5d4f98a2 3058 }
3fd0a558 3059 list_splice_init(&list, &cache->pending[level]);
5d4f98a2 3060 }
5d4f98a2
YZ
3061 return err;
3062}
3063
5d4f98a2
YZ
3064/*
 3065 * mark a block and all blocks that directly/indirectly reference the block
3066 * as processed.
3067 */
3068static void update_processed_blocks(struct reloc_control *rc,
a26195a5 3069 struct btrfs_backref_node *node)
5d4f98a2 3070{
a26195a5
QW
3071 struct btrfs_backref_node *next = node;
3072 struct btrfs_backref_edge *edge;
3073 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
3074 int index = 0;
3075
3076 while (next) {
3077 cond_resched();
3078 while (1) {
3079 if (next->processed)
3080 break;
3081
9569cc20 3082 mark_block_processed(rc, next);
5d4f98a2
YZ
3083
3084 if (list_empty(&next->upper))
3085 break;
3086
3087 edge = list_entry(next->upper.next,
a26195a5 3088 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
3089 edges[index++] = edge;
3090 next = edge->node[UPPER];
3091 }
3092 next = walk_down_backref(edges, &index);
3093 }
3094}
3095
7476dfda 3096static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
3fd0a558 3097{
da17066c 3098 u32 blocksize = rc->extent_root->fs_info->nodesize;
7476dfda 3099
3fd0a558
YZ
3100 if (test_range_bit(&rc->processed_blocks, bytenr,
3101 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
3102 return 1;
3103 return 0;
5d4f98a2
YZ
3104}
3105
2ff7e61e 3106static int get_tree_block_key(struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
3107 struct tree_block *block)
3108{
3109 struct extent_buffer *eb;
3110
581c1760
QW
3111 eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
3112 block->level, NULL);
64c043de
LB
3113 if (IS_ERR(eb)) {
3114 return PTR_ERR(eb);
3115 } else if (!extent_buffer_uptodate(eb)) {
416bc658
JB
3116 free_extent_buffer(eb);
3117 return -EIO;
3118 }
5d4f98a2
YZ
3119 if (block->level == 0)
3120 btrfs_item_key_to_cpu(eb, &block->key, 0);
3121 else
3122 btrfs_node_key_to_cpu(eb, &block->key, 0);
3123 free_extent_buffer(eb);
3124 block->key_ready = 1;
3125 return 0;
3126}
3127
5d4f98a2
YZ
3128/*
3129 * helper function to relocate a tree block
3130 */
3131static int relocate_tree_block(struct btrfs_trans_handle *trans,
3132 struct reloc_control *rc,
a26195a5 3133 struct btrfs_backref_node *node,
5d4f98a2
YZ
3134 struct btrfs_key *key,
3135 struct btrfs_path *path)
3136{
3137 struct btrfs_root *root;
3fd0a558
YZ
3138 int ret = 0;
3139
3140 if (!node)
3141 return 0;
5d4f98a2 3142
5f6b2e5c
JB
3143 /*
3144 * If we fail here we want to drop our backref_node because we are going
3145 * to start over and regenerate the tree for it.
3146 */
3147 ret = reserve_metadata_space(trans, rc, node);
3148 if (ret)
3149 goto out;
3150
3fd0a558 3151 BUG_ON(node->processed);
147d256e 3152 root = select_one_root(node);
3fd0a558 3153 if (root == ERR_PTR(-ENOENT)) {
5d4f98a2 3154 update_processed_blocks(rc, node);
3fd0a558 3155 goto out;
5d4f98a2
YZ
3156 }
3157
3fd0a558 3158 if (root) {
27cdeb70 3159 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3fd0a558
YZ
3160 BUG_ON(node->new_bytenr);
3161 BUG_ON(!list_empty(&node->list));
3162 btrfs_record_root_in_trans(trans, root);
3163 root = root->reloc_root;
3164 node->new_bytenr = root->node->start;
00246528
JB
3165 btrfs_put_root(node->root);
3166 node->root = btrfs_grab_root(root);
0b530bc5 3167 ASSERT(node->root);
3fd0a558
YZ
3168 list_add_tail(&node->list, &rc->backref_cache.changed);
3169 } else {
3170 path->lowest_level = node->level;
3171 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
b3b4aa74 3172 btrfs_release_path(path);
3fd0a558
YZ
3173 if (ret > 0)
3174 ret = 0;
3175 }
3176 if (!ret)
3177 update_processed_blocks(rc, node);
3178 } else {
3179 ret = do_relocation(trans, rc, node, key, path, 1);
3180 }
5d4f98a2 3181out:
0647bf56 3182 if (ret || node->level == 0 || node->cowonly)
3fd0a558 3183 remove_backref_node(&rc->backref_cache, node);
5d4f98a2
YZ
3184 return ret;
3185}
3186
3187/*
3188 * relocate a list of blocks
3189 */
3190static noinline_for_stack
3191int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3192 struct reloc_control *rc, struct rb_root *blocks)
3193{
2ff7e61e 3194 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5 3195 struct btrfs_backref_node *node;
5d4f98a2
YZ
3196 struct btrfs_path *path;
3197 struct tree_block *block;
98ff7b94 3198 struct tree_block *next;
5d4f98a2
YZ
3199 int ret;
3200 int err = 0;
3201
3202 path = btrfs_alloc_path();
e1a12670
LB
3203 if (!path) {
3204 err = -ENOMEM;
34c2b290 3205 goto out_free_blocks;
e1a12670 3206 }
5d4f98a2 3207
98ff7b94
QW
3208 /* Kick in readahead for tree blocks with missing keys */
3209 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
5d4f98a2 3210 if (!block->key_ready)
2ff7e61e 3211 readahead_tree_block(fs_info, block->bytenr);
5d4f98a2
YZ
3212 }
3213
98ff7b94
QW
3214 /* Get first keys */
3215 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
34c2b290 3216 if (!block->key_ready) {
2ff7e61e 3217 err = get_tree_block_key(fs_info, block);
34c2b290
DS
3218 if (err)
3219 goto out_free_path;
3220 }
5d4f98a2
YZ
3221 }
3222
98ff7b94
QW
3223 /* Do tree relocation */
3224 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3fd0a558 3225 node = build_backref_tree(rc, &block->key,
5d4f98a2
YZ
3226 block->level, block->bytenr);
3227 if (IS_ERR(node)) {
3228 err = PTR_ERR(node);
3229 goto out;
3230 }
3231
3232 ret = relocate_tree_block(trans, rc, node, &block->key,
3233 path);
3234 if (ret < 0) {
50dbbb71
JB
3235 err = ret;
3236 break;
5d4f98a2 3237 }
5d4f98a2 3238 }
5d4f98a2 3239out:
3fd0a558 3240 err = finish_pending_nodes(trans, rc, path, err);
5d4f98a2 3241
34c2b290 3242out_free_path:
5d4f98a2 3243 btrfs_free_path(path);
34c2b290 3244out_free_blocks:
e1a12670 3245 free_block_list(blocks);
5d4f98a2
YZ
3246 return err;
3247}
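
/*
 * The three-pass structure of relocate_tree_blocks() above, modeled on a
 * plain array instead of an rb-tree (all helpers are hypothetical):
 * readahead is issued for every block first, so the later key reads and
 * the expensive relocation pass hit warm caches instead of stalling on one
 * block at a time.
 */
struct pending_block {
	u64 bytenr;
	int key_ready;
};

static int process_blocks(struct pending_block *blocks, int nr,
			  void (*readahead)(u64 bytenr),
			  int (*read_key)(struct pending_block *b),
			  int (*relocate)(struct pending_block *b))
{
	int i, ret;

	for (i = 0; i < nr; i++)		/* pass 1: kick off readahead */
		if (!blocks[i].key_ready)
			readahead(blocks[i].bytenr);

	for (i = 0; i < nr; i++) {		/* pass 2: fill in missing keys */
		if (blocks[i].key_ready)
			continue;
		ret = read_key(&blocks[i]);
		if (ret)
			return ret;
	}

	for (i = 0; i < nr; i++) {		/* pass 3: do the relocation */
		ret = relocate(&blocks[i]);
		if (ret)
			return ret;
	}
	return 0;
}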
3248
efa56464
YZ
3249static noinline_for_stack
3250int prealloc_file_extent_cluster(struct inode *inode,
3251 struct file_extent_cluster *cluster)
3252{
3253 u64 alloc_hint = 0;
3254 u64 start;
3255 u64 end;
3256 u64 offset = BTRFS_I(inode)->index_cnt;
3257 u64 num_bytes;
3258 int nr = 0;
3259 int ret = 0;
dcb40c19
WX
3260 u64 prealloc_start = cluster->start - offset;
3261 u64 prealloc_end = cluster->end - offset;
18513091 3262 u64 cur_offset;
364ecf36 3263 struct extent_changeset *data_reserved = NULL;
efa56464
YZ
3264
3265 BUG_ON(cluster->start != cluster->boundary[0]);
5955102c 3266 inode_lock(inode);
efa56464 3267
364ecf36 3268 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
dcb40c19 3269 prealloc_end + 1 - prealloc_start);
efa56464
YZ
3270 if (ret)
3271 goto out;
3272
18513091 3273 cur_offset = prealloc_start;
efa56464
YZ
3274 while (nr < cluster->nr) {
3275 start = cluster->boundary[nr] - offset;
3276 if (nr + 1 < cluster->nr)
3277 end = cluster->boundary[nr + 1] - 1 - offset;
3278 else
3279 end = cluster->end - offset;
3280
d0082371 3281 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
efa56464 3282 num_bytes = end + 1 - start;
18513091 3283 if (cur_offset < start)
bc42bda2
QW
3284 btrfs_free_reserved_data_space(inode, data_reserved,
3285 cur_offset, start - cur_offset);
efa56464
YZ
3286 ret = btrfs_prealloc_file_range(inode, 0, start,
3287 num_bytes, num_bytes,
3288 end + 1, &alloc_hint);
18513091 3289 cur_offset = end + 1;
d0082371 3290 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
efa56464
YZ
3291 if (ret)
3292 break;
3293 nr++;
3294 }
18513091 3295 if (cur_offset < prealloc_end)
bc42bda2
QW
3296 btrfs_free_reserved_data_space(inode, data_reserved,
3297 cur_offset, prealloc_end + 1 - cur_offset);
efa56464 3298out:
5955102c 3299 inode_unlock(inode);
364ecf36 3300 extent_changeset_free(data_reserved);
efa56464
YZ
3301 return ret;
3302}
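
/*
 * How one cluster entry maps to a file range in the data reloc inode, as a
 * sketch of the loop above.  boundary[] and end hold extent bytenrs,
 * offset is the block group start stored in index_cnt; the struct below
 * only mirrors the fields the calculation needs and is not the real
 * file_extent_cluster.
 */
struct cluster_view {
	u64 start;
	u64 end;
	u64 offset;
	int nr;
	u64 boundary[128];	/* MAX_EXTENTS */
};

static void cluster_range(const struct cluster_view *c, int nr,
			  u64 *start, u64 *end)
{
	*start = c->boundary[nr] - c->offset;
	if (nr + 1 < c->nr)
		*end = c->boundary[nr + 1] - 1 - c->offset;
	else
		*end = c->end - c->offset;
}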
3303
5d4f98a2 3304static noinline_for_stack
0257bb82
YZ
3305int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3306 u64 block_start)
3307{
0257bb82
YZ
3308 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3309 struct extent_map *em;
3310 int ret = 0;
3311
172ddd60 3312 em = alloc_extent_map();
0257bb82
YZ
3313 if (!em)
3314 return -ENOMEM;
3315
3316 em->start = start;
3317 em->len = end + 1 - start;
3318 em->block_len = em->len;
3319 em->block_start = block_start;
0257bb82
YZ
3320 set_bit(EXTENT_FLAG_PINNED, &em->flags);
3321
d0082371 3322 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
3323 while (1) {
3324 write_lock(&em_tree->lock);
09a2a8f9 3325 ret = add_extent_mapping(em_tree, em, 0);
0257bb82
YZ
3326 write_unlock(&em_tree->lock);
3327 if (ret != -EEXIST) {
3328 free_extent_map(em);
3329 break;
3330 }
dcdbc059 3331 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
0257bb82 3332 }
d0082371 3333 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
3334 return ret;
3335}
3336
726a3421
QW
3337/*
3338 * Allow error injection to test balance cancellation
3339 */
3340int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
3341{
3342 return atomic_read(&fs_info->balance_cancel_req);
3343}
3344ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
3345
0257bb82
YZ
3346static int relocate_file_extent_cluster(struct inode *inode,
3347 struct file_extent_cluster *cluster)
5d4f98a2 3348{
2ff7e61e 3349 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5d4f98a2
YZ
3350 u64 page_start;
3351 u64 page_end;
0257bb82
YZ
3352 u64 offset = BTRFS_I(inode)->index_cnt;
3353 unsigned long index;
5d4f98a2 3354 unsigned long last_index;
5d4f98a2
YZ
3355 struct page *page;
3356 struct file_ra_state *ra;
3b16a4e3 3357 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
0257bb82 3358 int nr = 0;
5d4f98a2
YZ
3359 int ret = 0;
3360
0257bb82
YZ
3361 if (!cluster->nr)
3362 return 0;
3363
5d4f98a2
YZ
3364 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3365 if (!ra)
3366 return -ENOMEM;
3367
efa56464
YZ
3368 ret = prealloc_file_extent_cluster(inode, cluster);
3369 if (ret)
3370 goto out;
0257bb82 3371
efa56464 3372 file_ra_state_init(ra, inode->i_mapping);
5d4f98a2 3373
0257bb82
YZ
3374 ret = setup_extent_mapping(inode, cluster->start - offset,
3375 cluster->end - offset, cluster->start);
5d4f98a2 3376 if (ret)
efa56464 3377 goto out;
5d4f98a2 3378
09cbfeaf
KS
3379 index = (cluster->start - offset) >> PAGE_SHIFT;
3380 last_index = (cluster->end - offset) >> PAGE_SHIFT;
0257bb82 3381 while (index <= last_index) {
9f3db423
NB
3382 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3383 PAGE_SIZE);
efa56464
YZ
3384 if (ret)
3385 goto out;
3386
0257bb82 3387 page = find_lock_page(inode->i_mapping, index);
5d4f98a2 3388 if (!page) {
0257bb82
YZ
3389 page_cache_sync_readahead(inode->i_mapping,
3390 ra, NULL, index,
3391 last_index + 1 - index);
a94733d0 3392 page = find_or_create_page(inode->i_mapping, index,
3b16a4e3 3393 mask);
0257bb82 3394 if (!page) {
691fa059 3395 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3396 PAGE_SIZE, true);
44db1216 3397 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3398 PAGE_SIZE);
0257bb82 3399 ret = -ENOMEM;
efa56464 3400 goto out;
0257bb82 3401 }
5d4f98a2 3402 }
0257bb82
YZ
3403
3404 if (PageReadahead(page)) {
3405 page_cache_async_readahead(inode->i_mapping,
3406 ra, NULL, page, index,
3407 last_index + 1 - index);
3408 }
3409
5d4f98a2
YZ
3410 if (!PageUptodate(page)) {
3411 btrfs_readpage(NULL, page);
3412 lock_page(page);
3413 if (!PageUptodate(page)) {
3414 unlock_page(page);
09cbfeaf 3415 put_page(page);
691fa059 3416 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3417 PAGE_SIZE, true);
8b62f87b 3418 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3419 PAGE_SIZE);
5d4f98a2 3420 ret = -EIO;
efa56464 3421 goto out;
5d4f98a2
YZ
3422 }
3423 }
5d4f98a2 3424
4eee4fa4 3425 page_start = page_offset(page);
09cbfeaf 3426 page_end = page_start + PAGE_SIZE - 1;
0257bb82 3427
d0082371 3428 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
0257bb82 3429
5d4f98a2
YZ
3430 set_page_extent_mapped(page);
3431
0257bb82
YZ
3432 if (nr < cluster->nr &&
3433 page_start + offset == cluster->boundary[nr]) {
3434 set_extent_bits(&BTRFS_I(inode)->io_tree,
3435 page_start, page_end,
ceeb0ae7 3436 EXTENT_BOUNDARY);
0257bb82
YZ
3437 nr++;
3438 }
5d4f98a2 3439
765f3ceb 3440 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
330a5827 3441 NULL);
765f3ceb
NB
3442 if (ret) {
3443 unlock_page(page);
3444 put_page(page);
3445 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3446 PAGE_SIZE, true);
765f3ceb 3447 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3448 PAGE_SIZE);
765f3ceb
NB
3449
3450 clear_extent_bits(&BTRFS_I(inode)->io_tree,
3451 page_start, page_end,
3452 EXTENT_LOCKED | EXTENT_BOUNDARY);
3453 goto out;
3454
3455 }
5d4f98a2 3456 set_page_dirty(page);
5d4f98a2 3457
0257bb82 3458 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 3459 page_start, page_end);
5d4f98a2 3460 unlock_page(page);
09cbfeaf 3461 put_page(page);
0257bb82
YZ
3462
3463 index++;
8702ba93 3464 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
efa56464 3465 balance_dirty_pages_ratelimited(inode->i_mapping);
2ff7e61e 3466 btrfs_throttle(fs_info);
7f913c7c
QW
3467 if (btrfs_should_cancel_balance(fs_info)) {
3468 ret = -ECANCELED;
3469 goto out;
3470 }
5d4f98a2 3471 }
0257bb82 3472 WARN_ON(nr != cluster->nr);
efa56464 3473out:
5d4f98a2 3474 kfree(ra);
5d4f98a2
YZ
3475 return ret;
3476}
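
/*
 * The page index arithmetic used above, pulled out as a sketch (assuming
 * PAGE_SHIFT == 12, i.e. 4KiB pages, which holds on most architectures):
 * the cluster's bytenr range is first rebased to file offsets inside the
 * data reloc inode by subtracting the block group start, then converted
 * to page cache indexes.
 */
static void cluster_page_range(u64 cluster_start, u64 cluster_end,
			       u64 block_group_start,
			       unsigned long *first_index,
			       unsigned long *last_index)
{
	*first_index = (cluster_start - block_group_start) >> 12;
	*last_index = (cluster_end - block_group_start) >> 12;
}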
3477
3478static noinline_for_stack
0257bb82
YZ
3479int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3480 struct file_extent_cluster *cluster)
5d4f98a2 3481{
0257bb82 3482 int ret;
5d4f98a2 3483
0257bb82
YZ
3484 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3485 ret = relocate_file_extent_cluster(inode, cluster);
3486 if (ret)
3487 return ret;
3488 cluster->nr = 0;
5d4f98a2 3489 }
5d4f98a2 3490
0257bb82
YZ
3491 if (!cluster->nr)
3492 cluster->start = extent_key->objectid;
3493 else
3494 BUG_ON(cluster->nr >= MAX_EXTENTS);
3495 cluster->end = extent_key->objectid + extent_key->offset - 1;
3496 cluster->boundary[cluster->nr] = extent_key->objectid;
3497 cluster->nr++;
3498
3499 if (cluster->nr >= MAX_EXTENTS) {
3500 ret = relocate_file_extent_cluster(inode, cluster);
3501 if (ret)
3502 return ret;
3503 cluster->nr = 0;
3504 }
3505 return 0;
5d4f98a2
YZ
3506}
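
/*
 * The clustering rule of relocate_data_extent() above as a small
 * standalone sketch (flush() stands in for relocate_file_extent_cluster(),
 * and the struct is a stripped-down stand-in for file_extent_cluster):
 * a new extent joins the current cluster only if it starts exactly where
 * the cluster ends, and a full cluster of MAX_EXTENTS boundaries is
 * flushed before more extents are accepted.
 */
struct data_cluster {
	u64 start;
	u64 end;
	int nr;
	u64 boundary[128];	/* MAX_EXTENTS */
};

static int add_extent_to_cluster(struct data_cluster *c,
				 u64 extent_start, u64 extent_len,
				 int (*flush)(struct data_cluster *c))
{
	int ret;

	/* non-contiguous extent: write out what we have so far */
	if (c->nr > 0 && extent_start != c->end + 1) {
		ret = flush(c);
		if (ret)
			return ret;
		c->nr = 0;
	}

	if (!c->nr)
		c->start = extent_start;
	c->end = extent_start + extent_len - 1;
	c->boundary[c->nr++] = extent_start;

	if (c->nr >= 128) {	/* MAX_EXTENTS */
		ret = flush(c);
		if (ret)
			return ret;
		c->nr = 0;
	}
	return 0;
}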
3507
5d4f98a2
YZ
3508/*
3509 * helper to add a tree block to the list.
3510 * the major work is getting the generation and level of the block
3511 */
3512static int add_tree_block(struct reloc_control *rc,
3513 struct btrfs_key *extent_key,
3514 struct btrfs_path *path,
3515 struct rb_root *blocks)
3516{
3517 struct extent_buffer *eb;
3518 struct btrfs_extent_item *ei;
3519 struct btrfs_tree_block_info *bi;
3520 struct tree_block *block;
3521 struct rb_node *rb_node;
3522 u32 item_size;
3523 int level = -1;
7fdf4b60 3524 u64 generation;
5d4f98a2
YZ
3525
3526 eb = path->nodes[0];
3527 item_size = btrfs_item_size_nr(eb, path->slots[0]);
3528
3173a18f
JB
3529 if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3530 item_size >= sizeof(*ei) + sizeof(*bi)) {
5d4f98a2
YZ
3531 ei = btrfs_item_ptr(eb, path->slots[0],
3532 struct btrfs_extent_item);
3173a18f
JB
3533 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3534 bi = (struct btrfs_tree_block_info *)(ei + 1);
3535 level = btrfs_tree_block_level(eb, bi);
3536 } else {
3537 level = (int)extent_key->offset;
3538 }
5d4f98a2 3539 generation = btrfs_extent_generation(eb, ei);
6d8ff4e4 3540 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
3541 btrfs_print_v0_err(eb->fs_info);
3542 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3543 return -EINVAL;
5d4f98a2 3544 } else {
5d4f98a2 3545 BUG();
5d4f98a2
YZ
3546 }
3547
b3b4aa74 3548 btrfs_release_path(path);
5d4f98a2
YZ
3549
3550 BUG_ON(level == -1);
3551
3552 block = kmalloc(sizeof(*block), GFP_NOFS);
3553 if (!block)
3554 return -ENOMEM;
3555
3556 block->bytenr = extent_key->objectid;
da17066c 3557 block->key.objectid = rc->extent_root->fs_info->nodesize;
5d4f98a2
YZ
3558 block->key.offset = generation;
3559 block->level = level;
3560 block->key_ready = 0;
3561
e9a28dc5 3562 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
43c04fb1
JM
3563 if (rb_node)
3564 backref_tree_panic(rb_node, -EEXIST, block->bytenr);
5d4f98a2
YZ
3565
3566 return 0;
3567}
3568
3569/*
3571 * helper to add tree blocks for backrefs of type BTRFS_SHARED_DATA_REF_KEY
3571 */
3572static int __add_tree_block(struct reloc_control *rc,
3573 u64 bytenr, u32 blocksize,
3574 struct rb_root *blocks)
3575{
0b246afa 3576 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3577 struct btrfs_path *path;
3578 struct btrfs_key key;
3579 int ret;
0b246afa 3580 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5d4f98a2 3581
7476dfda 3582 if (tree_block_processed(bytenr, rc))
5d4f98a2
YZ
3583 return 0;
3584
e9a28dc5 3585 if (rb_simple_search(blocks, bytenr))
5d4f98a2
YZ
3586 return 0;
3587
3588 path = btrfs_alloc_path();
3589 if (!path)
3590 return -ENOMEM;
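 /*
  * With the SKINNY_METADATA feature the extent is looked up as a
  * METADATA_ITEM first; if that misses, fall back to the older EXTENT_ITEM
  * layout and retry the search from "again".
  */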
aee68ee5 3591again:
5d4f98a2 3592 key.objectid = bytenr;
aee68ee5
JB
3593 if (skinny) {
3594 key.type = BTRFS_METADATA_ITEM_KEY;
3595 key.offset = (u64)-1;
3596 } else {
3597 key.type = BTRFS_EXTENT_ITEM_KEY;
3598 key.offset = blocksize;
3599 }
5d4f98a2
YZ
3600
3601 path->search_commit_root = 1;
3602 path->skip_locking = 1;
3603 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3604 if (ret < 0)
3605 goto out;
5d4f98a2 3606
aee68ee5
JB
3607 if (ret > 0 && skinny) {
3608 if (path->slots[0]) {
3609 path->slots[0]--;
3610 btrfs_item_key_to_cpu(path->nodes[0], &key,
3611 path->slots[0]);
3612 if (key.objectid == bytenr &&
3613 (key.type == BTRFS_METADATA_ITEM_KEY ||
3614 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3615 key.offset == blocksize)))
3616 ret = 0;
3617 }
3618
3619 if (ret) {
3620 skinny = false;
3621 btrfs_release_path(path);
3622 goto again;
3623 }
3173a18f 3624 }
cdccee99
LB
3625 if (ret) {
3626 ASSERT(ret == 1);
3627 btrfs_print_leaf(path->nodes[0]);
3628 btrfs_err(fs_info,
3629 "tree block extent item (%llu) is not found in extent tree",
3630 bytenr);
3631 WARN_ON(1);
3632 ret = -EINVAL;
3633 goto out;
3634 }
3173a18f 3635
5d4f98a2
YZ
3636 ret = add_tree_block(rc, &key, path, blocks);
3637out:
3638 btrfs_free_path(path);
3639 return ret;
3640}
3641
0af3d00b 3642static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
32da5386 3643 struct btrfs_block_group *block_group,
1bbc621e
CM
3644 struct inode *inode,
3645 u64 ino)
0af3d00b
JB
3646{
3647 struct btrfs_key key;
0af3d00b
JB
3648 struct btrfs_root *root = fs_info->tree_root;
3649 struct btrfs_trans_handle *trans;
0af3d00b
JB
3650 int ret = 0;
3651
3652 if (inode)
3653 goto truncate;
3654
3655 key.objectid = ino;
3656 key.type = BTRFS_INODE_ITEM_KEY;
3657 key.offset = 0;
3658
4c66e0d4 3659 inode = btrfs_iget(fs_info->sb, &key, root);
2e19f1f9 3660 if (IS_ERR(inode))
0af3d00b 3661 return -ENOENT;
0af3d00b
JB
3662
3663truncate:
2ff7e61e 3664 ret = btrfs_check_trunc_cache_free_space(fs_info,
7b61cd92
MX
3665 &fs_info->global_block_rsv);
3666 if (ret)
3667 goto out;
3668
7a7eaa40 3669 trans = btrfs_join_transaction(root);
0af3d00b 3670 if (IS_ERR(trans)) {
3612b495 3671 ret = PTR_ERR(trans);
0af3d00b
JB
3672 goto out;
3673 }
3674
77ab86bf 3675 ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
0af3d00b 3676
3a45bb20 3677 btrfs_end_transaction(trans);
2ff7e61e 3678 btrfs_btree_balance_dirty(fs_info);
0af3d00b
JB
3679out:
3680 iput(inode);
3681 return ret;
3682}
3683
5d4f98a2 3684/*
19b546d7
QW
3685 * Locate the free space cache EXTENT_DATA item in the root tree leaf and delete
3686 * the cache inode, so that free space cache data extents do not block data relocation.
5d4f98a2 3687 */
19b546d7
QW
3688static int delete_v1_space_cache(struct extent_buffer *leaf,
3689 struct btrfs_block_group *block_group,
3690 u64 data_bytenr)
5d4f98a2 3691{
19b546d7
QW
3692 u64 space_cache_ino;
3693 struct btrfs_file_extent_item *ei;
5d4f98a2 3694 struct btrfs_key key;
19b546d7
QW
3695 bool found = false;
3696 int i;
5d4f98a2
YZ
3697 int ret;
3698
19b546d7
QW
3699 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3700 return 0;
5d4f98a2 3701
19b546d7
QW
3702 for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3703 btrfs_item_key_to_cpu(leaf, &key, i);
3704 if (key.type != BTRFS_EXTENT_DATA_KEY)
3705 continue;
3706 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3707 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
3708 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3709 found = true;
3710 space_cache_ino = key.objectid;
5d4f98a2 3711 break;
5d4f98a2 3712 }
5d4f98a2 3713 }
19b546d7
QW
3714 if (!found)
3715 return -ENOENT;
3716 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3717 space_cache_ino);
3718 return ret;
5d4f98a2
YZ
3719}
3720
3721/*
2c016dc2 3722 * helper to find all tree blocks that reference a given data extent
5d4f98a2
YZ
3723 */
3724static noinline_for_stack
3725int add_data_references(struct reloc_control *rc,
3726 struct btrfs_key *extent_key,
3727 struct btrfs_path *path,
3728 struct rb_root *blocks)
3729{
19b546d7
QW
3730 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3731 struct ulist *leaves = NULL;
3732 struct ulist_iterator leaf_uiter;
3733 struct ulist_node *ref_node = NULL;
3734 const u32 blocksize = fs_info->nodesize;
647f63bd 3735 int ret = 0;
5d4f98a2 3736
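 /*
  * Walk the backrefs of this data extent (btrfs_find_all_leafs()) to get
  * every tree leaf that references it.  For each leaf, drop the v1 free
  * space cache inode if the leaf is a root tree leaf pointing at the
  * extent, then queue the leaf itself as a tree block to be relocated.
  */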
19b546d7
QW
3737 btrfs_release_path(path);
3738 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3739 0, &leaves, NULL, true);
3740 if (ret < 0)
3741 return ret;
5d4f98a2 3742
19b546d7
QW
3743 ULIST_ITER_INIT(&leaf_uiter);
3744 while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3745 struct extent_buffer *eb;
5d4f98a2 3746
19b546d7
QW
3747 eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
3748 if (IS_ERR(eb)) {
3749 ret = PTR_ERR(eb);
5d4f98a2 3750 break;
5d4f98a2 3751 }
19b546d7
QW
3752 ret = delete_v1_space_cache(eb, rc->block_group,
3753 extent_key->objectid);
3754 free_extent_buffer(eb);
3755 if (ret < 0)
3756 break;
3757 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3758 if (ret < 0)
5d4f98a2 3759 break;
5d4f98a2 3760 }
19b546d7 3761 if (ret < 0)
5d4f98a2 3762 free_block_list(blocks);
19b546d7
QW
3763 ulist_free(leaves);
3764 return ret;
5d4f98a2
YZ
3765}
3766
3767/*
2c016dc2 3768 * helper to find the next unprocessed extent
5d4f98a2
YZ
3769 */
3770static noinline_for_stack
147d256e 3771int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3fd0a558 3772 struct btrfs_key *extent_key)
5d4f98a2 3773{
0b246afa 3774 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3775 struct btrfs_key key;
3776 struct extent_buffer *leaf;
3777 u64 start, end, last;
3778 int ret;
3779
b3470b5d 3780 last = rc->block_group->start + rc->block_group->length;
5d4f98a2
YZ
3781 while (1) {
3782 cond_resched();
3783 if (rc->search_start >= last) {
3784 ret = 1;
3785 break;
3786 }
3787
3788 key.objectid = rc->search_start;
3789 key.type = BTRFS_EXTENT_ITEM_KEY;
3790 key.offset = 0;
3791
3792 path->search_commit_root = 1;
3793 path->skip_locking = 1;
3794 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3795 0, 0);
3796 if (ret < 0)
3797 break;
3798next:
3799 leaf = path->nodes[0];
3800 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3801 ret = btrfs_next_leaf(rc->extent_root, path);
3802 if (ret != 0)
3803 break;
3804 leaf = path->nodes[0];
3805 }
3806
3807 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3808 if (key.objectid >= last) {
3809 ret = 1;
3810 break;
3811 }
3812
3173a18f
JB
3813 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3814 key.type != BTRFS_METADATA_ITEM_KEY) {
3815 path->slots[0]++;
3816 goto next;
3817 }
3818
3819 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5d4f98a2
YZ
3820 key.objectid + key.offset <= rc->search_start) {
3821 path->slots[0]++;
3822 goto next;
3823 }
3824
3173a18f 3825 if (key.type == BTRFS_METADATA_ITEM_KEY &&
0b246afa 3826 key.objectid + fs_info->nodesize <=
3173a18f
JB
3827 rc->search_start) {
3828 path->slots[0]++;
3829 goto next;
3830 }
3831
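 /*
  * Blocks relocated earlier in this pass are recorded as EXTENT_DIRTY in
  * rc->processed_blocks; if this extent falls into such a range, skip
  * ahead past the processed region.
  */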
5d4f98a2
YZ
3832 ret = find_first_extent_bit(&rc->processed_blocks,
3833 key.objectid, &start, &end,
e6138876 3834 EXTENT_DIRTY, NULL);
5d4f98a2
YZ
3835
3836 if (ret == 0 && start <= key.objectid) {
b3b4aa74 3837 btrfs_release_path(path);
5d4f98a2
YZ
3838 rc->search_start = end + 1;
3839 } else {
3173a18f
JB
3840 if (key.type == BTRFS_EXTENT_ITEM_KEY)
3841 rc->search_start = key.objectid + key.offset;
3842 else
3843 rc->search_start = key.objectid +
0b246afa 3844 fs_info->nodesize;
3fd0a558 3845 memcpy(extent_key, &key, sizeof(key));
5d4f98a2
YZ
3846 return 0;
3847 }
3848 }
b3b4aa74 3849 btrfs_release_path(path);
5d4f98a2
YZ
3850 return ret;
3851}
3852
3853static void set_reloc_control(struct reloc_control *rc)
3854{
3855 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3856
3857 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3858 fs_info->reloc_ctl = rc;
7585717f 3859 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3860}
3861
3862static void unset_reloc_control(struct reloc_control *rc)
3863{
3864 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3865
3866 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3867 fs_info->reloc_ctl = NULL;
7585717f 3868 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3869}
3870
3871static int check_extent_flags(u64 flags)
3872{
3873 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3874 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3875 return 1;
3876 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3877 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3878 return 1;
3879 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3880 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3881 return 1;
3882 return 0;
3883}
3884
3fd0a558
YZ
3885static noinline_for_stack
3886int prepare_to_relocate(struct reloc_control *rc)
3887{
3888 struct btrfs_trans_handle *trans;
ac2fabac 3889 int ret;
3fd0a558 3890
2ff7e61e 3891 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
66d8f3dd 3892 BTRFS_BLOCK_RSV_TEMP);
3fd0a558
YZ
3893 if (!rc->block_rsv)
3894 return -ENOMEM;
3895
3fd0a558 3896 memset(&rc->cluster, 0, sizeof(rc->cluster));
b3470b5d 3897 rc->search_start = rc->block_group->start;
3fd0a558
YZ
3898 rc->extents_found = 0;
3899 rc->nodes_relocated = 0;
3900 rc->merging_rsv_size = 0;
0647bf56 3901 rc->reserved_bytes = 0;
da17066c 3902 rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
0647bf56 3903 RELOCATION_RESERVED_NODES;
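 /*
  * As a rough example (not taken from the code): with the default 16KiB
  * nodesize this asks for 256 * 16KiB = 4MiB of reserved metadata space
  * up front.
  */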
ac2fabac
JB
3904 ret = btrfs_block_rsv_refill(rc->extent_root,
3905 rc->block_rsv, rc->block_rsv->size,
3906 BTRFS_RESERVE_FLUSH_ALL);
3907 if (ret)
3908 return ret;
3fd0a558
YZ
3909
3910 rc->create_reloc_tree = 1;
3911 set_reloc_control(rc);
3912
7a7eaa40 3913 trans = btrfs_join_transaction(rc->extent_root);
28818947
LB
3914 if (IS_ERR(trans)) {
3915 unset_reloc_control(rc);
3916 /*
3917 * The extent tree is not a ref_cow tree and has no reloc_root to
3918 * clean up. Callers are responsible for freeing the block rsv
3919 * allocated above.
3920 */
3921 return PTR_ERR(trans);
3922 }
3a45bb20 3923 btrfs_commit_transaction(trans);
3fd0a558
YZ
3924 return 0;
3925}
76dda93c 3926
5d4f98a2
YZ
3927static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3928{
2ff7e61e 3929 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3930 struct rb_root blocks = RB_ROOT;
3931 struct btrfs_key key;
3932 struct btrfs_trans_handle *trans = NULL;
3933 struct btrfs_path *path;
3934 struct btrfs_extent_item *ei;
5d4f98a2
YZ
3935 u64 flags;
3936 u32 item_size;
3937 int ret;
3938 int err = 0;
c87f08ca 3939 int progress = 0;
5d4f98a2
YZ
3940
3941 path = btrfs_alloc_path();
3fd0a558 3942 if (!path)
5d4f98a2 3943 return -ENOMEM;
e4058b54 3944 path->reada = READA_FORWARD;
5d4f98a2 3945
3fd0a558
YZ
3946 ret = prepare_to_relocate(rc);
3947 if (ret) {
3948 err = ret;
3949 goto out_free;
3950 }
5d4f98a2
YZ
3951
3952 while (1) {
0647bf56
WS
3953 rc->reserved_bytes = 0;
3954 ret = btrfs_block_rsv_refill(rc->extent_root,
3955 rc->block_rsv, rc->block_rsv->size,
3956 BTRFS_RESERVE_FLUSH_ALL);
3957 if (ret) {
3958 err = ret;
3959 break;
3960 }
c87f08ca 3961 progress++;
a22285a6 3962 trans = btrfs_start_transaction(rc->extent_root, 0);
0f788c58
LB
3963 if (IS_ERR(trans)) {
3964 err = PTR_ERR(trans);
3965 trans = NULL;
3966 break;
3967 }
c87f08ca 3968restart:
3fd0a558 3969 if (update_backref_cache(trans, &rc->backref_cache)) {
3a45bb20 3970 btrfs_end_transaction(trans);
42a657f5 3971 trans = NULL;
3fd0a558
YZ
3972 continue;
3973 }
3974
147d256e 3975 ret = find_next_extent(rc, path, &key);
5d4f98a2
YZ
3976 if (ret < 0)
3977 err = ret;
3978 if (ret != 0)
3979 break;
3980
3981 rc->extents_found++;
3982
3983 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3984 struct btrfs_extent_item);
3fd0a558 3985 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
5d4f98a2
YZ
3986 if (item_size >= sizeof(*ei)) {
3987 flags = btrfs_extent_flags(path->nodes[0], ei);
3988 ret = check_extent_flags(flags);
3989 BUG_ON(ret);
6d8ff4e4 3990 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
3991 err = -EINVAL;
3992 btrfs_print_v0_err(trans->fs_info);
3993 btrfs_abort_transaction(trans, err);
3994 break;
5d4f98a2 3995 } else {
5d4f98a2 3996 BUG();
5d4f98a2
YZ
3997 }
3998
3999 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4000 ret = add_tree_block(rc, &key, path, &blocks);
4001 } else if (rc->stage == UPDATE_DATA_PTRS &&
3fd0a558 4002 (flags & BTRFS_EXTENT_FLAG_DATA)) {
5d4f98a2
YZ
4003 ret = add_data_references(rc, &key, path, &blocks);
4004 } else {
b3b4aa74 4005 btrfs_release_path(path);
5d4f98a2
YZ
4006 ret = 0;
4007 }
4008 if (ret < 0) {
3fd0a558 4009 err = ret;
5d4f98a2
YZ
4010 break;
4011 }
4012
4013 if (!RB_EMPTY_ROOT(&blocks)) {
4014 ret = relocate_tree_blocks(trans, rc, &blocks);
4015 if (ret < 0) {
3fd0a558
YZ
4016 if (ret != -EAGAIN) {
4017 err = ret;
4018 break;
4019 }
4020 rc->extents_found--;
4021 rc->search_start = key.objectid;
4022 }
4023 }
4024
3a45bb20 4025 btrfs_end_transaction_throttle(trans);
2ff7e61e 4026 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 4027 trans = NULL;
5d4f98a2
YZ
4028
4029 if (rc->stage == MOVE_DATA_EXTENTS &&
4030 (flags & BTRFS_EXTENT_FLAG_DATA)) {
4031 rc->found_file_extent = 1;
0257bb82 4032 ret = relocate_data_extent(rc->data_inode,
3fd0a558 4033 &key, &rc->cluster);
5d4f98a2
YZ
4034 if (ret < 0) {
4035 err = ret;
4036 break;
4037 }
4038 }
f31ea088
QW
4039 if (btrfs_should_cancel_balance(fs_info)) {
4040 err = -ECANCELED;
4041 break;
4042 }
5d4f98a2 4043 }
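 /*
  * If we ran out of space after making some progress, try to force
  * allocation of a new chunk with the block group's profile and restart
  * the pass; a return value of 1 from btrfs_force_chunk_alloc() means a
  * chunk was allocated.
  */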
c87f08ca 4044 if (trans && progress && err == -ENOSPC) {
43a7e99d 4045 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
9689457b 4046 if (ret == 1) {
c87f08ca
CM
4047 err = 0;
4048 progress = 0;
4049 goto restart;
4050 }
4051 }
3fd0a558 4052
b3b4aa74 4053 btrfs_release_path(path);
91166212 4054 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
5d4f98a2
YZ
4055
4056 if (trans) {
3a45bb20 4057 btrfs_end_transaction_throttle(trans);
2ff7e61e 4058 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
4059 }
4060
0257bb82 4061 if (!err) {
3fd0a558
YZ
4062 ret = relocate_file_extent_cluster(rc->data_inode,
4063 &rc->cluster);
0257bb82
YZ
4064 if (ret < 0)
4065 err = ret;
4066 }
4067
3fd0a558
YZ
4068 rc->create_reloc_tree = 0;
4069 set_reloc_control(rc);
0257bb82 4070
3fd0a558 4071 backref_cache_cleanup(&rc->backref_cache);
63f018be 4072 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2 4073
7f913c7c
QW
4074 /*
4075 * Even when the relocation is cancelled, we should still go
4076 * through prepare_to_merge() and merge_reloc_roots().
4077 *
4078 * On error (including a cancelled balance), prepare_to_merge() will
4079 * mark all reloc trees as orphans and queue them for cleanup in
4080 * merge_reloc_roots().
4081 */
3fd0a558 4082 err = prepare_to_merge(rc, err);
5d4f98a2
YZ
4083
4084 merge_reloc_roots(rc);
4085
3fd0a558 4086 rc->merge_reloc_tree = 0;
5d4f98a2 4087 unset_reloc_control(rc);
63f018be 4088 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2
YZ
4089
4090 /* get rid of pinned extents */
7a7eaa40 4091 trans = btrfs_join_transaction(rc->extent_root);
62b99540 4092 if (IS_ERR(trans)) {
3612b495 4093 err = PTR_ERR(trans);
62b99540
QW
4094 goto out_free;
4095 }
3a45bb20 4096 btrfs_commit_transaction(trans);
6217b0fa 4097out_free:
d2311e69
QW
4098 ret = clean_dirty_subvols(rc);
4099 if (ret < 0 && !err)
4100 err = ret;
2ff7e61e 4101 btrfs_free_block_rsv(fs_info, rc->block_rsv);
3fd0a558 4102 btrfs_free_path(path);
5d4f98a2
YZ
4103 return err;
4104}
4105
4106static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
0257bb82 4107 struct btrfs_root *root, u64 objectid)
5d4f98a2
YZ
4108{
4109 struct btrfs_path *path;
4110 struct btrfs_inode_item *item;
4111 struct extent_buffer *leaf;
4112 int ret;
4113
4114 path = btrfs_alloc_path();
4115 if (!path)
4116 return -ENOMEM;
4117
4118 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4119 if (ret)
4120 goto out;
4121
4122 leaf = path->nodes[0];
4123 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
b159fa28 4124 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
5d4f98a2 4125 btrfs_set_inode_generation(leaf, item, 1);
0257bb82 4126 btrfs_set_inode_size(leaf, item, 0);
5d4f98a2 4127 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3fd0a558
YZ
4128 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4129 BTRFS_INODE_PREALLOC);
5d4f98a2 4130 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
4131out:
4132 btrfs_free_path(path);
4133 return ret;
4134}
4135
4136/*
4137 * helper to create an inode for data relocation.
4138 * the inode lives in the data relocation tree and its link count is 0.
4139 */
3fd0a558
YZ
4140static noinline_for_stack
4141struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
32da5386 4142 struct btrfs_block_group *group)
5d4f98a2
YZ
4143{
4144 struct inode *inode = NULL;
4145 struct btrfs_trans_handle *trans;
4146 struct btrfs_root *root;
4147 struct btrfs_key key;
4624900d 4148 u64 objectid;
5d4f98a2
YZ
4149 int err = 0;
4150
4151 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4152 if (IS_ERR(root))
4153 return ERR_CAST(root);
4154
a22285a6 4155 trans = btrfs_start_transaction(root, 6);
76deacf0 4156 if (IS_ERR(trans)) {
00246528 4157 btrfs_put_root(root);
3fd0a558 4158 return ERR_CAST(trans);
76deacf0 4159 }
5d4f98a2 4160
581bb050 4161 err = btrfs_find_free_objectid(root, &objectid);
5d4f98a2
YZ
4162 if (err)
4163 goto out;
4164
0257bb82 4165 err = __insert_orphan_inode(trans, root, objectid);
5d4f98a2
YZ
4166 BUG_ON(err);
4167
4168 key.objectid = objectid;
4169 key.type = BTRFS_INODE_ITEM_KEY;
4170 key.offset = 0;
4c66e0d4 4171 inode = btrfs_iget(fs_info->sb, &key, root);
2e19f1f9 4172 BUG_ON(IS_ERR(inode));
b3470b5d 4173 BTRFS_I(inode)->index_cnt = group->start;
5d4f98a2 4174
73f2e545 4175 err = btrfs_orphan_add(trans, BTRFS_I(inode));
5d4f98a2 4176out:
00246528 4177 btrfs_put_root(root);
3a45bb20 4178 btrfs_end_transaction(trans);
2ff7e61e 4179 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
4180 if (err) {
4181 if (inode)
4182 iput(inode);
4183 inode = ERR_PTR(err);
4184 }
4185 return inode;
4186}
4187
c258d6e3 4188static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3fd0a558
YZ
4189{
4190 struct reloc_control *rc;
4191
4192 rc = kzalloc(sizeof(*rc), GFP_NOFS);
4193 if (!rc)
4194 return NULL;
4195
4196 INIT_LIST_HEAD(&rc->reloc_roots);
d2311e69 4197 INIT_LIST_HEAD(&rc->dirty_subvol_roots);
584fb121 4198 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3fd0a558 4199 mapping_tree_init(&rc->reloc_root_tree);
43eb5f29
QW
4200 extent_io_tree_init(fs_info, &rc->processed_blocks,
4201 IO_TREE_RELOC_BLOCKS, NULL);
3fd0a558
YZ
4202 return rc;
4203}
4204
1a0afa0e
JB
4205static void free_reloc_control(struct reloc_control *rc)
4206{
4207 struct mapping_node *node, *tmp;
4208
4209 free_reloc_roots(&rc->reloc_roots);
4210 rbtree_postorder_for_each_entry_safe(node, tmp,
4211 &rc->reloc_root_tree.rb_root, rb_node)
4212 kfree(node);
4213
4214 kfree(rc);
4215}
4216
ebce0e01
AB
4217/*
4218 * Print the block group being relocated
4219 */
4220static void describe_relocation(struct btrfs_fs_info *fs_info,
32da5386 4221 struct btrfs_block_group *block_group)
ebce0e01 4222{
f89e09cf 4223 char buf[128] = {'\0'};
ebce0e01 4224
f89e09cf 4225 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
ebce0e01
AB
4226
4227 btrfs_info(fs_info,
4228 "relocating block group %llu flags %s",
b3470b5d 4229 block_group->start, buf);
ebce0e01
AB
4230}
4231
430640e3
QW
4232static const char *stage_to_string(int stage)
4233{
4234 if (stage == MOVE_DATA_EXTENTS)
4235 return "move data extents";
4236 if (stage == UPDATE_DATA_PTRS)
4237 return "update data pointers";
4238 return "unknown";
4239}
4240
5d4f98a2
YZ
4241/*
4242 * function to relocate all extents in a block group.
4243 */
6bccf3ab 4244int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
5d4f98a2 4245{
32da5386 4246 struct btrfs_block_group *bg;
6bccf3ab 4247 struct btrfs_root *extent_root = fs_info->extent_root;
5d4f98a2 4248 struct reloc_control *rc;
0af3d00b
JB
4249 struct inode *inode;
4250 struct btrfs_path *path;
5d4f98a2 4251 int ret;
f0486c68 4252 int rw = 0;
5d4f98a2
YZ
4253 int err = 0;
4254
eede2bf3
OS
4255 bg = btrfs_lookup_block_group(fs_info, group_start);
4256 if (!bg)
4257 return -ENOENT;
4258
4259 if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4260 btrfs_put_block_group(bg);
4261 return -ETXTBSY;
4262 }
4263
c258d6e3 4264 rc = alloc_reloc_control(fs_info);
eede2bf3
OS
4265 if (!rc) {
4266 btrfs_put_block_group(bg);
5d4f98a2 4267 return -ENOMEM;
eede2bf3 4268 }
5d4f98a2 4269
f0486c68 4270 rc->extent_root = extent_root;
eede2bf3 4271 rc->block_group = bg;
5d4f98a2 4272
b12de528 4273 ret = btrfs_inc_block_group_ro(rc->block_group, true);
868f401a
Z
4274 if (ret) {
4275 err = ret;
4276 goto out;
f0486c68 4277 }
868f401a 4278 rw = 1;
f0486c68 4279
0af3d00b
JB
4280 path = btrfs_alloc_path();
4281 if (!path) {
4282 err = -ENOMEM;
4283 goto out;
4284 }
4285
7949f339 4286 inode = lookup_free_space_inode(rc->block_group, path);
0af3d00b
JB
4287 btrfs_free_path(path);
4288
4289 if (!IS_ERR(inode))
1bbc621e 4290 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
0af3d00b
JB
4291 else
4292 ret = PTR_ERR(inode);
4293
4294 if (ret && ret != -ENOENT) {
4295 err = ret;
4296 goto out;
4297 }
4298
5d4f98a2
YZ
4299 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4300 if (IS_ERR(rc->data_inode)) {
4301 err = PTR_ERR(rc->data_inode);
4302 rc->data_inode = NULL;
4303 goto out;
4304 }
4305
0b246afa 4306 describe_relocation(fs_info, rc->block_group);
5d4f98a2 4307
9cfa3e34 4308 btrfs_wait_block_group_reservations(rc->block_group);
f78c436c 4309 btrfs_wait_nocow_writers(rc->block_group);
6374e57a 4310 btrfs_wait_ordered_roots(fs_info, U64_MAX,
b3470b5d
DS
4311 rc->block_group->start,
4312 rc->block_group->length);
5d4f98a2
YZ
4313
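 /*
  * Data is relocated in up to two stages: MOVE_DATA_EXTENTS copies the
  * data into newly allocated extents, then UPDATE_DATA_PTRS rewrites the
  * file extent items to point at the new locations.  Keep looping until a
  * pass finds no more extents to process.
  */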
4314 while (1) {
430640e3
QW
4315 int finishes_stage;
4316
76dda93c 4317 mutex_lock(&fs_info->cleaner_mutex);
5d4f98a2 4318 ret = relocate_block_group(rc);
76dda93c 4319 mutex_unlock(&fs_info->cleaner_mutex);
ff612ba7 4320 if (ret < 0)
5d4f98a2 4321 err = ret;
5d4f98a2 4322
430640e3 4323 finishes_stage = rc->stage;
ff612ba7
JB
4324 /*
4325 * We may have gotten ENOSPC after we already dirtied some
4326 * extents. If writeout happens while we're relocating a
4327 * different block group we could end up hitting the
4328 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4329 * btrfs_reloc_cow_block. Make sure we write everything out
4330 * properly so we don't trip over this problem, and then break
4331 * out of the loop if we hit an error.
4332 */
5d4f98a2 4333 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
0ef8b726
JB
4334 ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4335 (u64)-1);
ff612ba7 4336 if (ret)
0ef8b726 4337 err = ret;
5d4f98a2
YZ
4338 invalidate_mapping_pages(rc->data_inode->i_mapping,
4339 0, -1);
4340 rc->stage = UPDATE_DATA_PTRS;
5d4f98a2 4341 }
ff612ba7
JB
4342
4343 if (err < 0)
4344 goto out;
4345
4346 if (rc->extents_found == 0)
4347 break;
4348
430640e3
QW
4349 btrfs_info(fs_info, "found %llu extents, stage: %s",
4350 rc->extents_found, stage_to_string(finishes_stage));
5d4f98a2
YZ
4351 }
4352
5d4f98a2
YZ
4353 WARN_ON(rc->block_group->pinned > 0);
4354 WARN_ON(rc->block_group->reserved > 0);
bf38be65 4355 WARN_ON(rc->block_group->used > 0);
5d4f98a2 4356out:
f0486c68 4357 if (err && rw)
2ff7e61e 4358 btrfs_dec_block_group_ro(rc->block_group);
5d4f98a2 4359 iput(rc->data_inode);
5d4f98a2 4360 btrfs_put_block_group(rc->block_group);
1a0afa0e 4361 free_reloc_control(rc);
5d4f98a2
YZ
4362 return err;
4363}
4364
76dda93c
YZ
4365static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4366{
0b246afa 4367 struct btrfs_fs_info *fs_info = root->fs_info;
76dda93c 4368 struct btrfs_trans_handle *trans;
79787eaa 4369 int ret, err;
76dda93c 4370
0b246afa 4371 trans = btrfs_start_transaction(fs_info->tree_root, 0);
79787eaa
JM
4372 if (IS_ERR(trans))
4373 return PTR_ERR(trans);
76dda93c
YZ
4374
4375 memset(&root->root_item.drop_progress, 0,
4376 sizeof(root->root_item.drop_progress));
4377 root->root_item.drop_level = 0;
4378 btrfs_set_root_refs(&root->root_item, 0);
0b246afa 4379 ret = btrfs_update_root(trans, fs_info->tree_root,
76dda93c 4380 &root->root_key, &root->root_item);
76dda93c 4381
3a45bb20 4382 err = btrfs_end_transaction(trans);
79787eaa
JM
4383 if (err)
4384 return err;
4385 return ret;
76dda93c
YZ
4386}
4387
5d4f98a2
YZ
4388/*
4389 * recover a relocation interrupted by a system crash.
4390 *
4391 * this function resumes merging reloc trees with their corresponding fs
4392 * trees. this is important for preserving the sharing of tree blocks.
4393 */
4394int btrfs_recover_relocation(struct btrfs_root *root)
4395{
0b246afa 4396 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
4397 LIST_HEAD(reloc_roots);
4398 struct btrfs_key key;
4399 struct btrfs_root *fs_root;
4400 struct btrfs_root *reloc_root;
4401 struct btrfs_path *path;
4402 struct extent_buffer *leaf;
4403 struct reloc_control *rc = NULL;
4404 struct btrfs_trans_handle *trans;
4405 int ret;
4406 int err = 0;
4407
4408 path = btrfs_alloc_path();
4409 if (!path)
4410 return -ENOMEM;
e4058b54 4411 path->reada = READA_BACK;
5d4f98a2
YZ
4412
4413 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4414 key.type = BTRFS_ROOT_ITEM_KEY;
4415 key.offset = (u64)-1;
4416
4417 while (1) {
0b246afa 4418 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
5d4f98a2
YZ
4419 path, 0, 0);
4420 if (ret < 0) {
4421 err = ret;
4422 goto out;
4423 }
4424 if (ret > 0) {
4425 if (path->slots[0] == 0)
4426 break;
4427 path->slots[0]--;
4428 }
4429 leaf = path->nodes[0];
4430 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 4431 btrfs_release_path(path);
5d4f98a2
YZ
4432
4433 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4434 key.type != BTRFS_ROOT_ITEM_KEY)
4435 break;
4436
3dbf1738 4437 reloc_root = btrfs_read_tree_root(root, &key);
5d4f98a2
YZ
4438 if (IS_ERR(reloc_root)) {
4439 err = PTR_ERR(reloc_root);
4440 goto out;
4441 }
4442
3dbf1738 4443 set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
5d4f98a2
YZ
4444 list_add(&reloc_root->root_list, &reloc_roots);
4445
4446 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
0b246afa 4447 fs_root = read_fs_root(fs_info,
5d4f98a2
YZ
4448 reloc_root->root_key.offset);
4449 if (IS_ERR(fs_root)) {
76dda93c
YZ
4450 ret = PTR_ERR(fs_root);
4451 if (ret != -ENOENT) {
4452 err = ret;
4453 goto out;
4454 }
79787eaa
JM
4455 ret = mark_garbage_root(reloc_root);
4456 if (ret < 0) {
4457 err = ret;
4458 goto out;
4459 }
932fd26d 4460 } else {
00246528 4461 btrfs_put_root(fs_root);
5d4f98a2
YZ
4462 }
4463 }
4464
4465 if (key.offset == 0)
4466 break;
4467
4468 key.offset--;
4469 }
b3b4aa74 4470 btrfs_release_path(path);
5d4f98a2
YZ
4471
4472 if (list_empty(&reloc_roots))
4473 goto out;
4474
c258d6e3 4475 rc = alloc_reloc_control(fs_info);
5d4f98a2
YZ
4476 if (!rc) {
4477 err = -ENOMEM;
4478 goto out;
4479 }
4480
0b246afa 4481 rc->extent_root = fs_info->extent_root;
5d4f98a2
YZ
4482
4483 set_reloc_control(rc);
4484
7a7eaa40 4485 trans = btrfs_join_transaction(rc->extent_root);
3612b495 4486 if (IS_ERR(trans)) {
3612b495 4487 err = PTR_ERR(trans);
fb2d83ee 4488 goto out_unset;
3612b495 4489 }
3fd0a558
YZ
4490
4491 rc->merge_reloc_tree = 1;
4492
5d4f98a2
YZ
4493 while (!list_empty(&reloc_roots)) {
4494 reloc_root = list_entry(reloc_roots.next,
4495 struct btrfs_root, root_list);
4496 list_del(&reloc_root->root_list);
4497
4498 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4499 list_add_tail(&reloc_root->root_list,
4500 &rc->reloc_roots);
4501 continue;
4502 }
4503
0b246afa 4504 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
79787eaa
JM
4505 if (IS_ERR(fs_root)) {
4506 err = PTR_ERR(fs_root);
ca1aa281 4507 list_add_tail(&reloc_root->root_list, &reloc_roots);
1402d17d 4508 btrfs_end_transaction(trans);
fb2d83ee 4509 goto out_unset;
79787eaa 4510 }
5d4f98a2 4511
ffd7b339 4512 err = __add_reloc_root(reloc_root);
79787eaa 4513 BUG_ON(err < 0); /* -ENOMEM or logic error */
f44deb74 4514 fs_root->reloc_root = btrfs_grab_root(reloc_root);
00246528 4515 btrfs_put_root(fs_root);
5d4f98a2
YZ
4516 }
4517
3a45bb20 4518 err = btrfs_commit_transaction(trans);
79787eaa 4519 if (err)
fb2d83ee 4520 goto out_unset;
5d4f98a2
YZ
4521
4522 merge_reloc_roots(rc);
4523
4524 unset_reloc_control(rc);
4525
7a7eaa40 4526 trans = btrfs_join_transaction(rc->extent_root);
62b99540 4527 if (IS_ERR(trans)) {
3612b495 4528 err = PTR_ERR(trans);
6217b0fa 4529 goto out_clean;
62b99540 4530 }
3a45bb20 4531 err = btrfs_commit_transaction(trans);
6217b0fa 4532out_clean:
d2311e69
QW
4533 ret = clean_dirty_subvols(rc);
4534 if (ret < 0 && !err)
4535 err = ret;
fb2d83ee
JB
4536out_unset:
4537 unset_reloc_control(rc);
1a0afa0e 4538 free_reloc_control(rc);
3612b495 4539out:
aca1bba6
LB
4540 if (!list_empty(&reloc_roots))
4541 free_reloc_roots(&reloc_roots);
4542
5d4f98a2
YZ
4543 btrfs_free_path(path);
4544
4545 if (err == 0) {
4546 /* cleanup orphan inode in data relocation tree */
0b246afa 4547 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
932fd26d 4548 if (IS_ERR(fs_root)) {
5d4f98a2 4549 err = PTR_ERR(fs_root);
932fd26d 4550 } else {
9f583209 4551 err = btrfs_orphan_cleanup(fs_root);
00246528 4552 btrfs_put_root(fs_root);
932fd26d 4553 }
5d4f98a2
YZ
4554 }
4555 return err;
4556}
4557
4558/*
4559 * helper to add ordered checksums for data relocation.
4560 *
4561 * Cloning the checksums properly handles nodatasum extents and also
4562 * saves the CPU time of re-calculating them.
4563 */
4564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4565{
0b246afa 4566 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5d4f98a2 4567 struct btrfs_ordered_sum *sums;
5d4f98a2 4568 struct btrfs_ordered_extent *ordered;
5d4f98a2
YZ
4569 int ret;
4570 u64 disk_bytenr;
4577b014 4571 u64 new_bytenr;
5d4f98a2
YZ
4572 LIST_HEAD(list);
4573
4574 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
bffe633e 4575 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
5d4f98a2
YZ
4576
4577 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
0b246afa 4578 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
a2de733c 4579 disk_bytenr + len - 1, &list, 0);
79787eaa
JM
4580 if (ret)
4581 goto out;
5d4f98a2
YZ
4582
4583 while (!list_empty(&list)) {
4584 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4585 list_del_init(&sums->list);
4586
4577b014
JB
4587 /*
4588 * We need to offset the new_bytenr based on where the csum is.
4589 * We need to do this because we will read in entire prealloc
4590 * extents but we may have written to, say, the middle of the
4591 * prealloc extent, so we need to make sure the csum goes with
4592 * the right disk offset.
4593 *
4594 * We can do this because the data reloc inode refers strictly
4595 * to the on disk bytes, so we don't have to worry about
4596 * disk_len vs real len like with real inodes since it's all
4597 * disk length.
4598 */
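 /*
  * Illustrative numbers only: if disk_bytenr is 1M, ordered->disk_bytenr
  * is 4M and this sum covers bytenr 1M + 64K, it is rebased to 4M + 64K,
  * i.e. the same offset within the newly written extent.
  */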
bffe633e 4599 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
4577b014 4600 sums->bytenr = new_bytenr;
5d4f98a2 4601
f9756261 4602 btrfs_add_ordered_sum(ordered, sums);
5d4f98a2 4603 }
79787eaa 4604out:
5d4f98a2 4605 btrfs_put_ordered_extent(ordered);
411fc6bc 4606 return ret;
5d4f98a2 4607}
3fd0a558 4608
83d4cfd4
JB
4609int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4610 struct btrfs_root *root, struct extent_buffer *buf,
4611 struct extent_buffer *cow)
3fd0a558 4612{
0b246afa 4613 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 4614 struct reloc_control *rc;
a26195a5 4615 struct btrfs_backref_node *node;
3fd0a558
YZ
4616 int first_cow = 0;
4617 int level;
83d4cfd4 4618 int ret = 0;
3fd0a558 4619
0b246afa 4620 rc = fs_info->reloc_ctl;
3fd0a558 4621 if (!rc)
83d4cfd4 4622 return 0;
3fd0a558
YZ
4623
4624 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4625 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4626
4627 level = btrfs_header_level(buf);
4628 if (btrfs_header_generation(buf) <=
4629 btrfs_root_last_snapshot(&root->root_item))
4630 first_cow = 1;
4631
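 /*
  * While reloc trees are being built, COWing a block that is tracked in
  * the backref cache must repoint the cache node at the new copy and, if
  * needed, queue it on the pending list for later processing.
  */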
4632 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4633 rc->create_reloc_tree) {
4634 WARN_ON(!first_cow && level == 0);
4635
4636 node = rc->backref_cache.path[level];
4637 BUG_ON(node->bytenr != buf->start &&
4638 node->new_bytenr != buf->start);
4639
4640 drop_node_buffer(node);
67439dad 4641 atomic_inc(&cow->refs);
3fd0a558
YZ
4642 node->eb = cow;
4643 node->new_bytenr = cow->start;
4644
4645 if (!node->pending) {
4646 list_move_tail(&node->list,
4647 &rc->backref_cache.pending[level]);
4648 node->pending = 1;
4649 }
4650
4651 if (first_cow)
9569cc20 4652 mark_block_processed(rc, node);
3fd0a558
YZ
4653
4654 if (first_cow && level > 0)
4655 rc->nodes_relocated += buf->len;
4656 }
4657
83d4cfd4 4658 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
3fd0a558 4659 ret = replace_file_extents(trans, rc, root, cow);
83d4cfd4 4660 return ret;
3fd0a558
YZ
4661}
4662
4663/*
4664 * called before creating a snapshot. it calculates the metadata
01327610 4665 * reservation required for relocating tree blocks in the snapshot.
3fd0a558 4666 */
147d256e 4667void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
3fd0a558
YZ
4668 u64 *bytes_to_reserve)
4669{
10995c04
QW
4670 struct btrfs_root *root = pending->root;
4671 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558 4672
6282675e 4673 if (!rc || !have_reloc_root(root))
3fd0a558
YZ
4674 return;
4675
3fd0a558
YZ
4676 if (!rc->merge_reloc_tree)
4677 return;
4678
4679 root = root->reloc_root;
4680 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4681 /*
4682 * relocation is in the stage of merging trees. in the worst
4683 * case, the space used by merging a reloc tree is twice the
4684 * size of the relocated tree nodes: half for cowing
4685 * the reloc tree, half for cowing the fs tree. the space
4686 * used by cowing the reloc tree will be freed after the
4687 * tree is dropped. if we create a snapshot, cowing the fs
4688 * tree may use more space than it frees, so we need
4689 * to reserve extra space.
4690 */
4691 *bytes_to_reserve += rc->nodes_relocated;
4692}
4693
4694/*
4695 * called after a snapshot is created. migrate the block reservation
4696 * and create a reloc root for the newly created snapshot.
f44deb74
JB
4697 *
4698 * This is similar to btrfs_init_reloc_root(); we come out of here with two
4699 * references held on the reloc_root, one for root->reloc_root and one for
4700 * rc->reloc_roots.
3fd0a558 4701 */
49b25e05 4702int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3fd0a558
YZ
4703 struct btrfs_pending_snapshot *pending)
4704{
4705 struct btrfs_root *root = pending->root;
4706 struct btrfs_root *reloc_root;
4707 struct btrfs_root *new_root;
10995c04 4708 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558
YZ
4709 int ret;
4710
6282675e 4711 if (!rc || !have_reloc_root(root))
49b25e05 4712 return 0;
3fd0a558
YZ
4713
4714 rc = root->fs_info->reloc_ctl;
4715 rc->merging_rsv_size += rc->nodes_relocated;
4716
4717 if (rc->merge_reloc_tree) {
4718 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4719 rc->block_rsv,
3a584174 4720 rc->nodes_relocated, true);
49b25e05
JM
4721 if (ret)
4722 return ret;
3fd0a558
YZ
4723 }
4724
4725 new_root = pending->snap;
4726 reloc_root = create_reloc_root(trans, root->reloc_root,
4727 new_root->root_key.objectid);
49b25e05
JM
4728 if (IS_ERR(reloc_root))
4729 return PTR_ERR(reloc_root);
3fd0a558 4730
ffd7b339
JM
4731 ret = __add_reloc_root(reloc_root);
4732 BUG_ON(ret < 0);
f44deb74 4733 new_root->reloc_root = btrfs_grab_root(reloc_root);
3fd0a558 4734
49b25e05 4735 if (rc->create_reloc_tree)
3fd0a558 4736 ret = clone_backref_node(trans, rc, root, reloc_root);
49b25e05 4737 return ret;
3fd0a558 4738}