btrfs: backref: move btrfs_backref_(node|edge|cache) structures to backref.h
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
5d4f98a2
YZ
2/*
3 * Copyright (C) 2009 Oracle. All rights reserved.
5d4f98a2
YZ
4 */
5
6#include <linux/sched.h>
7#include <linux/pagemap.h>
8#include <linux/writeback.h>
9#include <linux/blkdev.h>
10#include <linux/rbtree.h>
5a0e3ad6 11#include <linux/slab.h>
726a3421 12#include <linux/error-injection.h>
5d4f98a2
YZ
13#include "ctree.h"
14#include "disk-io.h"
15#include "transaction.h"
16#include "volumes.h"
17#include "locking.h"
18#include "btrfs_inode.h"
19#include "async-thread.h"
0af3d00b 20#include "free-space-cache.h"
581bb050 21#include "inode-map.h"
62b99540 22#include "qgroup.h"
cdccee99 23#include "print-tree.h"
86736342 24#include "delalloc-space.h"
aac0023c 25#include "block-group.h"
19b546d7 26#include "backref.h"
5d4f98a2 27
0c891389
QW
28/*
29 * Relocation overview
30 *
31 * [What does relocation do]
32 *
33 * The objective of relocation is to relocate all extents of the target block
34 * group to other block groups.
 35 * This is used by resize (shrink only), profile conversion, compaction of
 36 * space, and the balance routine to spread chunks over devices.
37 *
38 * Before | After
39 * ------------------------------------------------------------------
40 * BG A: 10 data extents | BG A: deleted
41 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
 42 * BG C: 1 data extent | BG C: 3 data extents (1 old + 2 relocated)
43 *
44 * [How does relocation work]
45 *
46 * 1. Mark the target block group read-only
47 * New extents won't be allocated from the target block group.
48 *
49 * 2.1 Record each extent in the target block group
50 * To build a proper map of extents to be relocated.
51 *
52 * 2.2 Build data reloc tree and reloc trees
53 * Data reloc tree will contain an inode, recording all newly relocated
54 * data extents.
55 * There will be only one data reloc tree for one data block group.
56 *
57 * Reloc tree will be a special snapshot of its source tree, containing
58 * relocated tree blocks.
 59 * Each tree referring to a tree block in the target block group will get its
60 * reloc tree built.
61 *
62 * 2.3 Swap source tree with its corresponding reloc tree
63 * Each involved tree only refers to new extents after swap.
64 *
65 * 3. Cleanup reloc trees and data reloc tree.
66 * As old extents in the target block group are still referenced by reloc
67 * trees, we need to clean them up before really freeing the target block
68 * group.
69 *
70 * The main complexity is in steps 2.2 and 2.3.
71 *
72 * The entry point of relocation is relocate_block_group() function.
73 */
74
5d4f98a2 75/*
a26195a5 76 * btrfs_backref_node, mapping_node and tree_block start with this
5d4f98a2
YZ
77 */
78struct tree_entry {
79 struct rb_node rb_node;
80 u64 bytenr;
81};
82
2a979612 83#define RELOCATION_RESERVED_NODES 256
5d4f98a2
YZ
84/*
85 * map address of tree root to tree
86 */
87struct mapping_node {
88 struct rb_node rb_node;
89 u64 bytenr;
90 void *data;
91};
92
93struct mapping_tree {
94 struct rb_root rb_root;
95 spinlock_t lock;
96};
97
98/*
 99 * represents a tree block to process
100 */
101struct tree_block {
102 struct rb_node rb_node;
103 u64 bytenr;
104 struct btrfs_key key;
105 unsigned int level:8;
106 unsigned int key_ready:1;
107};
108
0257bb82
YZ
109#define MAX_EXTENTS 128
110
111struct file_extent_cluster {
112 u64 start;
113 u64 end;
114 u64 boundary[MAX_EXTENTS];
115 unsigned int nr;
116};
117
5d4f98a2
YZ
118struct reloc_control {
119 /* block group to relocate */
32da5386 120 struct btrfs_block_group *block_group;
5d4f98a2
YZ
121 /* extent tree */
122 struct btrfs_root *extent_root;
123 /* inode for moving data */
124 struct inode *data_inode;
3fd0a558
YZ
125
126 struct btrfs_block_rsv *block_rsv;
127
a26195a5 128 struct btrfs_backref_cache backref_cache;
3fd0a558
YZ
129
130 struct file_extent_cluster cluster;
5d4f98a2
YZ
 131 /* tree blocks that have been processed */
132 struct extent_io_tree processed_blocks;
133 /* map start of tree root to corresponding reloc tree */
134 struct mapping_tree reloc_root_tree;
135 /* list of reloc trees */
136 struct list_head reloc_roots;
d2311e69
QW
137 /* list of subvolume trees that get relocated */
138 struct list_head dirty_subvol_roots;
3fd0a558
YZ
139 /* size of metadata reservation for merging reloc trees */
140 u64 merging_rsv_size;
141 /* size of relocated tree nodes */
142 u64 nodes_relocated;
0647bf56
WS
 143 /* reserved size for block group relocation */
144 u64 reserved_bytes;
3fd0a558 145
5d4f98a2
YZ
146 u64 search_start;
147 u64 extents_found;
3fd0a558 148
3fd0a558
YZ
149 unsigned int stage:8;
150 unsigned int create_reloc_tree:1;
151 unsigned int merge_reloc_tree:1;
5d4f98a2 152 unsigned int found_file_extent:1;
5d4f98a2
YZ
153};
154
155/* stages of data relocation */
156#define MOVE_DATA_EXTENTS 0
157#define UPDATE_DATA_PTRS 1
158
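/*
 * Illustrative sketch only (not the real relocate_block_group()): the
 * relocation overview above boils down to driving a reloc_control through
 * the two stages defined here.  The helper name is hypothetical; only
 * struct reloc_control, MOVE_DATA_EXTENTS and UPDATE_DATA_PTRS come from
 * this file.
 */
static inline void example_relocation_passes(struct reloc_control *rc)
{
	/* Pass 1: copy every data extent into the data reloc inode */
	rc->stage = MOVE_DATA_EXTENTS;
	/* ... relocate the recorded extents of the block group ... */

	/* Pass 2: rewrite metadata so it points at the relocated copies */
	rc->stage = UPDATE_DATA_PTRS;
	/* ... build reloc trees, swap them in, then clean them up ... */
}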
a26195a5
QW
159static void remove_backref_node(struct btrfs_backref_cache *cache,
160 struct btrfs_backref_node *node);
9569cc20
QW
161
162static void mark_block_processed(struct reloc_control *rc,
a26195a5 163 struct btrfs_backref_node *node)
9569cc20
QW
164{
165 u32 blocksize;
166
167 if (node->level == 0 ||
168 in_range(node->bytenr, rc->block_group->start,
169 rc->block_group->length)) {
170 blocksize = rc->extent_root->fs_info->nodesize;
171 set_extent_bits(&rc->processed_blocks, node->bytenr,
172 node->bytenr + blocksize - 1, EXTENT_DIRTY);
173 }
174 node->processed = 1;
175}
176
5d4f98a2
YZ
177
178static void mapping_tree_init(struct mapping_tree *tree)
179{
6bef4d31 180 tree->rb_root = RB_ROOT;
5d4f98a2
YZ
181 spin_lock_init(&tree->lock);
182}
183
33a0f1f7 184static void backref_cache_init(struct btrfs_fs_info *fs_info,
a26195a5 185 struct btrfs_backref_cache *cache, int is_reloc)
5d4f98a2
YZ
186{
187 int i;
6bef4d31 188 cache->rb_root = RB_ROOT;
5d4f98a2
YZ
189 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
190 INIT_LIST_HEAD(&cache->pending[i]);
3fd0a558
YZ
191 INIT_LIST_HEAD(&cache->changed);
192 INIT_LIST_HEAD(&cache->detached);
193 INIT_LIST_HEAD(&cache->leaves);
84780289
QW
194 INIT_LIST_HEAD(&cache->pending_edge);
195 INIT_LIST_HEAD(&cache->useless_node);
33a0f1f7 196 cache->fs_info = fs_info;
2433bea5 197 cache->is_reloc = is_reloc;
3fd0a558
YZ
198}
199
a26195a5 200static void backref_cache_cleanup(struct btrfs_backref_cache *cache)
3fd0a558 201{
a26195a5 202 struct btrfs_backref_node *node;
3fd0a558
YZ
203 int i;
204
205 while (!list_empty(&cache->detached)) {
206 node = list_entry(cache->detached.next,
a26195a5 207 struct btrfs_backref_node, list);
3fd0a558
YZ
208 remove_backref_node(cache, node);
209 }
210
211 while (!list_empty(&cache->leaves)) {
212 node = list_entry(cache->leaves.next,
a26195a5 213 struct btrfs_backref_node, lower);
3fd0a558
YZ
214 remove_backref_node(cache, node);
215 }
216
217 cache->last_trans = 0;
218
219 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
f4907095 220 ASSERT(list_empty(&cache->pending[i]));
84780289
QW
221 ASSERT(list_empty(&cache->pending_edge));
222 ASSERT(list_empty(&cache->useless_node));
f4907095
LB
223 ASSERT(list_empty(&cache->changed));
224 ASSERT(list_empty(&cache->detached));
225 ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
226 ASSERT(!cache->nr_nodes);
227 ASSERT(!cache->nr_edges);
3fd0a558
YZ
228}
229
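/*
 * Usage sketch (assumed caller, not taken from this file): the backref
 * cache embedded in struct reloc_control follows a plain init/cleanup
 * lifetime around one relocation pass.
 */
static inline void example_backref_cache_lifetime(struct btrfs_fs_info *fs_info,
						  struct reloc_control *rc)
{
	backref_cache_init(fs_info, &rc->backref_cache, 1);
	/* ... build_backref_tree() fills the cache one tree block at a time ... */
	backref_cache_cleanup(&rc->backref_cache);
}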
a26195a5
QW
230static struct btrfs_backref_node *alloc_backref_node(
231 struct btrfs_backref_cache *cache, u64 bytenr, int level)
3fd0a558 232{
a26195a5 233 struct btrfs_backref_node *node;
3fd0a558 234
0304f2d8 235 ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
3fd0a558 236 node = kzalloc(sizeof(*node), GFP_NOFS);
0304f2d8
QW
237 if (!node)
238 return node;
239
240 INIT_LIST_HEAD(&node->list);
241 INIT_LIST_HEAD(&node->upper);
242 INIT_LIST_HEAD(&node->lower);
243 RB_CLEAR_NODE(&node->rb_node);
244 cache->nr_nodes++;
245 node->level = level;
246 node->bytenr = bytenr;
247
3fd0a558
YZ
248 return node;
249}
250
a26195a5
QW
251static void free_backref_node(struct btrfs_backref_cache *cache,
252 struct btrfs_backref_node *node)
3fd0a558
YZ
253{
254 if (node) {
255 cache->nr_nodes--;
00246528 256 btrfs_put_root(node->root);
3fd0a558
YZ
257 kfree(node);
258 }
259}
260
a26195a5
QW
261static struct btrfs_backref_edge *alloc_backref_edge(
262 struct btrfs_backref_cache *cache)
3fd0a558 263{
a26195a5 264 struct btrfs_backref_edge *edge;
3fd0a558
YZ
265
266 edge = kzalloc(sizeof(*edge), GFP_NOFS);
267 if (edge)
268 cache->nr_edges++;
269 return edge;
5d4f98a2
YZ
270}
271
2a979612
QW
272#define LINK_LOWER (1 << 0)
273#define LINK_UPPER (1 << 1)
a26195a5
QW
274static void link_backref_edge(struct btrfs_backref_edge *edge,
275 struct btrfs_backref_node *lower,
276 struct btrfs_backref_node *upper,
2a979612
QW
277 int link_which)
278{
279 ASSERT(upper && lower && upper->level == lower->level + 1);
280 edge->node[LOWER] = lower;
281 edge->node[UPPER] = upper;
282 if (link_which & LINK_LOWER)
283 list_add_tail(&edge->list[LOWER], &lower->upper);
284 if (link_which & LINK_UPPER)
285 list_add_tail(&edge->list[UPPER], &upper->lower);
286}
287
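/*
 * Usage sketch (hypothetical helper): a parent/child pair is always wired
 * up with alloc_backref_edge() + link_backref_edge().  Passing only
 * LINK_LOWER defers the upper-side hookup, which is what the backref
 * build code does while the parent node is still being resolved.
 */
static inline int example_connect_nodes(struct btrfs_backref_cache *cache,
					struct btrfs_backref_node *lower,
					struct btrfs_backref_node *upper)
{
	struct btrfs_backref_edge *edge = alloc_backref_edge(cache);

	if (!edge)
		return -ENOMEM;
	link_backref_edge(edge, lower, upper, LINK_LOWER | LINK_UPPER);
	return 0;
}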
a26195a5
QW
288static void free_backref_edge(struct btrfs_backref_cache *cache,
289 struct btrfs_backref_edge *edge)
5d4f98a2 290{
3fd0a558
YZ
291 if (edge) {
292 cache->nr_edges--;
293 kfree(edge);
294 }
5d4f98a2
YZ
295}
296
297static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
298 struct rb_node *node)
299{
300 struct rb_node **p = &root->rb_node;
301 struct rb_node *parent = NULL;
302 struct tree_entry *entry;
303
304 while (*p) {
305 parent = *p;
306 entry = rb_entry(parent, struct tree_entry, rb_node);
307
308 if (bytenr < entry->bytenr)
309 p = &(*p)->rb_left;
310 else if (bytenr > entry->bytenr)
311 p = &(*p)->rb_right;
312 else
313 return parent;
314 }
315
316 rb_link_node(node, parent, p);
317 rb_insert_color(node, root);
318 return NULL;
319}
320
321static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
322{
323 struct rb_node *n = root->rb_node;
324 struct tree_entry *entry;
325
326 while (n) {
327 entry = rb_entry(n, struct tree_entry, rb_node);
328
329 if (bytenr < entry->bytenr)
330 n = n->rb_left;
331 else if (bytenr > entry->bytenr)
332 n = n->rb_right;
333 else
334 return n;
335 }
336 return NULL;
337}
338
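/*
 * Usage sketch (hypothetical helper): any structure in this file that
 * starts with { rb_node, bytenr } (tree_entry, mapping_node, tree_block)
 * can be indexed with tree_insert()/tree_search().  For example, looking
 * up a cached tree_block by its bytenr:
 */
static inline struct tree_block *example_lookup_tree_block(struct rb_root *root,
							   u64 bytenr)
{
	struct rb_node *rb_node = tree_search(root, bytenr);

	if (!rb_node)
		return NULL;
	return rb_entry(rb_node, struct tree_block, rb_node);
}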
48a3b636 339static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
43c04fb1
JM
340{
341
342 struct btrfs_fs_info *fs_info = NULL;
a26195a5
QW
343 struct btrfs_backref_node *bnode = rb_entry(rb_node,
344 struct btrfs_backref_node, rb_node);
43c04fb1
JM
345 if (bnode->root)
346 fs_info = bnode->root->fs_info;
5d163e0e
JM
347 btrfs_panic(fs_info, errno,
348 "Inconsistency in backref cache found at offset %llu",
349 bytenr);
43c04fb1
JM
350}
351
5d4f98a2
YZ
352/*
 353 * walk up backref nodes until we reach the node that represents the tree root
354 */
a26195a5
QW
355static struct btrfs_backref_node *walk_up_backref(
356 struct btrfs_backref_node *node,
357 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 358{
a26195a5 359 struct btrfs_backref_edge *edge;
5d4f98a2
YZ
360 int idx = *index;
361
362 while (!list_empty(&node->upper)) {
363 edge = list_entry(node->upper.next,
a26195a5 364 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
365 edges[idx++] = edge;
366 node = edge->node[UPPER];
367 }
3fd0a558 368 BUG_ON(node->detached);
5d4f98a2
YZ
369 *index = idx;
370 return node;
371}
372
373/*
374 * walk down backref nodes to find start of next reference path
375 */
a26195a5
QW
376static struct btrfs_backref_node *walk_down_backref(
377 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 378{
a26195a5
QW
379 struct btrfs_backref_edge *edge;
380 struct btrfs_backref_node *lower;
5d4f98a2
YZ
381 int idx = *index;
382
383 while (idx > 0) {
384 edge = edges[idx - 1];
385 lower = edge->node[LOWER];
386 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
387 idx--;
388 continue;
389 }
390 edge = list_entry(edge->list[LOWER].next,
a26195a5 391 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
392 edges[idx - 1] = edge;
393 *index = idx;
394 return edge->node[UPPER];
395 }
396 *index = 0;
397 return NULL;
398}
399
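/*
 * Usage sketch (hypothetical, mirroring how the relocation code uses the
 * two walkers above): enumerate every reference path from a lowest node
 * up to the tree roots.  walk_up_backref() climbs one path to its root,
 * walk_down_backref() backtracks to the start of the next path and
 * returns NULL once all paths have been visited.
 */
static inline void example_walk_all_paths(struct btrfs_backref_node *node)
{
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct btrfs_backref_node *next = node;
	int index = 0;

	while (next) {
		next = walk_up_backref(next, edges, &index);
		/* 'next' is now the node that represents one tree root */
		next = walk_down_backref(edges, &index);
	}
}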
a26195a5 400static void unlock_node_buffer(struct btrfs_backref_node *node)
3fd0a558
YZ
401{
402 if (node->locked) {
403 btrfs_tree_unlock(node->eb);
404 node->locked = 0;
405 }
406}
407
a26195a5 408static void drop_node_buffer(struct btrfs_backref_node *node)
5d4f98a2
YZ
409{
410 if (node->eb) {
3fd0a558 411 unlock_node_buffer(node);
5d4f98a2
YZ
412 free_extent_buffer(node->eb);
413 node->eb = NULL;
414 }
415}
416
a26195a5
QW
417static void drop_backref_node(struct btrfs_backref_cache *tree,
418 struct btrfs_backref_node *node)
5d4f98a2 419{
5d4f98a2
YZ
420 BUG_ON(!list_empty(&node->upper));
421
422 drop_node_buffer(node);
3fd0a558 423 list_del(&node->list);
5d4f98a2 424 list_del(&node->lower);
3fd0a558
YZ
425 if (!RB_EMPTY_NODE(&node->rb_node))
426 rb_erase(&node->rb_node, &tree->rb_root);
427 free_backref_node(tree, node);
5d4f98a2
YZ
428}
429
430/*
431 * remove a backref node from the backref cache
432 */
a26195a5
QW
433static void remove_backref_node(struct btrfs_backref_cache *cache,
434 struct btrfs_backref_node *node)
5d4f98a2 435{
a26195a5
QW
436 struct btrfs_backref_node *upper;
437 struct btrfs_backref_edge *edge;
5d4f98a2
YZ
438
439 if (!node)
440 return;
441
3fd0a558 442 BUG_ON(!node->lowest && !node->detached);
5d4f98a2 443 while (!list_empty(&node->upper)) {
a26195a5 444 edge = list_entry(node->upper.next, struct btrfs_backref_edge,
5d4f98a2
YZ
445 list[LOWER]);
446 upper = edge->node[UPPER];
447 list_del(&edge->list[LOWER]);
448 list_del(&edge->list[UPPER]);
3fd0a558
YZ
449 free_backref_edge(cache, edge);
450
451 if (RB_EMPTY_NODE(&upper->rb_node)) {
452 BUG_ON(!list_empty(&node->upper));
453 drop_backref_node(cache, node);
454 node = upper;
455 node->lowest = 1;
456 continue;
457 }
5d4f98a2 458 /*
3fd0a558 459 * add the node to leaf node list if no other
5d4f98a2
YZ
 460 * child block is cached.
461 */
462 if (list_empty(&upper->lower)) {
3fd0a558 463 list_add_tail(&upper->lower, &cache->leaves);
5d4f98a2
YZ
464 upper->lowest = 1;
465 }
466 }
3fd0a558 467
5d4f98a2
YZ
468 drop_backref_node(cache, node);
469}
470
a26195a5
QW
471static void update_backref_node(struct btrfs_backref_cache *cache,
472 struct btrfs_backref_node *node, u64 bytenr)
3fd0a558
YZ
473{
474 struct rb_node *rb_node;
475 rb_erase(&node->rb_node, &cache->rb_root);
476 node->bytenr = bytenr;
477 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
43c04fb1
JM
478 if (rb_node)
479 backref_tree_panic(rb_node, -EEXIST, bytenr);
3fd0a558
YZ
480}
481
482/*
483 * update backref cache after a transaction commit
484 */
485static int update_backref_cache(struct btrfs_trans_handle *trans,
a26195a5 486 struct btrfs_backref_cache *cache)
3fd0a558 487{
a26195a5 488 struct btrfs_backref_node *node;
3fd0a558
YZ
489 int level = 0;
490
491 if (cache->last_trans == 0) {
492 cache->last_trans = trans->transid;
493 return 0;
494 }
495
496 if (cache->last_trans == trans->transid)
497 return 0;
498
499 /*
500 * detached nodes are used to avoid unnecessary backref
 501 * lookup. A transaction commit changes the extent tree,
 502 * so the detached nodes are no longer useful.
503 */
504 while (!list_empty(&cache->detached)) {
505 node = list_entry(cache->detached.next,
a26195a5 506 struct btrfs_backref_node, list);
3fd0a558
YZ
507 remove_backref_node(cache, node);
508 }
509
510 while (!list_empty(&cache->changed)) {
511 node = list_entry(cache->changed.next,
a26195a5 512 struct btrfs_backref_node, list);
3fd0a558
YZ
513 list_del_init(&node->list);
514 BUG_ON(node->pending);
515 update_backref_node(cache, node, node->new_bytenr);
516 }
517
518 /*
519 * some nodes can be left in the pending list if there were
 520 * errors while processing the pending nodes.
521 */
522 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
523 list_for_each_entry(node, &cache->pending[level], list) {
524 BUG_ON(!node->pending);
525 if (node->bytenr == node->new_bytenr)
526 continue;
527 update_backref_node(cache, node, node->new_bytenr);
528 }
529 }
530
531 cache->last_trans = 0;
532 return 1;
533}
534
6282675e
QW
535static bool reloc_root_is_dead(struct btrfs_root *root)
536{
537 /*
538 * Pair with set_bit/clear_bit in clean_dirty_subvols and
539 * btrfs_update_reloc_root. We need to see the updated bit before
540 * trying to access reloc_root
541 */
542 smp_rmb();
543 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
544 return true;
545 return false;
546}
547
548/*
 549 * Check if this subvolume tree has a valid reloc tree.
 550 *
 551 * A reloc tree after swap is considered dead, thus not considered valid.
552 * This is enough for most callers, as they don't distinguish dead reloc root
553 * from no reloc root. But should_ignore_root() below is a special case.
554 */
555static bool have_reloc_root(struct btrfs_root *root)
556{
557 if (reloc_root_is_dead(root))
558 return false;
559 if (!root->reloc_root)
560 return false;
561 return true;
562}
f2a97a9d 563
3fd0a558
YZ
564static int should_ignore_root(struct btrfs_root *root)
565{
566 struct btrfs_root *reloc_root;
567
27cdeb70 568 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
3fd0a558
YZ
569 return 0;
570
6282675e
QW
571 /* This root has been merged with its reloc tree, we can ignore it */
572 if (reloc_root_is_dead(root))
573 return 1;
574
3fd0a558
YZ
575 reloc_root = root->reloc_root;
576 if (!reloc_root)
577 return 0;
578
4d4225fc
JB
579 if (btrfs_header_generation(reloc_root->commit_root) ==
580 root->fs_info->running_transaction->transid)
3fd0a558
YZ
581 return 0;
582 /*
 583 * if there is a reloc tree and it was created in a previous
 584 * transaction, backref lookup can find the reloc tree,
 585 * so the backref node for the fs tree root is useless for
 586 * relocation.
587 */
588 return 1;
589}
5d4f98a2
YZ
590/*
591 * find reloc tree by address of tree root
592 */
2433bea5 593struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
5d4f98a2 594{
2433bea5 595 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
596 struct rb_node *rb_node;
597 struct mapping_node *node;
598 struct btrfs_root *root = NULL;
599
2433bea5 600 ASSERT(rc);
5d4f98a2
YZ
601 spin_lock(&rc->reloc_root_tree.lock);
602 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
603 if (rb_node) {
604 node = rb_entry(rb_node, struct mapping_node, rb_node);
605 root = (struct btrfs_root *)node->data;
606 }
607 spin_unlock(&rc->reloc_root_tree.lock);
00246528 608 return btrfs_grab_root(root);
5d4f98a2
YZ
609}
610
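/*
 * Usage sketch (hypothetical caller): find_reloc_root() returns the root
 * with an extra reference taken via btrfs_grab_root(), so callers must
 * drop it with btrfs_put_root() when they are done with it.
 */
static inline bool example_have_reloc_tree_at(struct btrfs_fs_info *fs_info,
					      u64 bytenr)
{
	struct btrfs_root *root = find_reloc_root(fs_info, bytenr);

	if (!root)
		return false;
	btrfs_put_root(root);
	return true;
}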
5d4f98a2
YZ
611static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
612 u64 root_objectid)
613{
614 struct btrfs_key key;
615
616 key.objectid = root_objectid;
617 key.type = BTRFS_ROOT_ITEM_KEY;
e1922118 618 key.offset = (u64)-1;
5d4f98a2 619
bc44d7c4 620 return btrfs_get_fs_root(fs_info, &key, false);
5d4f98a2
YZ
621}
622
4007ea87
QW
623/*
624 * Handle direct tree backref
625 *
 626 * Direct tree backref means the backref item shows its parent bytenr
627 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
628 *
629 * @ref_key: The converted backref key.
630 * For keyed backref, it's the item key.
631 * For inlined backref, objectid is the bytenr,
632 * type is btrfs_inline_ref_type, offset is
633 * btrfs_inline_ref_offset.
634 */
a26195a5 635static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
4007ea87 636 struct btrfs_key *ref_key,
a26195a5 637 struct btrfs_backref_node *cur)
4007ea87 638{
a26195a5
QW
639 struct btrfs_backref_edge *edge;
640 struct btrfs_backref_node *upper;
4007ea87
QW
641 struct rb_node *rb_node;
642
643 ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
644
645 /* Only reloc root uses backref pointing to itself */
646 if (ref_key->objectid == ref_key->offset) {
647 struct btrfs_root *root;
648
649 cur->is_reloc_root = 1;
650 /* Only reloc backref cache cares about a specific root */
651 if (cache->is_reloc) {
652 root = find_reloc_root(cache->fs_info, cur->bytenr);
653 if (WARN_ON(!root))
654 return -ENOENT;
655 cur->root = root;
656 } else {
657 /*
658 * For generic purpose backref cache, reloc root node
659 * is useless.
660 */
661 list_add(&cur->list, &cache->useless_node);
662 }
663 return 0;
664 }
665
666 edge = alloc_backref_edge(cache);
667 if (!edge)
668 return -ENOMEM;
669
670 rb_node = tree_search(&cache->rb_root, ref_key->offset);
671 if (!rb_node) {
672 /* Parent node not yet cached */
0304f2d8
QW
673 upper = alloc_backref_node(cache, ref_key->offset,
674 cur->level + 1);
4007ea87
QW
675 if (!upper) {
676 free_backref_edge(cache, edge);
677 return -ENOMEM;
678 }
4007ea87
QW
679
680 /*
 681 * Backrefs for the upper level block aren't cached, add the
682 * block to pending list
683 */
684 list_add_tail(&edge->list[UPPER], &cache->pending_edge);
685 } else {
686 /* Parent node already cached */
a26195a5 687 upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
4007ea87
QW
688 ASSERT(upper->checked);
689 INIT_LIST_HEAD(&edge->list[UPPER]);
690 }
2a979612 691 link_backref_edge(edge, cur, upper, LINK_LOWER);
4007ea87
QW
692 return 0;
693}
694
4d81ea8b
QW
695/*
696 * Handle indirect tree backref
697 *
 698 * Indirect tree backref means we only know which tree the node belongs to.
699 * We still need to do a tree search to find out the parents. This is for
700 * TREE_BLOCK_REF backref (keyed or inlined).
701 *
702 * @ref_key: The same as @ref_key in handle_direct_tree_backref()
703 * @tree_key: The first key of this tree block.
 704 * @path: A clean (released) path, to avoid allocating a path every time
 705 * the function gets called.
706 */
a26195a5 707static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
4d81ea8b
QW
708 struct btrfs_path *path,
709 struct btrfs_key *ref_key,
710 struct btrfs_key *tree_key,
a26195a5 711 struct btrfs_backref_node *cur)
4d81ea8b
QW
712{
713 struct btrfs_fs_info *fs_info = cache->fs_info;
a26195a5
QW
714 struct btrfs_backref_node *upper;
715 struct btrfs_backref_node *lower;
716 struct btrfs_backref_edge *edge;
4d81ea8b
QW
717 struct extent_buffer *eb;
718 struct btrfs_root *root;
719 struct rb_node *rb_node;
720 int level;
721 bool need_check = true;
722 int ret;
723
724 root = read_fs_root(fs_info, ref_key->offset);
725 if (IS_ERR(root))
726 return PTR_ERR(root);
727 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
728 cur->cowonly = 1;
729
730 if (btrfs_root_level(&root->root_item) == cur->level) {
731 /* Tree root */
732 ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
733 if (should_ignore_root(root)) {
734 btrfs_put_root(root);
735 list_add(&cur->list, &cache->useless_node);
736 } else {
737 cur->root = root;
738 }
739 return 0;
740 }
741
742 level = cur->level + 1;
743
744 /* Search the tree to find parent blocks referring to the block */
745 path->search_commit_root = 1;
746 path->skip_locking = 1;
747 path->lowest_level = level;
748 ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
749 path->lowest_level = 0;
750 if (ret < 0) {
751 btrfs_put_root(root);
752 return ret;
753 }
754 if (ret > 0 && path->slots[level] > 0)
755 path->slots[level]--;
756
757 eb = path->nodes[level];
758 if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
759 btrfs_err(fs_info,
760"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
761 cur->bytenr, level - 1, root->root_key.objectid,
762 tree_key->objectid, tree_key->type, tree_key->offset);
763 btrfs_put_root(root);
764 ret = -ENOENT;
765 goto out;
766 }
767 lower = cur;
768
769 /* Add all nodes and edges in the path */
770 for (; level < BTRFS_MAX_LEVEL; level++) {
771 if (!path->nodes[level]) {
772 ASSERT(btrfs_root_bytenr(&root->root_item) ==
773 lower->bytenr);
774 if (should_ignore_root(root)) {
775 btrfs_put_root(root);
776 list_add(&lower->list, &cache->useless_node);
777 } else {
778 lower->root = root;
779 }
780 break;
781 }
782
783 edge = alloc_backref_edge(cache);
784 if (!edge) {
785 btrfs_put_root(root);
786 ret = -ENOMEM;
787 goto out;
788 }
789
790 eb = path->nodes[level];
791 rb_node = tree_search(&cache->rb_root, eb->start);
792 if (!rb_node) {
0304f2d8
QW
793 upper = alloc_backref_node(cache, eb->start,
794 lower->level + 1);
4d81ea8b
QW
795 if (!upper) {
796 btrfs_put_root(root);
797 free_backref_edge(cache, edge);
798 ret = -ENOMEM;
799 goto out;
800 }
4d81ea8b 801 upper->owner = btrfs_header_owner(eb);
4d81ea8b
QW
802 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
803 upper->cowonly = 1;
804
805 /*
806 * If we know the block isn't shared we can avoid
807 * checking its backrefs.
808 */
809 if (btrfs_block_can_be_shared(root, eb))
810 upper->checked = 0;
811 else
812 upper->checked = 1;
813
814 /*
815 * Add the block to pending list if we need to check its
816 * backrefs, we only do this once while walking up a
817 * tree as we will catch anything else later on.
818 */
819 if (!upper->checked && need_check) {
820 need_check = false;
821 list_add_tail(&edge->list[UPPER],
822 &cache->pending_edge);
823 } else {
824 if (upper->checked)
825 need_check = true;
826 INIT_LIST_HEAD(&edge->list[UPPER]);
827 }
828 } else {
a26195a5
QW
829 upper = rb_entry(rb_node, struct btrfs_backref_node,
830 rb_node);
4d81ea8b
QW
831 ASSERT(upper->checked);
832 INIT_LIST_HEAD(&edge->list[UPPER]);
833 if (!upper->owner)
834 upper->owner = btrfs_header_owner(eb);
835 }
2a979612 836 link_backref_edge(edge, lower, upper, LINK_LOWER);
4d81ea8b
QW
837
838 if (rb_node) {
839 btrfs_put_root(root);
840 break;
841 }
842 lower = upper;
843 upper = NULL;
844 }
845out:
846 btrfs_release_path(path);
847 return ret;
848}
849
a26195a5 850static int handle_one_tree_block(struct btrfs_backref_cache *cache,
e7d571c7
QW
851 struct btrfs_path *path,
852 struct btrfs_backref_iter *iter,
853 struct btrfs_key *node_key,
a26195a5 854 struct btrfs_backref_node *cur)
5d4f98a2 855{
e7d571c7 856 struct btrfs_fs_info *fs_info = cache->fs_info;
a26195a5
QW
857 struct btrfs_backref_edge *edge;
858 struct btrfs_backref_node *exist;
5d4f98a2 859 int ret;
5d4f98a2 860
71f572a9 861 ret = btrfs_backref_iter_start(iter, cur->bytenr);
e7d571c7
QW
862 if (ret < 0)
863 return ret;
71f572a9
QW
864 /*
865 * We skip the first btrfs_tree_block_info, as we don't use the key
866 * stored in it, but fetch it from the tree block
867 */
868 if (btrfs_backref_has_tree_block_info(iter)) {
869 ret = btrfs_backref_iter_next(iter);
e7d571c7 870 if (ret < 0)
71f572a9 871 goto out;
71f572a9
QW
872 /* No extra backref? This means the tree block is corrupted */
873 if (ret > 0) {
e7d571c7 874 ret = -EUCLEAN;
71f572a9
QW
875 goto out;
876 }
877 }
5d4f98a2
YZ
878 WARN_ON(cur->checked);
879 if (!list_empty(&cur->upper)) {
880 /*
70f23fd6 881 * the backref was added previously when processing
5d4f98a2
YZ
882 * backref of type BTRFS_TREE_BLOCK_REF_KEY
883 */
75bfb9af 884 ASSERT(list_is_singular(&cur->upper));
a26195a5 885 edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
5d4f98a2 886 list[LOWER]);
75bfb9af 887 ASSERT(list_empty(&edge->list[UPPER]));
5d4f98a2
YZ
888 exist = edge->node[UPPER];
889 /*
890 * add the upper level block to pending list if we need
 891 * to check its backrefs
892 */
893 if (!exist->checked)
84780289 894 list_add_tail(&edge->list[UPPER], &cache->pending_edge);
5d4f98a2
YZ
895 } else {
896 exist = NULL;
897 }
898
71f572a9
QW
899 for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
900 struct extent_buffer *eb;
901 struct btrfs_key key;
902 int type;
5d4f98a2 903
71f572a9
QW
904 cond_resched();
905 eb = btrfs_backref_get_eb(iter);
5d4f98a2 906
71f572a9
QW
907 key.objectid = iter->bytenr;
908 if (btrfs_backref_iter_is_inline_ref(iter)) {
909 struct btrfs_extent_inline_ref *iref;
5d4f98a2 910
5d4f98a2 911 /* update key for inline back ref */
71f572a9
QW
912 iref = (struct btrfs_extent_inline_ref *)
913 ((unsigned long)iter->cur_ptr);
3de28d57
LB
914 type = btrfs_get_extent_inline_ref_type(eb, iref,
915 BTRFS_REF_TYPE_BLOCK);
916 if (type == BTRFS_REF_TYPE_INVALID) {
e7d571c7 917 ret = -EUCLEAN;
3de28d57
LB
918 goto out;
919 }
920 key.type = type;
5d4f98a2 921 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
71f572a9
QW
922 } else {
923 key.type = iter->cur_key.type;
924 key.offset = iter->cur_key.offset;
5d4f98a2
YZ
925 }
926
fa6ac715
QW
927 /*
928 * Parent node found and matches current inline ref, no need to
929 * rebuild this node for this inline ref.
930 */
5d4f98a2
YZ
931 if (exist &&
932 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
933 exist->owner == key.offset) ||
934 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
935 exist->bytenr == key.offset))) {
936 exist = NULL;
71f572a9 937 continue;
5d4f98a2
YZ
938 }
939
fa6ac715 940 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
5d4f98a2 941 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
4007ea87 942 ret = handle_direct_tree_backref(cache, &key, cur);
e7d571c7 943 if (ret < 0)
5d4f98a2 944 goto out;
71f572a9 945 continue;
6d8ff4e4 946 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
e7d571c7
QW
947 ret = -EINVAL;
948 btrfs_print_v0_err(fs_info);
949 btrfs_handle_fs_error(fs_info, ret, NULL);
ba3c2b19 950 goto out;
5d4f98a2 951 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
71f572a9 952 continue;
5d4f98a2
YZ
953 }
954
fa6ac715
QW
955 /*
956 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
957 * means the root objectid. We need to search the tree to get
958 * its parent bytenr.
959 */
4d81ea8b
QW
960 ret = handle_indirect_tree_backref(cache, path, &key, node_key,
961 cur);
e7d571c7 962 if (ret < 0)
5d4f98a2 963 goto out;
5d4f98a2 964 }
71f572a9 965 ret = 0;
5d4f98a2
YZ
966 cur->checked = 1;
967 WARN_ON(exist);
e7d571c7
QW
968out:
969 btrfs_backref_iter_release(iter);
970 return ret;
971}
972
1f872924
QW
973/*
 974 * In handle_one_tree_block(), we have only linked the lower node to the edge,
975 * but the upper node hasn't been linked to the edge.
a26195a5
QW
976 * This means we can only iterate through btrfs_backref_node::upper to reach
977 * parent edges, but not through btrfs_backref_node::lower to reach children
978 * edges.
1f872924 979 *
a26195a5
QW
 980 * This function will finish linking btrfs_backref_node::lower to the related edges,
981 * so that backref cache can be bi-directionally iterated.
1f872924
QW
982 *
983 * Also, this will add the nodes to backref cache for the next run.
984 */
a26195a5
QW
985static int finish_upper_links(struct btrfs_backref_cache *cache,
986 struct btrfs_backref_node *start)
1f872924
QW
987{
988 struct list_head *useless_node = &cache->useless_node;
a26195a5 989 struct btrfs_backref_edge *edge;
1f872924
QW
990 struct rb_node *rb_node;
991 LIST_HEAD(pending_edge);
992
993 ASSERT(start->checked);
994
995 /* Insert this node to cache if it's not COW-only */
996 if (!start->cowonly) {
997 rb_node = tree_insert(&cache->rb_root, start->bytenr,
998 &start->rb_node);
999 if (rb_node)
1000 backref_tree_panic(rb_node, -EEXIST, start->bytenr);
1001 list_add_tail(&start->lower, &cache->leaves);
1002 }
1003
1004 /*
1005 * Use breadth first search to iterate all related edges.
1006 *
1007 * The starting points are all the edges of this node
1008 */
1009 list_for_each_entry(edge, &start->upper, list[LOWER])
1010 list_add_tail(&edge->list[UPPER], &pending_edge);
1011
1012 while (!list_empty(&pending_edge)) {
a26195a5
QW
1013 struct btrfs_backref_node *upper;
1014 struct btrfs_backref_node *lower;
1f872924
QW
1015 struct rb_node *rb_node;
1016
a26195a5
QW
1017 edge = list_first_entry(&pending_edge,
1018 struct btrfs_backref_edge, list[UPPER]);
1f872924
QW
1019 list_del_init(&edge->list[UPPER]);
1020 upper = edge->node[UPPER];
1021 lower = edge->node[LOWER];
1022
1023 /* Parent is detached, no need to keep any edges */
1024 if (upper->detached) {
1025 list_del(&edge->list[LOWER]);
1026 free_backref_edge(cache, edge);
1027
1028 /* Lower node is orphan, queue for cleanup */
1029 if (list_empty(&lower->upper))
1030 list_add(&lower->list, useless_node);
1031 continue;
1032 }
1033
1034 /*
1035 * All new nodes added in current build_backref_tree() haven't
1036 * been linked to the cache rb tree.
1037 * So if we have upper->rb_node populated, this means a cache
1038 * hit. We only need to link the edge, as @upper and all its
1039 * parent have already been linked.
1040 */
1041 if (!RB_EMPTY_NODE(&upper->rb_node)) {
1042 if (upper->lowest) {
1043 list_del_init(&upper->lower);
1044 upper->lowest = 0;
1045 }
1046
1047 list_add_tail(&edge->list[UPPER], &upper->lower);
1048 continue;
1049 }
1050
1051 /* Sanity check, we shouldn't have any unchecked nodes */
1052 if (!upper->checked) {
1053 ASSERT(0);
1054 return -EUCLEAN;
1055 }
1056
1057 /* Sanity check, COW-only node has non-COW-only parent */
1058 if (start->cowonly != upper->cowonly) {
1059 ASSERT(0);
1060 return -EUCLEAN;
1061 }
1062
1063 /* Only cache non-COW-only (subvolume trees) tree blocks */
1064 if (!upper->cowonly) {
1065 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1066 &upper->rb_node);
1067 if (rb_node) {
1068 backref_tree_panic(rb_node, -EEXIST,
1069 upper->bytenr);
1070 return -EUCLEAN;
1071 }
1072 }
1073
1074 list_add_tail(&edge->list[UPPER], &upper->lower);
1075
1076 /*
1077 * Also queue all the parent edges of this uncached node to
1078 * finish the upper linkage
1079 */
1080 list_for_each_entry(edge, &upper->upper, list[LOWER])
1081 list_add_tail(&edge->list[UPPER], &pending_edge);
1082 }
1083 return 0;
1084}
1085
29db137b
QW
1086/*
 1087 * For useless nodes, do two major cleanups:
 1088 *
 1089 * - Cleanup the children edges and nodes
 1090 * If a child node is also an orphan (no parent) during cleanup, then the child
 1091 * node will also be cleaned up.
 1092 *
 1093 * - Freeing up leaves (level 0), keeping non-leaf nodes detached
 1094 * For non-leaf nodes, the node is still cached as "detached"
1095 *
1096 * Return false if @node is not in the @useless_nodes list.
1097 * Return true if @node is in the @useless_nodes list.
1098 */
1099static bool handle_useless_nodes(struct reloc_control *rc,
a26195a5 1100 struct btrfs_backref_node *node)
29db137b 1101{
a26195a5 1102 struct btrfs_backref_cache *cache = &rc->backref_cache;
29db137b
QW
1103 struct list_head *useless_node = &cache->useless_node;
1104 bool ret = false;
1105
1106 while (!list_empty(useless_node)) {
a26195a5 1107 struct btrfs_backref_node *cur;
29db137b 1108
a26195a5 1109 cur = list_first_entry(useless_node, struct btrfs_backref_node,
29db137b
QW
1110 list);
1111 list_del_init(&cur->list);
1112
1113 /* Only tree root nodes can be added to @useless_nodes */
1114 ASSERT(list_empty(&cur->upper));
1115
1116 if (cur == node)
1117 ret = true;
1118
1119 /* The node is the lowest node */
1120 if (cur->lowest) {
1121 list_del_init(&cur->lower);
1122 cur->lowest = 0;
1123 }
1124
1125 /* Cleanup the lower edges */
1126 while (!list_empty(&cur->lower)) {
a26195a5
QW
1127 struct btrfs_backref_edge *edge;
1128 struct btrfs_backref_node *lower;
29db137b
QW
1129
1130 edge = list_entry(cur->lower.next,
a26195a5 1131 struct btrfs_backref_edge, list[UPPER]);
29db137b
QW
1132 list_del(&edge->list[UPPER]);
1133 list_del(&edge->list[LOWER]);
1134 lower = edge->node[LOWER];
1135 free_backref_edge(cache, edge);
1136
1137 /* Child node is also orphan, queue for cleanup */
1138 if (list_empty(&lower->upper))
1139 list_add(&lower->list, useless_node);
1140 }
1141 /* Mark this block processed for relocation */
1142 mark_block_processed(rc, cur);
1143
1144 /*
1145 * Backref nodes for tree leaves are deleted from the cache.
1146 * Backref nodes for upper level tree blocks are left in the
1147 * cache to avoid unnecessary backref lookup.
1148 */
1149 if (cur->level > 0) {
1150 list_add(&cur->list, &cache->detached);
1151 cur->detached = 1;
1152 } else {
1153 rb_erase(&cur->rb_node, &cache->rb_root);
1154 free_backref_node(cache, cur);
1155 }
1156 }
1157 return ret;
1158}
1159
e7d571c7
QW
1160/*
 1161 * Build backref tree for a given tree block. Root of the backref tree
 1162 * corresponds to the tree block, leaves of the backref tree correspond to roots of
 1163 * b-trees that reference the tree block.
 1164 *
 1165 * The basic idea of this function is to check backrefs of a given block to find
 1166 * upper level blocks that reference the block, and then check backrefs of
 1167 * these upper level blocks recursively. The recursion stops when the tree root is
 1168 * reached or the backrefs for the block are cached.
1169 *
1170 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
1171 * all upper level blocks that directly/indirectly reference the block are also
1172 * cached.
1173 */
a26195a5 1174static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
e7d571c7
QW
1175 struct reloc_control *rc, struct btrfs_key *node_key,
1176 int level, u64 bytenr)
1177{
1178 struct btrfs_backref_iter *iter;
a26195a5 1179 struct btrfs_backref_cache *cache = &rc->backref_cache;
e7d571c7
QW
1180 /* For searching parent of TREE_BLOCK_REF */
1181 struct btrfs_path *path;
a26195a5
QW
1182 struct btrfs_backref_node *cur;
1183 struct btrfs_backref_node *upper;
1184 struct btrfs_backref_node *lower;
1185 struct btrfs_backref_node *node = NULL;
1186 struct btrfs_backref_edge *edge;
e7d571c7
QW
1187 int ret;
1188 int err = 0;
5d4f98a2 1189
e7d571c7
QW
1190 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
1191 if (!iter)
1192 return ERR_PTR(-ENOMEM);
1193 path = btrfs_alloc_path();
1194 if (!path) {
1195 err = -ENOMEM;
1196 goto out;
1197 }
1198
1199 node = alloc_backref_node(cache, bytenr, level);
1200 if (!node) {
1201 err = -ENOMEM;
1202 goto out;
5d4f98a2
YZ
1203 }
1204
e7d571c7
QW
1205 node->lowest = 1;
1206 cur = node;
1207
1208 /* Breadth-first search to build backref cache */
1209 do {
1210 ret = handle_one_tree_block(cache, path, iter, node_key, cur);
1211 if (ret < 0) {
1212 err = ret;
1213 goto out;
1214 }
1215 edge = list_first_entry_or_null(&cache->pending_edge,
a26195a5 1216 struct btrfs_backref_edge, list[UPPER]);
e7d571c7
QW
1217 /*
1218 * The pending list isn't empty, take the first block to
1219 * process
1220 */
1221 if (edge) {
1222 list_del_init(&edge->list[UPPER]);
1223 cur = edge->node[UPPER];
1224 }
1225 } while (edge);
1226
1f872924
QW
1227 /* Finish the upper linkage of newly added edges/nodes */
1228 ret = finish_upper_links(cache, node);
1229 if (ret < 0) {
1230 err = ret;
1231 goto out;
3fd0a558 1232 }
5d4f98a2 1233
29db137b
QW
1234 if (handle_useless_nodes(rc, node))
1235 node = NULL;
5d4f98a2 1236out:
71f572a9
QW
1237 btrfs_backref_iter_free(iter);
1238 btrfs_free_path(path);
5d4f98a2 1239 if (err) {
84780289
QW
1240 while (!list_empty(&cache->useless_node)) {
1241 lower = list_first_entry(&cache->useless_node,
a26195a5 1242 struct btrfs_backref_node, list);
75bfb9af 1243 list_del_init(&lower->list);
3fd0a558 1244 }
84780289
QW
1245 while (!list_empty(&cache->pending_edge)) {
1246 edge = list_first_entry(&cache->pending_edge,
a26195a5 1247 struct btrfs_backref_edge, list[UPPER]);
75bfb9af 1248 list_del(&edge->list[UPPER]);
3fd0a558 1249 list_del(&edge->list[LOWER]);
75bfb9af 1250 lower = edge->node[LOWER];
5d4f98a2 1251 upper = edge->node[UPPER];
3fd0a558 1252 free_backref_edge(cache, edge);
75bfb9af
JB
1253
1254 /*
1255 * Lower is no longer linked to any upper backref nodes
1256 * and isn't in the cache, we can free it ourselves.
1257 */
1258 if (list_empty(&lower->upper) &&
1259 RB_EMPTY_NODE(&lower->rb_node))
84780289 1260 list_add(&lower->list, &cache->useless_node);
75bfb9af
JB
1261
1262 if (!RB_EMPTY_NODE(&upper->rb_node))
1263 continue;
1264
01327610 1265 /* Add this guy's upper edges to the list to process */
75bfb9af 1266 list_for_each_entry(edge, &upper->upper, list[LOWER])
84780289
QW
1267 list_add_tail(&edge->list[UPPER],
1268 &cache->pending_edge);
75bfb9af 1269 if (list_empty(&upper->upper))
84780289 1270 list_add(&upper->list, &cache->useless_node);
75bfb9af
JB
1271 }
1272
84780289
QW
1273 while (!list_empty(&cache->useless_node)) {
1274 lower = list_first_entry(&cache->useless_node,
a26195a5 1275 struct btrfs_backref_node, list);
75bfb9af 1276 list_del_init(&lower->list);
0fd8c3da
LB
1277 if (lower == node)
1278 node = NULL;
75bfb9af 1279 free_backref_node(cache, lower);
5d4f98a2 1280 }
0fd8c3da 1281
8e19c973 1282 remove_backref_node(cache, node);
84780289
QW
1283 ASSERT(list_empty(&cache->useless_node) &&
1284 list_empty(&cache->pending_edge));
5d4f98a2
YZ
1285 return ERR_PTR(err);
1286 }
75bfb9af 1287 ASSERT(!node || !node->detached);
84780289
QW
1288 ASSERT(list_empty(&cache->useless_node) &&
1289 list_empty(&cache->pending_edge));
5d4f98a2
YZ
1290 return node;
1291}
1292
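/*
 * Usage sketch (hypothetical, loosely modelled on how the relocation code
 * consumes build_backref_tree()): build the backref tree for one tree
 * block, bail out on error, and otherwise walk the resulting node.  A
 * NULL return means the block was found to be useless and dropped again.
 */
static inline int example_build_backrefs(struct reloc_control *rc,
					 struct btrfs_key *key,
					 int level, u64 bytenr)
{
	struct btrfs_backref_node *node;

	node = build_backref_tree(rc, key, level, bytenr);
	if (IS_ERR(node))
		return PTR_ERR(node);
	/* ... follow node->upper edges / node->root to do the relocation ... */
	return 0;
}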
3fd0a558
YZ
1293/*
1294 * helper to add backref node for the newly created snapshot.
1295 * the backref node is created by cloning backref node that
1296 * corresponds to root of source tree
1297 */
1298static int clone_backref_node(struct btrfs_trans_handle *trans,
1299 struct reloc_control *rc,
1300 struct btrfs_root *src,
1301 struct btrfs_root *dest)
1302{
1303 struct btrfs_root *reloc_root = src->reloc_root;
a26195a5
QW
1304 struct btrfs_backref_cache *cache = &rc->backref_cache;
1305 struct btrfs_backref_node *node = NULL;
1306 struct btrfs_backref_node *new_node;
1307 struct btrfs_backref_edge *edge;
1308 struct btrfs_backref_edge *new_edge;
3fd0a558
YZ
1309 struct rb_node *rb_node;
1310
1311 if (cache->last_trans > 0)
1312 update_backref_cache(trans, cache);
1313
1314 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1315 if (rb_node) {
a26195a5 1316 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
3fd0a558
YZ
1317 if (node->detached)
1318 node = NULL;
1319 else
1320 BUG_ON(node->new_bytenr != reloc_root->node->start);
1321 }
1322
1323 if (!node) {
1324 rb_node = tree_search(&cache->rb_root,
1325 reloc_root->commit_root->start);
1326 if (rb_node) {
a26195a5 1327 node = rb_entry(rb_node, struct btrfs_backref_node,
3fd0a558
YZ
1328 rb_node);
1329 BUG_ON(node->detached);
1330 }
1331 }
1332
1333 if (!node)
1334 return 0;
1335
0304f2d8 1336 new_node = alloc_backref_node(cache, dest->node->start, node->level);
3fd0a558
YZ
1337 if (!new_node)
1338 return -ENOMEM;
1339
3fd0a558 1340 new_node->lowest = node->lowest;
6848ad64 1341 new_node->checked = 1;
00246528 1342 new_node->root = btrfs_grab_root(dest);
0b530bc5 1343 ASSERT(new_node->root);
3fd0a558
YZ
1344
1345 if (!node->lowest) {
1346 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1347 new_edge = alloc_backref_edge(cache);
1348 if (!new_edge)
1349 goto fail;
1350
2a979612
QW
1351 link_backref_edge(new_edge, edge->node[LOWER], new_node,
1352 LINK_UPPER);
3fd0a558 1353 }
76b9e23d
MX
1354 } else {
1355 list_add_tail(&new_node->lower, &cache->leaves);
3fd0a558
YZ
1356 }
1357
1358 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1359 &new_node->rb_node);
43c04fb1
JM
1360 if (rb_node)
1361 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
3fd0a558
YZ
1362
1363 if (!new_node->lowest) {
1364 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1365 list_add_tail(&new_edge->list[LOWER],
1366 &new_edge->node[LOWER]->upper);
1367 }
1368 }
1369 return 0;
1370fail:
1371 while (!list_empty(&new_node->lower)) {
1372 new_edge = list_entry(new_node->lower.next,
a26195a5 1373 struct btrfs_backref_edge, list[UPPER]);
3fd0a558
YZ
1374 list_del(&new_edge->list[UPPER]);
1375 free_backref_edge(cache, new_edge);
1376 }
1377 free_backref_node(cache, new_node);
1378 return -ENOMEM;
1379}
1380
5d4f98a2
YZ
1381/*
1382 * helper to add 'address of tree root -> reloc tree' mapping
1383 */
ffd7b339 1384static int __must_check __add_reloc_root(struct btrfs_root *root)
5d4f98a2 1385{
0b246afa 1386 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1387 struct rb_node *rb_node;
1388 struct mapping_node *node;
0b246afa 1389 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
1390
1391 node = kmalloc(sizeof(*node), GFP_NOFS);
ffd7b339
JM
1392 if (!node)
1393 return -ENOMEM;
5d4f98a2 1394
ea287ab1 1395 node->bytenr = root->commit_root->start;
5d4f98a2
YZ
1396 node->data = root;
1397
1398 spin_lock(&rc->reloc_root_tree.lock);
1399 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1400 node->bytenr, &node->rb_node);
1401 spin_unlock(&rc->reloc_root_tree.lock);
ffd7b339 1402 if (rb_node) {
0b246afa 1403 btrfs_panic(fs_info, -EEXIST,
5d163e0e
JM
1404 "Duplicate root found for start=%llu while inserting into relocation tree",
1405 node->bytenr);
ffd7b339 1406 }
5d4f98a2
YZ
1407
1408 list_add_tail(&root->root_list, &rc->reloc_roots);
1409 return 0;
1410}
1411
1412/*
c974c464 1413 * helper to delete the 'address of tree root -> reloc tree'
5d4f98a2
YZ
1414 * mapping
1415 */
c974c464 1416static void __del_reloc_root(struct btrfs_root *root)
5d4f98a2 1417{
0b246afa 1418 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1419 struct rb_node *rb_node;
1420 struct mapping_node *node = NULL;
0b246afa 1421 struct reloc_control *rc = fs_info->reloc_ctl;
f44deb74 1422 bool put_ref = false;
5d4f98a2 1423
65c6e82b 1424 if (rc && root->node) {
389305b2
QW
1425 spin_lock(&rc->reloc_root_tree.lock);
1426 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
ea287ab1 1427 root->commit_root->start);
389305b2
QW
1428 if (rb_node) {
1429 node = rb_entry(rb_node, struct mapping_node, rb_node);
1430 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
ea287ab1 1431 RB_CLEAR_NODE(&node->rb_node);
389305b2
QW
1432 }
1433 spin_unlock(&rc->reloc_root_tree.lock);
1434 if (!node)
1435 return;
1436 BUG_ON((struct btrfs_root *)node->data != root);
5d4f98a2 1437 }
5d4f98a2 1438
f44deb74
JB
1439 /*
1440 * We only put the reloc root here if it's on the list. There's a lot
1441 * of places where the pattern is to splice the rc->reloc_roots, process
1442 * the reloc roots, and then add the reloc root back onto
1443 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
1444 * list we don't want the reference being dropped, because the guy
1445 * messing with the list is in charge of the reference.
1446 */
0b246afa 1447 spin_lock(&fs_info->trans_lock);
f44deb74
JB
1448 if (!list_empty(&root->root_list)) {
1449 put_ref = true;
1450 list_del_init(&root->root_list);
1451 }
0b246afa 1452 spin_unlock(&fs_info->trans_lock);
f44deb74
JB
1453 if (put_ref)
1454 btrfs_put_root(root);
c974c464
WS
1455 kfree(node);
1456}
1457
1458/*
1459 * helper to update the 'address of tree root -> reloc tree'
1460 * mapping
1461 */
ea287ab1 1462static int __update_reloc_root(struct btrfs_root *root)
c974c464 1463{
0b246afa 1464 struct btrfs_fs_info *fs_info = root->fs_info;
c974c464
WS
1465 struct rb_node *rb_node;
1466 struct mapping_node *node = NULL;
0b246afa 1467 struct reloc_control *rc = fs_info->reloc_ctl;
c974c464
WS
1468
1469 spin_lock(&rc->reloc_root_tree.lock);
1470 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
ea287ab1 1471 root->commit_root->start);
c974c464
WS
1472 if (rb_node) {
1473 node = rb_entry(rb_node, struct mapping_node, rb_node);
1474 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
5d4f98a2 1475 }
c974c464
WS
1476 spin_unlock(&rc->reloc_root_tree.lock);
1477
1478 if (!node)
1479 return 0;
1480 BUG_ON((struct btrfs_root *)node->data != root);
1481
1482 spin_lock(&rc->reloc_root_tree.lock);
ea287ab1 1483 node->bytenr = root->node->start;
c974c464
WS
1484 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1485 node->bytenr, &node->rb_node);
1486 spin_unlock(&rc->reloc_root_tree.lock);
1487 if (rb_node)
1488 backref_tree_panic(rb_node, -EEXIST, node->bytenr);
5d4f98a2
YZ
1489 return 0;
1490}
1491
3fd0a558
YZ
1492static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1493 struct btrfs_root *root, u64 objectid)
5d4f98a2 1494{
0b246afa 1495 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1496 struct btrfs_root *reloc_root;
1497 struct extent_buffer *eb;
1498 struct btrfs_root_item *root_item;
1499 struct btrfs_key root_key;
1500 int ret;
1501
5d4f98a2
YZ
1502 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1503 BUG_ON(!root_item);
1504
1505 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1506 root_key.type = BTRFS_ROOT_ITEM_KEY;
3fd0a558 1507 root_key.offset = objectid;
5d4f98a2 1508
3fd0a558 1509 if (root->root_key.objectid == objectid) {
054570a1
FM
1510 u64 commit_root_gen;
1511
3fd0a558
YZ
1512 /* called by btrfs_init_reloc_root */
1513 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1514 BTRFS_TREE_RELOC_OBJECTID);
1515 BUG_ON(ret);
054570a1
FM
1516 /*
1517 * Set the last_snapshot field to the generation of the commit
1518 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1519 * correctly (returns true) when the relocation root is created
1520 * either inside the critical section of a transaction commit
1521 * (through transaction.c:qgroup_account_snapshot()) and when
1522 * it's created before the transaction commit is started.
1523 */
1524 commit_root_gen = btrfs_header_generation(root->commit_root);
1525 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
3fd0a558
YZ
1526 } else {
1527 /*
1528 * called by btrfs_reloc_post_snapshot_hook.
1529 * the source tree is a reloc tree, all tree blocks
1530 * modified after it was created have RELOC flag
1531 * set in their headers. so it's OK to not update
1532 * the 'last_snapshot'.
1533 */
1534 ret = btrfs_copy_root(trans, root, root->node, &eb,
1535 BTRFS_TREE_RELOC_OBJECTID);
1536 BUG_ON(ret);
1537 }
5d4f98a2 1538
5d4f98a2 1539 memcpy(root_item, &root->root_item, sizeof(*root_item));
5d4f98a2
YZ
1540 btrfs_set_root_bytenr(root_item, eb->start);
1541 btrfs_set_root_level(root_item, btrfs_header_level(eb));
1542 btrfs_set_root_generation(root_item, trans->transid);
3fd0a558
YZ
1543
1544 if (root->root_key.objectid == objectid) {
1545 btrfs_set_root_refs(root_item, 0);
1546 memset(&root_item->drop_progress, 0,
1547 sizeof(struct btrfs_disk_key));
1548 root_item->drop_level = 0;
1549 }
5d4f98a2
YZ
1550
1551 btrfs_tree_unlock(eb);
1552 free_extent_buffer(eb);
1553
0b246afa 1554 ret = btrfs_insert_root(trans, fs_info->tree_root,
5d4f98a2
YZ
1555 &root_key, root_item);
1556 BUG_ON(ret);
1557 kfree(root_item);
1558
3dbf1738 1559 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
5d4f98a2 1560 BUG_ON(IS_ERR(reloc_root));
3dbf1738 1561 set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
5d4f98a2 1562 reloc_root->last_trans = trans->transid;
3fd0a558
YZ
1563 return reloc_root;
1564}
1565
1566/*
1567 * create reloc tree for a given fs tree. reloc tree is just a
1568 * snapshot of the fs tree with special root objectid.
f44deb74
JB
1569 *
1570 * The reloc_root comes out of here with two references, one for
1571 * root->reloc_root, and another for being on the rc->reloc_roots list.
3fd0a558
YZ
1572 */
1573int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1574 struct btrfs_root *root)
1575{
0b246afa 1576 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 1577 struct btrfs_root *reloc_root;
0b246afa 1578 struct reloc_control *rc = fs_info->reloc_ctl;
20dd2cbf 1579 struct btrfs_block_rsv *rsv;
3fd0a558 1580 int clear_rsv = 0;
ffd7b339 1581 int ret;
3fd0a558 1582
aec7db3b 1583 if (!rc)
2abc726a
JB
1584 return 0;
1585
1fac4a54
QW
1586 /*
1587 * The subvolume has reloc tree but the swap is finished, no need to
1588 * create/update the dead reloc tree
1589 */
6282675e 1590 if (reloc_root_is_dead(root))
1fac4a54
QW
1591 return 0;
1592
aec7db3b
JB
1593 /*
1594 * This is subtle but important. We do not do
1595 * record_root_in_transaction for reloc roots, instead we record their
1596 * corresponding fs root, and then here we update the last trans for the
1597 * reloc root. This means that we have to do this for the entire life
1598 * of the reloc root, regardless of which stage of the relocation we are
1599 * in.
1600 */
3fd0a558
YZ
1601 if (root->reloc_root) {
1602 reloc_root = root->reloc_root;
1603 reloc_root->last_trans = trans->transid;
1604 return 0;
1605 }
1606
aec7db3b
JB
1607 /*
1608 * We are merging reloc roots, we do not need new reloc trees. Also
1609 * reloc trees never need their own reloc tree.
1610 */
1611 if (!rc->create_reloc_tree ||
1612 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1613 return 0;
1614
20dd2cbf
MX
1615 if (!trans->reloc_reserved) {
1616 rsv = trans->block_rsv;
3fd0a558
YZ
1617 trans->block_rsv = rc->block_rsv;
1618 clear_rsv = 1;
1619 }
1620 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1621 if (clear_rsv)
20dd2cbf 1622 trans->block_rsv = rsv;
5d4f98a2 1623
ffd7b339
JM
1624 ret = __add_reloc_root(reloc_root);
1625 BUG_ON(ret < 0);
f44deb74 1626 root->reloc_root = btrfs_grab_root(reloc_root);
5d4f98a2
YZ
1627 return 0;
1628}
1629
1630/*
1631 * update root item of reloc tree
1632 */
1633int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1634 struct btrfs_root *root)
1635{
0b246afa 1636 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1637 struct btrfs_root *reloc_root;
1638 struct btrfs_root_item *root_item;
5d4f98a2
YZ
1639 int ret;
1640
6282675e 1641 if (!have_reloc_root(root))
7585717f 1642 goto out;
5d4f98a2
YZ
1643
1644 reloc_root = root->reloc_root;
1645 root_item = &reloc_root->root_item;
1646
f44deb74
JB
1647 /*
1648 * We are probably ok here, but __del_reloc_root() will drop its ref of
1649 * the root. We have the ref for root->reloc_root, but just in case
1650 * hold it while we update the reloc root.
1651 */
1652 btrfs_grab_root(reloc_root);
1653
d2311e69 1654 /* root->reloc_root will stay until current relocation finished */
0b246afa 1655 if (fs_info->reloc_ctl->merge_reloc_tree &&
3fd0a558 1656 btrfs_root_refs(root_item) == 0) {
d2311e69 1657 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
6282675e
QW
1658 /*
1659 * Mark the tree as dead before we change reloc_root so
1660 * have_reloc_root will not touch it from now on.
1661 */
1662 smp_wmb();
c974c464 1663 __del_reloc_root(reloc_root);
5d4f98a2
YZ
1664 }
1665
5d4f98a2 1666 if (reloc_root->commit_root != reloc_root->node) {
ea287ab1 1667 __update_reloc_root(reloc_root);
5d4f98a2
YZ
1668 btrfs_set_root_node(root_item, reloc_root->node);
1669 free_extent_buffer(reloc_root->commit_root);
1670 reloc_root->commit_root = btrfs_root_node(reloc_root);
1671 }
1672
0b246afa 1673 ret = btrfs_update_root(trans, fs_info->tree_root,
5d4f98a2
YZ
1674 &reloc_root->root_key, root_item);
1675 BUG_ON(ret);
f44deb74 1676 btrfs_put_root(reloc_root);
7585717f 1677out:
5d4f98a2
YZ
1678 return 0;
1679}
1680
1681/*
1682 * helper to find first cached inode with inode number >= objectid
1683 * in a subvolume
1684 */
1685static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1686{
1687 struct rb_node *node;
1688 struct rb_node *prev;
1689 struct btrfs_inode *entry;
1690 struct inode *inode;
1691
1692 spin_lock(&root->inode_lock);
1693again:
1694 node = root->inode_tree.rb_node;
1695 prev = NULL;
1696 while (node) {
1697 prev = node;
1698 entry = rb_entry(node, struct btrfs_inode, rb_node);
1699
4a0cc7ca 1700 if (objectid < btrfs_ino(entry))
5d4f98a2 1701 node = node->rb_left;
4a0cc7ca 1702 else if (objectid > btrfs_ino(entry))
5d4f98a2
YZ
1703 node = node->rb_right;
1704 else
1705 break;
1706 }
1707 if (!node) {
1708 while (prev) {
1709 entry = rb_entry(prev, struct btrfs_inode, rb_node);
4a0cc7ca 1710 if (objectid <= btrfs_ino(entry)) {
5d4f98a2
YZ
1711 node = prev;
1712 break;
1713 }
1714 prev = rb_next(prev);
1715 }
1716 }
1717 while (node) {
1718 entry = rb_entry(node, struct btrfs_inode, rb_node);
1719 inode = igrab(&entry->vfs_inode);
1720 if (inode) {
1721 spin_unlock(&root->inode_lock);
1722 return inode;
1723 }
1724
4a0cc7ca 1725 objectid = btrfs_ino(entry) + 1;
5d4f98a2
YZ
1726 if (cond_resched_lock(&root->inode_lock))
1727 goto again;
1728
1729 node = rb_next(node);
1730 }
1731 spin_unlock(&root->inode_lock);
1732 return NULL;
1733}
1734
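/*
 * Usage sketch (hypothetical): find_next_inode() is designed for walking
 * every cached inode of a subvolume in ascending inode-number order:
 */
static inline void example_walk_cached_inodes(struct btrfs_root *root)
{
	struct inode *inode = find_next_inode(root, 0);

	while (inode) {
		u64 next_ino = btrfs_ino(BTRFS_I(inode)) + 1;

		/* ... use the inode ... */
		btrfs_add_delayed_iput(inode);
		inode = find_next_inode(root, next_ino);
	}
}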
5d4f98a2
YZ
1735/*
1736 * get new location of data
1737 */
1738static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1739 u64 bytenr, u64 num_bytes)
1740{
1741 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1742 struct btrfs_path *path;
1743 struct btrfs_file_extent_item *fi;
1744 struct extent_buffer *leaf;
1745 int ret;
1746
1747 path = btrfs_alloc_path();
1748 if (!path)
1749 return -ENOMEM;
1750
1751 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
f85b7379
DS
1752 ret = btrfs_lookup_file_extent(NULL, root, path,
1753 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
5d4f98a2
YZ
1754 if (ret < 0)
1755 goto out;
1756 if (ret > 0) {
1757 ret = -ENOENT;
1758 goto out;
1759 }
1760
1761 leaf = path->nodes[0];
1762 fi = btrfs_item_ptr(leaf, path->slots[0],
1763 struct btrfs_file_extent_item);
1764
1765 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1766 btrfs_file_extent_compression(leaf, fi) ||
1767 btrfs_file_extent_encryption(leaf, fi) ||
1768 btrfs_file_extent_other_encoding(leaf, fi));
1769
1770 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
83d4cfd4 1771 ret = -EINVAL;
5d4f98a2
YZ
1772 goto out;
1773 }
1774
3fd0a558 1775 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5d4f98a2
YZ
1776 ret = 0;
1777out:
1778 btrfs_free_path(path);
1779 return ret;
1780}
1781
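/*
 * Note on the offset math above (illustrative numbers only): the data reloc
 * inode keeps the start of the block group being relocated in index_cnt, so
 * the lookup is done at
 *
 *	file offset = old extent bytenr - block group start
 *
 * e.g. for a block group starting at 1GiB and an old extent at 1GiB + 4MiB,
 * the file extent item at offset 4MiB holds the new disk bytenr.
 */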
1782/*
1783 * update file extent items in the tree leaf to point to
1784 * the new locations.
1785 */
3fd0a558
YZ
1786static noinline_for_stack
1787int replace_file_extents(struct btrfs_trans_handle *trans,
1788 struct reloc_control *rc,
1789 struct btrfs_root *root,
1790 struct extent_buffer *leaf)
5d4f98a2 1791{
0b246afa 1792 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1793 struct btrfs_key key;
1794 struct btrfs_file_extent_item *fi;
1795 struct inode *inode = NULL;
5d4f98a2
YZ
1796 u64 parent;
1797 u64 bytenr;
3fd0a558 1798 u64 new_bytenr = 0;
5d4f98a2
YZ
1799 u64 num_bytes;
1800 u64 end;
1801 u32 nritems;
1802 u32 i;
83d4cfd4 1803 int ret = 0;
5d4f98a2
YZ
1804 int first = 1;
1805 int dirty = 0;
1806
1807 if (rc->stage != UPDATE_DATA_PTRS)
1808 return 0;
1809
1810 /* reloc trees always use full backref */
1811 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1812 parent = leaf->start;
1813 else
1814 parent = 0;
1815
1816 nritems = btrfs_header_nritems(leaf);
1817 for (i = 0; i < nritems; i++) {
82fa113f
QW
1818 struct btrfs_ref ref = { 0 };
1819
5d4f98a2
YZ
1820 cond_resched();
1821 btrfs_item_key_to_cpu(leaf, &key, i);
1822 if (key.type != BTRFS_EXTENT_DATA_KEY)
1823 continue;
1824 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1825 if (btrfs_file_extent_type(leaf, fi) ==
1826 BTRFS_FILE_EXTENT_INLINE)
1827 continue;
1828 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1829 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1830 if (bytenr == 0)
1831 continue;
9569cc20
QW
1832 if (!in_range(bytenr, rc->block_group->start,
1833 rc->block_group->length))
5d4f98a2
YZ
1834 continue;
1835
1836 /*
 1837		 * if we are modifying a block in the fs tree, wait for readpage
1838 * to complete and drop the extent cache
1839 */
1840 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5d4f98a2
YZ
1841 if (first) {
1842 inode = find_next_inode(root, key.objectid);
5d4f98a2 1843 first = 0;
4a0cc7ca 1844 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
3fd0a558 1845 btrfs_add_delayed_iput(inode);
5d4f98a2 1846 inode = find_next_inode(root, key.objectid);
5d4f98a2 1847 }
4a0cc7ca 1848 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
5d4f98a2
YZ
1849 end = key.offset +
1850 btrfs_file_extent_num_bytes(leaf, fi);
1851 WARN_ON(!IS_ALIGNED(key.offset,
0b246afa
JM
1852 fs_info->sectorsize));
1853 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
1854 end--;
1855 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1856 key.offset, end);
5d4f98a2
YZ
1857 if (!ret)
1858 continue;
1859
dcdbc059
NB
1860 btrfs_drop_extent_cache(BTRFS_I(inode),
1861 key.offset, end, 1);
5d4f98a2 1862 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1863 key.offset, end);
5d4f98a2
YZ
1864 }
1865 }
1866
1867 ret = get_new_location(rc->data_inode, &new_bytenr,
1868 bytenr, num_bytes);
83d4cfd4
JB
1869 if (ret) {
1870 /*
1871 * Don't have to abort since we've not changed anything
1872 * in the file extent yet.
1873 */
1874 break;
3fd0a558 1875 }
5d4f98a2
YZ
1876
1877 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1878 dirty = 1;
1879
1880 key.offset -= btrfs_file_extent_offset(leaf, fi);
82fa113f
QW
1881 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1882 num_bytes, parent);
1883 ref.real_root = root->root_key.objectid;
1884 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1885 key.objectid, key.offset);
1886 ret = btrfs_inc_extent_ref(trans, &ref);
83d4cfd4 1887 if (ret) {
66642832 1888 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1889 break;
1890 }
5d4f98a2 1891
ffd4bb2a
QW
1892 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1893 num_bytes, parent);
1894 ref.real_root = root->root_key.objectid;
1895 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1896 key.objectid, key.offset);
1897 ret = btrfs_free_extent(trans, &ref);
83d4cfd4 1898 if (ret) {
66642832 1899 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1900 break;
1901 }
5d4f98a2
YZ
1902 }
1903 if (dirty)
1904 btrfs_mark_buffer_dirty(leaf);
3fd0a558
YZ
1905 if (inode)
1906 btrfs_add_delayed_iput(inode);
83d4cfd4 1907 return ret;
5d4f98a2
YZ
1908}
1909
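/*
 * Summary of replace_file_extents(): during the UPDATE_DATA_PTRS stage every
 * file extent item in the leaf that still points into the block group being
 * relocated gets its disk bytenr rewritten to the relocated copy found via
 * get_new_location(); a backref is added for the new bytenr and the backref
 * on the old bytenr is dropped, so the old extent can be freed once the
 * relocation completes.
 */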
1910static noinline_for_stack
1911int memcmp_node_keys(struct extent_buffer *eb, int slot,
1912 struct btrfs_path *path, int level)
1913{
1914 struct btrfs_disk_key key1;
1915 struct btrfs_disk_key key2;
1916 btrfs_node_key(eb, &key1, slot);
1917 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1918 return memcmp(&key1, &key2, sizeof(key1));
1919}
1920
1921/*
1922 * try to replace tree blocks in fs tree with the new blocks
 1923 * in reloc tree. Tree blocks that haven't been modified since the
 1924 * reloc tree was created can be replaced.
1925 *
1926 * if a block was replaced, level of the block + 1 is returned.
1927 * if no block got replaced, 0 is returned. if there are other
1928 * errors, a negative error number is returned.
1929 */
3fd0a558 1930static noinline_for_stack
3d0174f7 1931int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
3fd0a558
YZ
1932 struct btrfs_root *dest, struct btrfs_root *src,
1933 struct btrfs_path *path, struct btrfs_key *next_key,
1934 int lowest_level, int max_level)
5d4f98a2 1935{
0b246afa 1936 struct btrfs_fs_info *fs_info = dest->fs_info;
5d4f98a2
YZ
1937 struct extent_buffer *eb;
1938 struct extent_buffer *parent;
82fa113f 1939 struct btrfs_ref ref = { 0 };
5d4f98a2
YZ
1940 struct btrfs_key key;
1941 u64 old_bytenr;
1942 u64 new_bytenr;
1943 u64 old_ptr_gen;
1944 u64 new_ptr_gen;
1945 u64 last_snapshot;
1946 u32 blocksize;
3fd0a558 1947 int cow = 0;
5d4f98a2
YZ
1948 int level;
1949 int ret;
1950 int slot;
1951
1952 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1953 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
5d4f98a2
YZ
1954
1955 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
3fd0a558 1956again:
5d4f98a2
YZ
1957 slot = path->slots[lowest_level];
1958 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1959
1960 eb = btrfs_lock_root_node(dest);
8bead258 1961 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
1962 level = btrfs_header_level(eb);
1963
1964 if (level < lowest_level) {
1965 btrfs_tree_unlock(eb);
1966 free_extent_buffer(eb);
1967 return 0;
1968 }
1969
3fd0a558
YZ
1970 if (cow) {
1971 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1972 BUG_ON(ret);
1973 }
8bead258 1974 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
1975
1976 if (next_key) {
1977 next_key->objectid = (u64)-1;
1978 next_key->type = (u8)-1;
1979 next_key->offset = (u64)-1;
1980 }
1981
1982 parent = eb;
1983 while (1) {
581c1760
QW
1984 struct btrfs_key first_key;
1985
5d4f98a2
YZ
1986 level = btrfs_header_level(parent);
1987 BUG_ON(level < lowest_level);
1988
1989 ret = btrfs_bin_search(parent, &key, level, &slot);
cbca7d59
FM
1990 if (ret < 0)
1991 break;
5d4f98a2
YZ
1992 if (ret && slot > 0)
1993 slot--;
1994
1995 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1996 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1997
1998 old_bytenr = btrfs_node_blockptr(parent, slot);
0b246afa 1999 blocksize = fs_info->nodesize;
5d4f98a2 2000 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
17515f1b 2001 btrfs_node_key_to_cpu(parent, &first_key, slot);
5d4f98a2
YZ
2002
2003 if (level <= max_level) {
2004 eb = path->nodes[level];
2005 new_bytenr = btrfs_node_blockptr(eb,
2006 path->slots[level]);
2007 new_ptr_gen = btrfs_node_ptr_generation(eb,
2008 path->slots[level]);
2009 } else {
2010 new_bytenr = 0;
2011 new_ptr_gen = 0;
2012 }
2013
fae7f21c 2014 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
5d4f98a2
YZ
2015 ret = level;
2016 break;
2017 }
2018
2019 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
2020 memcmp_node_keys(parent, slot, path, level)) {
3fd0a558 2021 if (level <= lowest_level) {
5d4f98a2
YZ
2022 ret = 0;
2023 break;
2024 }
2025
581c1760
QW
2026 eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
2027 level - 1, &first_key);
64c043de
LB
2028 if (IS_ERR(eb)) {
2029 ret = PTR_ERR(eb);
264813ac 2030 break;
64c043de
LB
2031 } else if (!extent_buffer_uptodate(eb)) {
2032 ret = -EIO;
416bc658 2033 free_extent_buffer(eb);
379cde74 2034 break;
416bc658 2035 }
5d4f98a2 2036 btrfs_tree_lock(eb);
3fd0a558
YZ
2037 if (cow) {
2038 ret = btrfs_cow_block(trans, dest, eb, parent,
2039 slot, &eb);
2040 BUG_ON(ret);
5d4f98a2 2041 }
8bead258 2042 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
2043
2044 btrfs_tree_unlock(parent);
2045 free_extent_buffer(parent);
2046
2047 parent = eb;
2048 continue;
2049 }
2050
3fd0a558
YZ
2051 if (!cow) {
2052 btrfs_tree_unlock(parent);
2053 free_extent_buffer(parent);
2054 cow = 1;
2055 goto again;
2056 }
2057
5d4f98a2
YZ
2058 btrfs_node_key_to_cpu(path->nodes[level], &key,
2059 path->slots[level]);
b3b4aa74 2060 btrfs_release_path(path);
5d4f98a2
YZ
2061
2062 path->lowest_level = level;
2063 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
2064 path->lowest_level = 0;
2065 BUG_ON(ret);
2066
824d8dff
QW
2067 /*
 2068		 * Inform qgroup to trace both subtrees.
2069 *
2070 * We must trace both trees.
2071 * 1) Tree reloc subtree
2072 * If not traced, we will leak data numbers
2073 * 2) Fs subtree
2074 * If not traced, we will double count old data
f616f5cd
QW
2075 *
2076 * We don't scan the subtree right now, but only record
2077 * the swapped tree blocks.
2078 * The real subtree rescan is delayed until we have new
2079 * CoW on the subtree root node before transaction commit.
824d8dff 2080 */
370a11b8
QW
2081 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
2082 rc->block_group, parent, slot,
2083 path->nodes[level], path->slots[level],
2084 last_snapshot);
2085 if (ret < 0)
2086 break;
5d4f98a2
YZ
2087 /*
2088 * swap blocks in fs tree and reloc tree.
2089 */
2090 btrfs_set_node_blockptr(parent, slot, new_bytenr);
2091 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
2092 btrfs_mark_buffer_dirty(parent);
2093
2094 btrfs_set_node_blockptr(path->nodes[level],
2095 path->slots[level], old_bytenr);
2096 btrfs_set_node_ptr_generation(path->nodes[level],
2097 path->slots[level], old_ptr_gen);
2098 btrfs_mark_buffer_dirty(path->nodes[level]);
2099
82fa113f
QW
2100 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
2101 blocksize, path->nodes[level]->start);
2102 ref.skip_qgroup = true;
2103 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2104 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2 2105 BUG_ON(ret);
82fa113f
QW
2106 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
2107 blocksize, 0);
2108 ref.skip_qgroup = true;
2109 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2110 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
2111 BUG_ON(ret);
2112
ffd4bb2a
QW
2113 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
2114 blocksize, path->nodes[level]->start);
2115 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
2116 ref.skip_qgroup = true;
2117 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
2118 BUG_ON(ret);
2119
ffd4bb2a
QW
2120 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
2121 blocksize, 0);
2122 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
2123 ref.skip_qgroup = true;
2124 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
2125 BUG_ON(ret);
2126
2127 btrfs_unlock_up_safe(path, 0);
2128
2129 ret = level;
2130 break;
2131 }
2132 btrfs_tree_unlock(parent);
2133 free_extent_buffer(parent);
2134 return ret;
2135}
2136
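/*
 * Sketch of the swap done by replace_path() (illustration only):
 *
 *	                     before          after
 *	fs tree slot     ->  old_bytenr      new_bytenr
 *	reloc tree slot  ->  new_bytenr      old_bytenr
 *
 * Four delayed ref updates keep the extent tree consistent: each tree adds a
 * ref on the block it now points to and drops the ref on the block it gave
 * away.  Qgroup accounting is skipped here (skip_qgroup) because the swapped
 * subtrees are recorded via btrfs_qgroup_add_swapped_blocks() and rescanned
 * later.
 */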
2137/*
2138 * helper to find next relocated block in reloc tree
2139 */
2140static noinline_for_stack
2141int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2142 int *level)
2143{
2144 struct extent_buffer *eb;
2145 int i;
2146 u64 last_snapshot;
2147 u32 nritems;
2148
2149 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2150
2151 for (i = 0; i < *level; i++) {
2152 free_extent_buffer(path->nodes[i]);
2153 path->nodes[i] = NULL;
2154 }
2155
2156 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
2157 eb = path->nodes[i];
2158 nritems = btrfs_header_nritems(eb);
2159 while (path->slots[i] + 1 < nritems) {
2160 path->slots[i]++;
2161 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2162 last_snapshot)
2163 continue;
2164
2165 *level = i;
2166 return 0;
2167 }
2168 free_extent_buffer(path->nodes[i]);
2169 path->nodes[i] = NULL;
2170 }
2171 return 1;
2172}
2173
2174/*
2175 * walk down reloc tree to find relocated block of lowest level
2176 */
2177static noinline_for_stack
2178int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2179 int *level)
2180{
2ff7e61e 2181 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
2182 struct extent_buffer *eb = NULL;
2183 int i;
2184 u64 bytenr;
2185 u64 ptr_gen = 0;
2186 u64 last_snapshot;
5d4f98a2
YZ
2187 u32 nritems;
2188
2189 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2190
2191 for (i = *level; i > 0; i--) {
581c1760
QW
2192 struct btrfs_key first_key;
2193
5d4f98a2
YZ
2194 eb = path->nodes[i];
2195 nritems = btrfs_header_nritems(eb);
2196 while (path->slots[i] < nritems) {
2197 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2198 if (ptr_gen > last_snapshot)
2199 break;
2200 path->slots[i]++;
2201 }
2202 if (path->slots[i] >= nritems) {
2203 if (i == *level)
2204 break;
2205 *level = i + 1;
2206 return 0;
2207 }
2208 if (i == 1) {
2209 *level = i;
2210 return 0;
2211 }
2212
2213 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
581c1760
QW
2214 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2215 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2216 &first_key);
64c043de
LB
2217 if (IS_ERR(eb)) {
2218 return PTR_ERR(eb);
2219 } else if (!extent_buffer_uptodate(eb)) {
416bc658
JB
2220 free_extent_buffer(eb);
2221 return -EIO;
2222 }
5d4f98a2
YZ
2223 BUG_ON(btrfs_header_level(eb) != i - 1);
2224 path->nodes[i - 1] = eb;
2225 path->slots[i - 1] = 0;
2226 }
2227 return 1;
2228}
2229
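/*
 * walk_up_reloc_tree() and walk_down_reloc_tree() together form an iterative
 * depth-first walk of the reloc tree that visits only pointers whose
 * generation is newer than the source root's last_snapshot, i.e. only the
 * blocks that were actually COWed into the reloc tree and therefore need to
 * be merged back.
 */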
2230/*
 2231 * invalidate extent cache for file extents whose key is in the range
2232 * [min_key, max_key)
2233 */
2234static int invalidate_extent_cache(struct btrfs_root *root,
2235 struct btrfs_key *min_key,
2236 struct btrfs_key *max_key)
2237{
0b246afa 2238 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
2239 struct inode *inode = NULL;
2240 u64 objectid;
2241 u64 start, end;
33345d01 2242 u64 ino;
5d4f98a2
YZ
2243
2244 objectid = min_key->objectid;
2245 while (1) {
2246 cond_resched();
2247 iput(inode);
2248
2249 if (objectid > max_key->objectid)
2250 break;
2251
2252 inode = find_next_inode(root, objectid);
2253 if (!inode)
2254 break;
4a0cc7ca 2255 ino = btrfs_ino(BTRFS_I(inode));
5d4f98a2 2256
33345d01 2257 if (ino > max_key->objectid) {
5d4f98a2
YZ
2258 iput(inode);
2259 break;
2260 }
2261
33345d01 2262 objectid = ino + 1;
5d4f98a2
YZ
2263 if (!S_ISREG(inode->i_mode))
2264 continue;
2265
33345d01 2266 if (unlikely(min_key->objectid == ino)) {
5d4f98a2
YZ
2267 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2268 continue;
2269 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2270 start = 0;
2271 else {
2272 start = min_key->offset;
0b246afa 2273 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
5d4f98a2
YZ
2274 }
2275 } else {
2276 start = 0;
2277 }
2278
33345d01 2279 if (unlikely(max_key->objectid == ino)) {
5d4f98a2
YZ
2280 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2281 continue;
2282 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2283 end = (u64)-1;
2284 } else {
2285 if (max_key->offset == 0)
2286 continue;
2287 end = max_key->offset;
0b246afa 2288 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
2289 end--;
2290 }
2291 } else {
2292 end = (u64)-1;
2293 }
2294
2295 /* the lock_extent waits for readpage to complete */
d0082371 2296 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
dcdbc059 2297 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
d0082371 2298 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
5d4f98a2
YZ
2299 }
2300 return 0;
2301}
2302
2303static int find_next_key(struct btrfs_path *path, int level,
2304 struct btrfs_key *key)
2305
2306{
2307 while (level < BTRFS_MAX_LEVEL) {
2308 if (!path->nodes[level])
2309 break;
2310 if (path->slots[level] + 1 <
2311 btrfs_header_nritems(path->nodes[level])) {
2312 btrfs_node_key_to_cpu(path->nodes[level], key,
2313 path->slots[level] + 1);
2314 return 0;
2315 }
2316 level++;
2317 }
2318 return 1;
2319}
2320
d2311e69
QW
2321/*
2322 * Insert current subvolume into reloc_control::dirty_subvol_roots
2323 */
2324static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2325 struct reloc_control *rc,
2326 struct btrfs_root *root)
2327{
2328 struct btrfs_root *reloc_root = root->reloc_root;
2329 struct btrfs_root_item *reloc_root_item;
2330
2331 /* @root must be a subvolume tree root with a valid reloc tree */
2332 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2333 ASSERT(reloc_root);
2334
2335 reloc_root_item = &reloc_root->root_item;
2336 memset(&reloc_root_item->drop_progress, 0,
2337 sizeof(reloc_root_item->drop_progress));
2338 reloc_root_item->drop_level = 0;
2339 btrfs_set_root_refs(reloc_root_item, 0);
2340 btrfs_update_reloc_root(trans, root);
2341
2342 if (list_empty(&root->reloc_dirty_list)) {
00246528 2343 btrfs_grab_root(root);
d2311e69
QW
2344 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2345 }
2346}
2347
2348static int clean_dirty_subvols(struct reloc_control *rc)
2349{
2350 struct btrfs_root *root;
2351 struct btrfs_root *next;
2352 int ret = 0;
30d40577 2353 int ret2;
d2311e69
QW
2354
2355 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2356 reloc_dirty_list) {
30d40577
QW
2357 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2358 /* Merged subvolume, cleanup its reloc root */
2359 struct btrfs_root *reloc_root = root->reloc_root;
d2311e69 2360
30d40577
QW
2361 list_del_init(&root->reloc_dirty_list);
2362 root->reloc_root = NULL;
6282675e
QW
2363 /*
2364 * Need barrier to ensure clear_bit() only happens after
2365 * root->reloc_root = NULL. Pairs with have_reloc_root.
2366 */
2367 smp_wmb();
1fac4a54 2368 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
f28de8d8 2369 if (reloc_root) {
f44deb74
JB
2370 /*
 2371				 * btrfs_drop_snapshot drops the ref we hold on
 2372				 * ->reloc_root. If it fails, however, we must
2373 * drop the ref ourselves.
2374 */
f28de8d8 2375 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
f44deb74
JB
2376 if (ret2 < 0) {
2377 btrfs_put_root(reloc_root);
2378 if (!ret)
2379 ret = ret2;
2380 }
f28de8d8 2381 }
00246528 2382 btrfs_put_root(root);
30d40577
QW
2383 } else {
2384 /* Orphan reloc tree, just clean it up */
0078a9f9 2385 ret2 = btrfs_drop_snapshot(root, 0, 1);
f44deb74
JB
2386 if (ret2 < 0) {
2387 btrfs_put_root(root);
2388 if (!ret)
2389 ret = ret2;
2390 }
d2311e69 2391 }
d2311e69
QW
2392 }
2393 return ret;
2394}
2395
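/*
 * rc->dirty_subvol_roots holds two kinds of entries: subvolume roots whose
 * reloc tree has been merged (their reloc root is dropped above via
 * btrfs_drop_snapshot()), and orphan reloc trees queued directly by
 * merge_reloc_roots(), which are simply dropped.
 */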
5d4f98a2
YZ
2396/*
2397 * merge the relocated tree blocks in reloc tree with corresponding
2398 * fs tree.
2399 */
2400static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2401 struct btrfs_root *root)
2402{
0b246afa 2403 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
2404 struct btrfs_key key;
2405 struct btrfs_key next_key;
9e6a0c52 2406 struct btrfs_trans_handle *trans = NULL;
5d4f98a2
YZ
2407 struct btrfs_root *reloc_root;
2408 struct btrfs_root_item *root_item;
2409 struct btrfs_path *path;
3fd0a558 2410 struct extent_buffer *leaf;
5d4f98a2
YZ
2411 int level;
2412 int max_level;
2413 int replaced = 0;
2414 int ret;
2415 int err = 0;
3fd0a558 2416 u32 min_reserved;
5d4f98a2
YZ
2417
2418 path = btrfs_alloc_path();
2419 if (!path)
2420 return -ENOMEM;
e4058b54 2421 path->reada = READA_FORWARD;
5d4f98a2
YZ
2422
2423 reloc_root = root->reloc_root;
2424 root_item = &reloc_root->root_item;
2425
2426 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2427 level = btrfs_root_level(root_item);
67439dad 2428 atomic_inc(&reloc_root->node->refs);
5d4f98a2
YZ
2429 path->nodes[level] = reloc_root->node;
2430 path->slots[level] = 0;
2431 } else {
2432 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2433
2434 level = root_item->drop_level;
2435 BUG_ON(level == 0);
2436 path->lowest_level = level;
2437 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
33c66f43 2438 path->lowest_level = 0;
5d4f98a2
YZ
2439 if (ret < 0) {
2440 btrfs_free_path(path);
2441 return ret;
2442 }
2443
2444 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2445 path->slots[level]);
2446 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2447
2448 btrfs_unlock_up_safe(path, 0);
2449 }
2450
0b246afa 2451 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
3fd0a558 2452 memset(&next_key, 0, sizeof(next_key));
5d4f98a2 2453
3fd0a558 2454 while (1) {
08e007d2
MX
2455 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2456 BTRFS_RESERVE_FLUSH_ALL);
3fd0a558 2457 if (ret) {
9e6a0c52
JB
2458 err = ret;
2459 goto out;
5d4f98a2 2460 }
9e6a0c52
JB
2461 trans = btrfs_start_transaction(root, 0);
2462 if (IS_ERR(trans)) {
2463 err = PTR_ERR(trans);
2464 trans = NULL;
2465 goto out;
2466 }
2abc726a
JB
2467
2468 /*
2469 * At this point we no longer have a reloc_control, so we can't
2470 * depend on btrfs_init_reloc_root to update our last_trans.
2471 *
2472 * But that's ok, we started the trans handle on our
2473 * corresponding fs_root, which means it's been added to the
2474 * dirty list. At commit time we'll still call
2475 * btrfs_update_reloc_root() and update our root item
2476 * appropriately.
2477 */
2478 reloc_root->last_trans = trans->transid;
9e6a0c52 2479 trans->block_rsv = rc->block_rsv;
5d4f98a2 2480
5d4f98a2 2481 replaced = 0;
5d4f98a2
YZ
2482 max_level = level;
2483
2484 ret = walk_down_reloc_tree(reloc_root, path, &level);
2485 if (ret < 0) {
2486 err = ret;
2487 goto out;
2488 }
2489 if (ret > 0)
2490 break;
2491
2492 if (!find_next_key(path, level, &key) &&
2493 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2494 ret = 0;
5d4f98a2 2495 } else {
3d0174f7 2496 ret = replace_path(trans, rc, root, reloc_root, path,
3fd0a558 2497 &next_key, level, max_level);
5d4f98a2
YZ
2498 }
2499 if (ret < 0) {
2500 err = ret;
2501 goto out;
2502 }
2503
2504 if (ret > 0) {
2505 level = ret;
2506 btrfs_node_key_to_cpu(path->nodes[level], &key,
2507 path->slots[level]);
2508 replaced = 1;
5d4f98a2
YZ
2509 }
2510
2511 ret = walk_up_reloc_tree(reloc_root, path, &level);
2512 if (ret > 0)
2513 break;
2514
2515 BUG_ON(level == 0);
2516 /*
2517 * save the merging progress in the drop_progress.
2518 * this is OK since root refs == 1 in this case.
2519 */
2520 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2521 path->slots[level]);
2522 root_item->drop_level = level;
2523
3a45bb20 2524 btrfs_end_transaction_throttle(trans);
9e6a0c52 2525 trans = NULL;
5d4f98a2 2526
2ff7e61e 2527 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
2528
2529 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2530 invalidate_extent_cache(root, &key, &next_key);
2531 }
2532
2533 /*
 2534	 * handle the case where only one block in the fs tree needs to be
 2535	 * relocated and the block is the tree root.
2536 */
2537 leaf = btrfs_lock_root_node(root);
2538 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2539 btrfs_tree_unlock(leaf);
2540 free_extent_buffer(leaf);
2541 if (ret < 0)
2542 err = ret;
2543out:
2544 btrfs_free_path(path);
2545
d2311e69
QW
2546 if (err == 0)
2547 insert_dirty_subvol(trans, rc, root);
5d4f98a2 2548
9e6a0c52 2549 if (trans)
3a45bb20 2550 btrfs_end_transaction_throttle(trans);
5d4f98a2 2551
2ff7e61e 2552 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 2553
5d4f98a2
YZ
2554 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2555 invalidate_extent_cache(root, &key, &next_key);
2556
2557 return err;
2558}
2559
3fd0a558
YZ
2560static noinline_for_stack
2561int prepare_to_merge(struct reloc_control *rc, int err)
5d4f98a2 2562{
3fd0a558 2563 struct btrfs_root *root = rc->extent_root;
0b246afa 2564 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 2565 struct btrfs_root *reloc_root;
3fd0a558
YZ
2566 struct btrfs_trans_handle *trans;
2567 LIST_HEAD(reloc_roots);
2568 u64 num_bytes = 0;
2569 int ret;
3fd0a558 2570
0b246afa
JM
2571 mutex_lock(&fs_info->reloc_mutex);
2572 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
3fd0a558 2573 rc->merging_rsv_size += rc->nodes_relocated * 2;
0b246afa 2574 mutex_unlock(&fs_info->reloc_mutex);
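	/*
	 * Rough sizing example (illustrative, assuming the common 16K nodesize
	 * and BTRFS_MAX_LEVEL == 8): the fixed part added above is
	 * 16K * (8 - 1) * 2 = 224K per merge pass, on top of twice the
	 * metadata already counted in nodes_relocated.
	 */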
7585717f 2575
3fd0a558
YZ
2576again:
2577 if (!err) {
2578 num_bytes = rc->merging_rsv_size;
08e007d2
MX
2579 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2580 BTRFS_RESERVE_FLUSH_ALL);
3fd0a558
YZ
2581 if (ret)
2582 err = ret;
2583 }
2584
7a7eaa40 2585 trans = btrfs_join_transaction(rc->extent_root);
3612b495
TI
2586 if (IS_ERR(trans)) {
2587 if (!err)
2ff7e61e 2588 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 2589 num_bytes, NULL);
3612b495
TI
2590 return PTR_ERR(trans);
2591 }
3fd0a558
YZ
2592
2593 if (!err) {
2594 if (num_bytes != rc->merging_rsv_size) {
3a45bb20 2595 btrfs_end_transaction(trans);
2ff7e61e 2596 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 2597 num_bytes, NULL);
3fd0a558
YZ
2598 goto again;
2599 }
2600 }
5d4f98a2 2601
3fd0a558
YZ
2602 rc->merge_reloc_tree = 1;
2603
2604 while (!list_empty(&rc->reloc_roots)) {
2605 reloc_root = list_entry(rc->reloc_roots.next,
2606 struct btrfs_root, root_list);
2607 list_del_init(&reloc_root->root_list);
5d4f98a2 2608
0b246afa 2609 root = read_fs_root(fs_info, reloc_root->root_key.offset);
5d4f98a2
YZ
2610 BUG_ON(IS_ERR(root));
2611 BUG_ON(root->reloc_root != reloc_root);
2612
3fd0a558
YZ
2613 /*
2614 * set reference count to 1, so btrfs_recover_relocation
 2615		 * knows it should resume merging
2616 */
2617 if (!err)
2618 btrfs_set_root_refs(&reloc_root->root_item, 1);
5d4f98a2 2619 btrfs_update_reloc_root(trans, root);
5d4f98a2 2620
3fd0a558 2621 list_add(&reloc_root->root_list, &reloc_roots);
00246528 2622 btrfs_put_root(root);
3fd0a558 2623 }
5d4f98a2 2624
3fd0a558 2625 list_splice(&reloc_roots, &rc->reloc_roots);
5d4f98a2 2626
3fd0a558 2627 if (!err)
3a45bb20 2628 btrfs_commit_transaction(trans);
3fd0a558 2629 else
3a45bb20 2630 btrfs_end_transaction(trans);
3fd0a558 2631 return err;
5d4f98a2
YZ
2632}
2633
aca1bba6
LB
2634static noinline_for_stack
2635void free_reloc_roots(struct list_head *list)
2636{
2637 struct btrfs_root *reloc_root;
2638
2639 while (!list_empty(list)) {
2640 reloc_root = list_entry(list->next, struct btrfs_root,
2641 root_list);
bb166d72 2642 __del_reloc_root(reloc_root);
aca1bba6
LB
2643 }
2644}
2645
3fd0a558 2646static noinline_for_stack
94404e82 2647void merge_reloc_roots(struct reloc_control *rc)
5d4f98a2 2648{
0b246afa 2649 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2 2650 struct btrfs_root *root;
3fd0a558
YZ
2651 struct btrfs_root *reloc_root;
2652 LIST_HEAD(reloc_roots);
2653 int found = 0;
aca1bba6 2654 int ret = 0;
3fd0a558
YZ
2655again:
2656 root = rc->extent_root;
7585717f
CM
2657
2658 /*
 2659	 * this serializes us with btrfs_record_root_in_transaction;
2660 * we have to make sure nobody is in the middle of
2661 * adding their roots to the list while we are
2662 * doing this splice
2663 */
0b246afa 2664 mutex_lock(&fs_info->reloc_mutex);
3fd0a558 2665 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 2666 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2 2667
3fd0a558
YZ
2668 while (!list_empty(&reloc_roots)) {
2669 found = 1;
2670 reloc_root = list_entry(reloc_roots.next,
2671 struct btrfs_root, root_list);
5d4f98a2 2672
3fd0a558 2673 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
0b246afa 2674 root = read_fs_root(fs_info,
3fd0a558
YZ
2675 reloc_root->root_key.offset);
2676 BUG_ON(IS_ERR(root));
2677 BUG_ON(root->reloc_root != reloc_root);
5d4f98a2 2678
3fd0a558 2679 ret = merge_reloc_root(rc, root);
00246528 2680 btrfs_put_root(root);
b37b39cd 2681 if (ret) {
25e293c2
WS
2682 if (list_empty(&reloc_root->root_list))
2683 list_add_tail(&reloc_root->root_list,
2684 &reloc_roots);
aca1bba6 2685 goto out;
b37b39cd 2686 }
3fd0a558
YZ
2687 } else {
2688 list_del_init(&reloc_root->root_list);
30d40577
QW
2689 /* Don't forget to queue this reloc root for cleanup */
2690 list_add_tail(&reloc_root->reloc_dirty_list,
2691 &rc->dirty_subvol_roots);
3fd0a558 2692 }
5d4f98a2
YZ
2693 }
2694
3fd0a558
YZ
2695 if (found) {
2696 found = 0;
2697 goto again;
2698 }
aca1bba6
LB
2699out:
2700 if (ret) {
0b246afa 2701 btrfs_handle_fs_error(fs_info, ret, NULL);
aca1bba6
LB
2702 if (!list_empty(&reloc_roots))
2703 free_reloc_roots(&reloc_roots);
467bb1d2
WS
2704
2705 /* new reloc root may be added */
0b246afa 2706 mutex_lock(&fs_info->reloc_mutex);
467bb1d2 2707 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 2708 mutex_unlock(&fs_info->reloc_mutex);
467bb1d2
WS
2709 if (!list_empty(&reloc_roots))
2710 free_reloc_roots(&reloc_roots);
aca1bba6
LB
2711 }
2712
7b7b7431
JB
2713 /*
2714 * We used to have
2715 *
2716 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2717 *
2718 * here, but it's wrong. If we fail to start the transaction in
2719 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2720 * have actually been removed from the reloc_root_tree rb tree. This is
2721 * fine because we're bailing here, and we hold a reference on the root
2722 * for the list that holds it, so these roots will be cleaned up when we
2723 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
2724 * will be cleaned up on unmount.
2725 *
2726 * The remaining nodes will be cleaned up by free_reloc_control.
2727 */
5d4f98a2
YZ
2728}
2729
2730static void free_block_list(struct rb_root *blocks)
2731{
2732 struct tree_block *block;
2733 struct rb_node *rb_node;
2734 while ((rb_node = rb_first(blocks))) {
2735 block = rb_entry(rb_node, struct tree_block, rb_node);
2736 rb_erase(rb_node, blocks);
2737 kfree(block);
2738 }
2739}
2740
2741static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *reloc_root)
2743{
0b246afa 2744 struct btrfs_fs_info *fs_info = reloc_root->fs_info;
5d4f98a2 2745 struct btrfs_root *root;
442b1ac5 2746 int ret;
5d4f98a2
YZ
2747
2748 if (reloc_root->last_trans == trans->transid)
2749 return 0;
2750
0b246afa 2751 root = read_fs_root(fs_info, reloc_root->root_key.offset);
5d4f98a2
YZ
2752 BUG_ON(IS_ERR(root));
2753 BUG_ON(root->reloc_root != reloc_root);
442b1ac5 2754 ret = btrfs_record_root_in_trans(trans, root);
00246528 2755 btrfs_put_root(root);
5d4f98a2 2756
442b1ac5 2757 return ret;
5d4f98a2
YZ
2758}
2759
3fd0a558
YZ
2760static noinline_for_stack
2761struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2762 struct reloc_control *rc,
a26195a5
QW
2763 struct btrfs_backref_node *node,
2764 struct btrfs_backref_edge *edges[])
5d4f98a2 2765{
a26195a5 2766 struct btrfs_backref_node *next;
5d4f98a2 2767 struct btrfs_root *root;
3fd0a558
YZ
2768 int index = 0;
2769
5d4f98a2
YZ
2770 next = node;
2771 while (1) {
2772 cond_resched();
2773 next = walk_up_backref(next, edges, &index);
2774 root = next->root;
3fd0a558 2775 BUG_ON(!root);
27cdeb70 2776 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
5d4f98a2
YZ
2777
2778 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2779 record_reloc_root_in_trans(trans, root);
2780 break;
2781 }
2782
3fd0a558
YZ
2783 btrfs_record_root_in_trans(trans, root);
2784 root = root->reloc_root;
2785
2786 if (next->new_bytenr != root->node->start) {
2787 BUG_ON(next->new_bytenr);
2788 BUG_ON(!list_empty(&next->list));
2789 next->new_bytenr = root->node->start;
00246528
JB
2790 btrfs_put_root(next->root);
2791 next->root = btrfs_grab_root(root);
0b530bc5 2792 ASSERT(next->root);
3fd0a558
YZ
2793 list_add_tail(&next->list,
2794 &rc->backref_cache.changed);
9569cc20 2795 mark_block_processed(rc, next);
5d4f98a2
YZ
2796 break;
2797 }
2798
3fd0a558 2799 WARN_ON(1);
5d4f98a2
YZ
2800 root = NULL;
2801 next = walk_down_backref(edges, &index);
2802 if (!next || next->level <= node->level)
2803 break;
2804 }
3fd0a558
YZ
2805 if (!root)
2806 return NULL;
5d4f98a2 2807
3fd0a558
YZ
2808 next = node;
2809 /* setup backref node path for btrfs_reloc_cow_block */
2810 while (1) {
2811 rc->backref_cache.path[next->level] = next;
2812 if (--index < 0)
2813 break;
2814 next = edges[index]->node[UPPER];
5d4f98a2 2815 }
5d4f98a2
YZ
2816 return root;
2817}
2818
3fd0a558
YZ
2819/*
2820 * select a tree root for relocation. return NULL if the block
2821 * is reference counted. we should use do_relocation() in this
2822 * case. return a tree root pointer if the block isn't reference
 2823 * counted. return -ENOENT if the block is the root of a reloc tree.
2824 */
5d4f98a2 2825static noinline_for_stack
a26195a5 2826struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
5d4f98a2 2827{
a26195a5 2828 struct btrfs_backref_node *next;
3fd0a558
YZ
2829 struct btrfs_root *root;
2830 struct btrfs_root *fs_root = NULL;
a26195a5 2831 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2832 int index = 0;
2833
2834 next = node;
2835 while (1) {
2836 cond_resched();
2837 next = walk_up_backref(next, edges, &index);
2838 root = next->root;
2839 BUG_ON(!root);
2840
25985edc 2841		/* no other choice for a non-reference counted tree */
27cdeb70 2842 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
3fd0a558
YZ
2843 return root;
2844
2845 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2846 fs_root = root;
2847
2848 if (next != node)
2849 return NULL;
2850
2851 next = walk_down_backref(edges, &index);
2852 if (!next || next->level <= node->level)
2853 break;
2854 }
2855
2856 if (!fs_root)
2857 return ERR_PTR(-ENOENT);
2858 return fs_root;
5d4f98a2
YZ
2859}
2860
2861static noinline_for_stack
3fd0a558 2862u64 calcu_metadata_size(struct reloc_control *rc,
a26195a5 2863 struct btrfs_backref_node *node, int reserve)
5d4f98a2 2864{
0b246afa 2865 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5
QW
2866 struct btrfs_backref_node *next = node;
2867 struct btrfs_backref_edge *edge;
2868 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2869 u64 num_bytes = 0;
2870 int index = 0;
2871
2872 BUG_ON(reserve && node->processed);
2873
2874 while (next) {
2875 cond_resched();
2876 while (1) {
2877 if (next->processed && (reserve || next != node))
2878 break;
2879
0b246afa 2880 num_bytes += fs_info->nodesize;
3fd0a558
YZ
2881
2882 if (list_empty(&next->upper))
2883 break;
2884
2885 edge = list_entry(next->upper.next,
a26195a5 2886 struct btrfs_backref_edge, list[LOWER]);
3fd0a558
YZ
2887 edges[index++] = edge;
2888 next = edge->node[UPPER];
2889 }
2890 next = walk_down_backref(edges, &index);
2891 }
2892 return num_bytes;
5d4f98a2
YZ
2893}
2894
3fd0a558
YZ
2895static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2896 struct reloc_control *rc,
a26195a5 2897 struct btrfs_backref_node *node)
5d4f98a2 2898{
3fd0a558 2899 struct btrfs_root *root = rc->extent_root;
da17066c 2900 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558
YZ
2901 u64 num_bytes;
2902 int ret;
0647bf56 2903 u64 tmp;
3fd0a558
YZ
2904
2905 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
5d4f98a2 2906
3fd0a558 2907 trans->block_rsv = rc->block_rsv;
0647bf56 2908 rc->reserved_bytes += num_bytes;
8ca17f0f
JB
2909
2910 /*
2911 * We are under a transaction here so we can only do limited flushing.
2912 * If we get an enospc just kick back -EAGAIN so we know to drop the
2913 * transaction and try to refill when we can flush all the things.
2914 */
0647bf56 2915 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
8ca17f0f 2916 BTRFS_RESERVE_FLUSH_LIMIT);
3fd0a558 2917 if (ret) {
da17066c 2918 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
8ca17f0f
JB
2919 while (tmp <= rc->reserved_bytes)
2920 tmp <<= 1;
2921 /*
2922 * only one thread can access block_rsv at this point,
2923 * so we don't need hold lock to protect block_rsv.
2924 * we expand more reservation size here to allow enough
52042d8e 2925 * space for relocation and we will return earlier in
8ca17f0f
JB
2926 * enospc case.
2927 */
da17066c
JM
2928 rc->block_rsv->size = tmp + fs_info->nodesize *
2929 RELOCATION_RESERVED_NODES;
8ca17f0f 2930 return -EAGAIN;
5d4f98a2 2931 }
3fd0a558 2932
3fd0a558
YZ
2933 return 0;
2934}
2935
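/*
 * When the limited flush above cannot satisfy the request, the reservation
 * target is grown before returning -EAGAIN: tmp starts at
 * nodesize * RELOCATION_RESERVED_NODES (4M with a 16K nodesize, illustrative
 * value), is doubled until it exceeds the bytes reserved so far, and the
 * block_rsv size is set to tmp plus one more such chunk, so the caller can
 * drop the transaction, refill the larger rsv and retry.
 */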
5d4f98a2
YZ
2936/*
 2937 * relocate a tree block, and then update pointers in upper level
2938 * blocks that reference the block to point to the new location.
2939 *
2940 * if called by link_to_upper, the block has already been relocated.
2941 * in that case this function just updates pointers.
2942 */
2943static int do_relocation(struct btrfs_trans_handle *trans,
3fd0a558 2944 struct reloc_control *rc,
a26195a5 2945 struct btrfs_backref_node *node,
5d4f98a2
YZ
2946 struct btrfs_key *key,
2947 struct btrfs_path *path, int lowest)
2948{
2ff7e61e 2949 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5
QW
2950 struct btrfs_backref_node *upper;
2951 struct btrfs_backref_edge *edge;
2952 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
2953 struct btrfs_root *root;
2954 struct extent_buffer *eb;
2955 u32 blocksize;
2956 u64 bytenr;
2957 u64 generation;
5d4f98a2
YZ
2958 int slot;
2959 int ret;
2960 int err = 0;
2961
2962 BUG_ON(lowest && node->eb);
2963
2964 path->lowest_level = node->level + 1;
3fd0a558 2965 rc->backref_cache.path[node->level] = node;
5d4f98a2 2966 list_for_each_entry(edge, &node->upper, list[LOWER]) {
581c1760 2967 struct btrfs_key first_key;
82fa113f 2968 struct btrfs_ref ref = { 0 };
581c1760 2969
5d4f98a2 2970 cond_resched();
5d4f98a2
YZ
2971
2972 upper = edge->node[UPPER];
dc4103f9 2973 root = select_reloc_root(trans, rc, upper, edges);
3fd0a558
YZ
2974 BUG_ON(!root);
2975
2976 if (upper->eb && !upper->locked) {
2977 if (!lowest) {
2978 ret = btrfs_bin_search(upper->eb, key,
2979 upper->level, &slot);
cbca7d59
FM
2980 if (ret < 0) {
2981 err = ret;
2982 goto next;
2983 }
3fd0a558
YZ
2984 BUG_ON(ret);
2985 bytenr = btrfs_node_blockptr(upper->eb, slot);
2986 if (node->eb->start == bytenr)
2987 goto next;
2988 }
5d4f98a2 2989 drop_node_buffer(upper);
3fd0a558 2990 }
5d4f98a2
YZ
2991
2992 if (!upper->eb) {
2993 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3561b9db
LB
2994 if (ret) {
2995 if (ret < 0)
2996 err = ret;
2997 else
2998 err = -ENOENT;
2999
3000 btrfs_release_path(path);
5d4f98a2
YZ
3001 break;
3002 }
5d4f98a2 3003
3fd0a558
YZ
3004 if (!upper->eb) {
3005 upper->eb = path->nodes[upper->level];
3006 path->nodes[upper->level] = NULL;
3007 } else {
3008 BUG_ON(upper->eb != path->nodes[upper->level]);
3009 }
5d4f98a2 3010
3fd0a558
YZ
3011 upper->locked = 1;
3012 path->locks[upper->level] = 0;
5d4f98a2 3013
3fd0a558 3014 slot = path->slots[upper->level];
b3b4aa74 3015 btrfs_release_path(path);
5d4f98a2
YZ
3016 } else {
3017 ret = btrfs_bin_search(upper->eb, key, upper->level,
3018 &slot);
cbca7d59
FM
3019 if (ret < 0) {
3020 err = ret;
3021 goto next;
3022 }
5d4f98a2
YZ
3023 BUG_ON(ret);
3024 }
3025
3026 bytenr = btrfs_node_blockptr(upper->eb, slot);
3fd0a558 3027 if (lowest) {
4547f4d8
LB
3028 if (bytenr != node->bytenr) {
3029 btrfs_err(root->fs_info,
3030 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
3031 bytenr, node->bytenr, slot,
3032 upper->eb->start);
3033 err = -EIO;
3034 goto next;
3035 }
5d4f98a2 3036 } else {
3fd0a558
YZ
3037 if (node->eb->start == bytenr)
3038 goto next;
5d4f98a2
YZ
3039 }
3040
da17066c 3041 blocksize = root->fs_info->nodesize;
5d4f98a2 3042 generation = btrfs_node_ptr_generation(upper->eb, slot);
581c1760
QW
3043 btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
3044 eb = read_tree_block(fs_info, bytenr, generation,
3045 upper->level - 1, &first_key);
64c043de
LB
3046 if (IS_ERR(eb)) {
3047 err = PTR_ERR(eb);
3048 goto next;
3049 } else if (!extent_buffer_uptodate(eb)) {
416bc658 3050 free_extent_buffer(eb);
97d9a8a4
TI
3051 err = -EIO;
3052 goto next;
3053 }
5d4f98a2 3054 btrfs_tree_lock(eb);
8bead258 3055 btrfs_set_lock_blocking_write(eb);
5d4f98a2
YZ
3056
3057 if (!node->eb) {
3058 ret = btrfs_cow_block(trans, root, eb, upper->eb,
3059 slot, &eb);
3fd0a558
YZ
3060 btrfs_tree_unlock(eb);
3061 free_extent_buffer(eb);
5d4f98a2
YZ
3062 if (ret < 0) {
3063 err = ret;
3fd0a558 3064 goto next;
5d4f98a2 3065 }
3fd0a558 3066 BUG_ON(node->eb != eb);
5d4f98a2
YZ
3067 } else {
3068 btrfs_set_node_blockptr(upper->eb, slot,
3069 node->eb->start);
3070 btrfs_set_node_ptr_generation(upper->eb, slot,
3071 trans->transid);
3072 btrfs_mark_buffer_dirty(upper->eb);
3073
82fa113f
QW
3074 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
3075 node->eb->start, blocksize,
3076 upper->eb->start);
3077 ref.real_root = root->root_key.objectid;
3078 btrfs_init_tree_ref(&ref, node->level,
3079 btrfs_header_owner(upper->eb));
3080 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
3081 BUG_ON(ret);
3082
3083 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
3084 BUG_ON(ret);
5d4f98a2 3085 }
3fd0a558
YZ
3086next:
3087 if (!upper->pending)
3088 drop_node_buffer(upper);
3089 else
3090 unlock_node_buffer(upper);
3091 if (err)
3092 break;
5d4f98a2 3093 }
3fd0a558
YZ
3094
3095 if (!err && node->pending) {
3096 drop_node_buffer(node);
3097 list_move_tail(&node->list, &rc->backref_cache.changed);
3098 node->pending = 0;
3099 }
3100
5d4f98a2 3101 path->lowest_level = 0;
3fd0a558 3102 BUG_ON(err == -ENOSPC);
5d4f98a2
YZ
3103 return err;
3104}
3105
3106static int link_to_upper(struct btrfs_trans_handle *trans,
3fd0a558 3107 struct reloc_control *rc,
a26195a5 3108 struct btrfs_backref_node *node,
5d4f98a2
YZ
3109 struct btrfs_path *path)
3110{
3111 struct btrfs_key key;
5d4f98a2
YZ
3112
3113 btrfs_node_key_to_cpu(node->eb, &key, 0);
3fd0a558 3114 return do_relocation(trans, rc, node, &key, path, 0);
5d4f98a2
YZ
3115}
3116
3117static int finish_pending_nodes(struct btrfs_trans_handle *trans,
3fd0a558
YZ
3118 struct reloc_control *rc,
3119 struct btrfs_path *path, int err)
5d4f98a2 3120{
3fd0a558 3121 LIST_HEAD(list);
a26195a5
QW
3122 struct btrfs_backref_cache *cache = &rc->backref_cache;
3123 struct btrfs_backref_node *node;
5d4f98a2
YZ
3124 int level;
3125 int ret;
5d4f98a2
YZ
3126
3127 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3128 while (!list_empty(&cache->pending[level])) {
3129 node = list_entry(cache->pending[level].next,
a26195a5 3130 struct btrfs_backref_node, list);
3fd0a558
YZ
3131 list_move_tail(&node->list, &list);
3132 BUG_ON(!node->pending);
5d4f98a2 3133
3fd0a558
YZ
3134 if (!err) {
3135 ret = link_to_upper(trans, rc, node, path);
3136 if (ret < 0)
3137 err = ret;
3138 }
5d4f98a2 3139 }
3fd0a558 3140 list_splice_init(&list, &cache->pending[level]);
5d4f98a2 3141 }
5d4f98a2
YZ
3142 return err;
3143}
3144
5d4f98a2
YZ
3145/*
 3146 * mark a block and all blocks that directly/indirectly reference the block
3147 * as processed.
3148 */
3149static void update_processed_blocks(struct reloc_control *rc,
a26195a5 3150 struct btrfs_backref_node *node)
5d4f98a2 3151{
a26195a5
QW
3152 struct btrfs_backref_node *next = node;
3153 struct btrfs_backref_edge *edge;
3154 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
3155 int index = 0;
3156
3157 while (next) {
3158 cond_resched();
3159 while (1) {
3160 if (next->processed)
3161 break;
3162
9569cc20 3163 mark_block_processed(rc, next);
5d4f98a2
YZ
3164
3165 if (list_empty(&next->upper))
3166 break;
3167
3168 edge = list_entry(next->upper.next,
a26195a5 3169 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
3170 edges[index++] = edge;
3171 next = edge->node[UPPER];
3172 }
3173 next = walk_down_backref(edges, &index);
3174 }
3175}
3176
7476dfda 3177static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
3fd0a558 3178{
da17066c 3179 u32 blocksize = rc->extent_root->fs_info->nodesize;
7476dfda 3180
3fd0a558
YZ
3181 if (test_range_bit(&rc->processed_blocks, bytenr,
3182 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
3183 return 1;
3184 return 0;
5d4f98a2
YZ
3185}
3186
2ff7e61e 3187static int get_tree_block_key(struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
3188 struct tree_block *block)
3189{
3190 struct extent_buffer *eb;
3191
581c1760
QW
3192 eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
3193 block->level, NULL);
64c043de
LB
3194 if (IS_ERR(eb)) {
3195 return PTR_ERR(eb);
3196 } else if (!extent_buffer_uptodate(eb)) {
416bc658
JB
3197 free_extent_buffer(eb);
3198 return -EIO;
3199 }
5d4f98a2
YZ
3200 if (block->level == 0)
3201 btrfs_item_key_to_cpu(eb, &block->key, 0);
3202 else
3203 btrfs_node_key_to_cpu(eb, &block->key, 0);
3204 free_extent_buffer(eb);
3205 block->key_ready = 1;
3206 return 0;
3207}
3208
5d4f98a2
YZ
3209/*
3210 * helper function to relocate a tree block
3211 */
3212static int relocate_tree_block(struct btrfs_trans_handle *trans,
3213 struct reloc_control *rc,
a26195a5 3214 struct btrfs_backref_node *node,
5d4f98a2
YZ
3215 struct btrfs_key *key,
3216 struct btrfs_path *path)
3217{
3218 struct btrfs_root *root;
3fd0a558
YZ
3219 int ret = 0;
3220
3221 if (!node)
3222 return 0;
5d4f98a2 3223
5f6b2e5c
JB
3224 /*
3225 * If we fail here we want to drop our backref_node because we are going
3226 * to start over and regenerate the tree for it.
3227 */
3228 ret = reserve_metadata_space(trans, rc, node);
3229 if (ret)
3230 goto out;
3231
3fd0a558 3232 BUG_ON(node->processed);
147d256e 3233 root = select_one_root(node);
3fd0a558 3234 if (root == ERR_PTR(-ENOENT)) {
5d4f98a2 3235 update_processed_blocks(rc, node);
3fd0a558 3236 goto out;
5d4f98a2
YZ
3237 }
3238
3fd0a558 3239 if (root) {
27cdeb70 3240 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3fd0a558
YZ
3241 BUG_ON(node->new_bytenr);
3242 BUG_ON(!list_empty(&node->list));
3243 btrfs_record_root_in_trans(trans, root);
3244 root = root->reloc_root;
3245 node->new_bytenr = root->node->start;
00246528
JB
3246 btrfs_put_root(node->root);
3247 node->root = btrfs_grab_root(root);
0b530bc5 3248 ASSERT(node->root);
3fd0a558
YZ
3249 list_add_tail(&node->list, &rc->backref_cache.changed);
3250 } else {
3251 path->lowest_level = node->level;
3252 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
b3b4aa74 3253 btrfs_release_path(path);
3fd0a558
YZ
3254 if (ret > 0)
3255 ret = 0;
3256 }
3257 if (!ret)
3258 update_processed_blocks(rc, node);
3259 } else {
3260 ret = do_relocation(trans, rc, node, key, path, 1);
3261 }
5d4f98a2 3262out:
0647bf56 3263 if (ret || node->level == 0 || node->cowonly)
3fd0a558 3264 remove_backref_node(&rc->backref_cache, node);
5d4f98a2
YZ
3265 return ret;
3266}
3267
3268/*
3269 * relocate a list of blocks
3270 */
3271static noinline_for_stack
3272int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3273 struct reloc_control *rc, struct rb_root *blocks)
3274{
2ff7e61e 3275 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5 3276 struct btrfs_backref_node *node;
5d4f98a2
YZ
3277 struct btrfs_path *path;
3278 struct tree_block *block;
98ff7b94 3279 struct tree_block *next;
5d4f98a2
YZ
3280 int ret;
3281 int err = 0;
3282
3283 path = btrfs_alloc_path();
e1a12670
LB
3284 if (!path) {
3285 err = -ENOMEM;
34c2b290 3286 goto out_free_blocks;
e1a12670 3287 }
5d4f98a2 3288
98ff7b94
QW
3289 /* Kick in readahead for tree blocks with missing keys */
3290 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
5d4f98a2 3291 if (!block->key_ready)
2ff7e61e 3292 readahead_tree_block(fs_info, block->bytenr);
5d4f98a2
YZ
3293 }
3294
98ff7b94
QW
3295 /* Get first keys */
3296 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
34c2b290 3297 if (!block->key_ready) {
2ff7e61e 3298 err = get_tree_block_key(fs_info, block);
34c2b290
DS
3299 if (err)
3300 goto out_free_path;
3301 }
5d4f98a2
YZ
3302 }
3303
98ff7b94
QW
3304 /* Do tree relocation */
3305 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3fd0a558 3306 node = build_backref_tree(rc, &block->key,
5d4f98a2
YZ
3307 block->level, block->bytenr);
3308 if (IS_ERR(node)) {
3309 err = PTR_ERR(node);
3310 goto out;
3311 }
3312
3313 ret = relocate_tree_block(trans, rc, node, &block->key,
3314 path);
3315 if (ret < 0) {
50dbbb71
JB
3316 err = ret;
3317 break;
5d4f98a2 3318 }
5d4f98a2 3319 }
5d4f98a2 3320out:
3fd0a558 3321 err = finish_pending_nodes(trans, rc, path, err);
5d4f98a2 3322
34c2b290 3323out_free_path:
5d4f98a2 3324 btrfs_free_path(path);
34c2b290 3325out_free_blocks:
e1a12670 3326 free_block_list(blocks);
5d4f98a2
YZ
3327 return err;
3328}
3329
efa56464
YZ
3330static noinline_for_stack
3331int prealloc_file_extent_cluster(struct inode *inode,
3332 struct file_extent_cluster *cluster)
3333{
3334 u64 alloc_hint = 0;
3335 u64 start;
3336 u64 end;
3337 u64 offset = BTRFS_I(inode)->index_cnt;
3338 u64 num_bytes;
3339 int nr = 0;
3340 int ret = 0;
dcb40c19
WX
3341 u64 prealloc_start = cluster->start - offset;
3342 u64 prealloc_end = cluster->end - offset;
18513091 3343 u64 cur_offset;
364ecf36 3344 struct extent_changeset *data_reserved = NULL;
efa56464
YZ
3345
3346 BUG_ON(cluster->start != cluster->boundary[0]);
5955102c 3347 inode_lock(inode);
efa56464 3348
364ecf36 3349 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
dcb40c19 3350 prealloc_end + 1 - prealloc_start);
efa56464
YZ
3351 if (ret)
3352 goto out;
3353
18513091 3354 cur_offset = prealloc_start;
efa56464
YZ
3355 while (nr < cluster->nr) {
3356 start = cluster->boundary[nr] - offset;
3357 if (nr + 1 < cluster->nr)
3358 end = cluster->boundary[nr + 1] - 1 - offset;
3359 else
3360 end = cluster->end - offset;
3361
d0082371 3362 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
efa56464 3363 num_bytes = end + 1 - start;
18513091 3364 if (cur_offset < start)
bc42bda2
QW
3365 btrfs_free_reserved_data_space(inode, data_reserved,
3366 cur_offset, start - cur_offset);
efa56464
YZ
3367 ret = btrfs_prealloc_file_range(inode, 0, start,
3368 num_bytes, num_bytes,
3369 end + 1, &alloc_hint);
18513091 3370 cur_offset = end + 1;
d0082371 3371 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
efa56464
YZ
3372 if (ret)
3373 break;
3374 nr++;
3375 }
18513091 3376 if (cur_offset < prealloc_end)
bc42bda2
QW
3377 btrfs_free_reserved_data_space(inode, data_reserved,
3378 cur_offset, prealloc_end + 1 - cur_offset);
efa56464 3379out:
5955102c 3380 inode_unlock(inode);
364ecf36 3381 extent_changeset_free(data_reserved);
efa56464
YZ
3382 return ret;
3383}
3384
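/*
 * prealloc_file_extent_cluster() reserves data space for the whole cluster
 * and preallocates one range per cluster boundary in the data reloc inode,
 * so the copy performed later by relocate_file_extent_cluster() already has
 * space allocated outside the (now read-only) block group being relocated.
 */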
5d4f98a2 3385static noinline_for_stack
0257bb82
YZ
3386int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3387 u64 block_start)
3388{
0257bb82
YZ
3389 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3390 struct extent_map *em;
3391 int ret = 0;
3392
172ddd60 3393 em = alloc_extent_map();
0257bb82
YZ
3394 if (!em)
3395 return -ENOMEM;
3396
3397 em->start = start;
3398 em->len = end + 1 - start;
3399 em->block_len = em->len;
3400 em->block_start = block_start;
0257bb82
YZ
3401 set_bit(EXTENT_FLAG_PINNED, &em->flags);
3402
d0082371 3403 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
3404 while (1) {
3405 write_lock(&em_tree->lock);
09a2a8f9 3406 ret = add_extent_mapping(em_tree, em, 0);
0257bb82
YZ
3407 write_unlock(&em_tree->lock);
3408 if (ret != -EEXIST) {
3409 free_extent_map(em);
3410 break;
3411 }
dcdbc059 3412 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
0257bb82 3413 }
d0082371 3414 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
3415 return ret;
3416}
3417
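/*
 * The pinned extent map inserted above covers the whole cluster and maps the
 * data reloc inode's file offsets back to the old logical address, so the
 * readpage calls in relocate_file_extent_cluster() pull the data from its
 * current location; the dirtied pages are then written back into the space
 * preallocated at the new location.
 */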
726a3421
QW
3418/*
3419 * Allow error injection to test balance cancellation
3420 */
3421int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
3422{
3423 return atomic_read(&fs_info->balance_cancel_req);
3424}
3425ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
3426
0257bb82
YZ
3427static int relocate_file_extent_cluster(struct inode *inode,
3428 struct file_extent_cluster *cluster)
5d4f98a2 3429{
2ff7e61e 3430 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5d4f98a2
YZ
3431 u64 page_start;
3432 u64 page_end;
0257bb82
YZ
3433 u64 offset = BTRFS_I(inode)->index_cnt;
3434 unsigned long index;
5d4f98a2 3435 unsigned long last_index;
5d4f98a2
YZ
3436 struct page *page;
3437 struct file_ra_state *ra;
3b16a4e3 3438 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
0257bb82 3439 int nr = 0;
5d4f98a2
YZ
3440 int ret = 0;
3441
0257bb82
YZ
3442 if (!cluster->nr)
3443 return 0;
3444
5d4f98a2
YZ
3445 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3446 if (!ra)
3447 return -ENOMEM;
3448
efa56464
YZ
3449 ret = prealloc_file_extent_cluster(inode, cluster);
3450 if (ret)
3451 goto out;
0257bb82 3452
efa56464 3453 file_ra_state_init(ra, inode->i_mapping);
5d4f98a2 3454
0257bb82
YZ
3455 ret = setup_extent_mapping(inode, cluster->start - offset,
3456 cluster->end - offset, cluster->start);
5d4f98a2 3457 if (ret)
efa56464 3458 goto out;
5d4f98a2 3459
09cbfeaf
KS
3460 index = (cluster->start - offset) >> PAGE_SHIFT;
3461 last_index = (cluster->end - offset) >> PAGE_SHIFT;
0257bb82 3462 while (index <= last_index) {
9f3db423
NB
3463 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3464 PAGE_SIZE);
efa56464
YZ
3465 if (ret)
3466 goto out;
3467
0257bb82 3468 page = find_lock_page(inode->i_mapping, index);
5d4f98a2 3469 if (!page) {
0257bb82
YZ
3470 page_cache_sync_readahead(inode->i_mapping,
3471 ra, NULL, index,
3472 last_index + 1 - index);
a94733d0 3473 page = find_or_create_page(inode->i_mapping, index,
3b16a4e3 3474 mask);
0257bb82 3475 if (!page) {
691fa059 3476 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3477 PAGE_SIZE, true);
44db1216 3478 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3479 PAGE_SIZE);
0257bb82 3480 ret = -ENOMEM;
efa56464 3481 goto out;
0257bb82 3482 }
5d4f98a2 3483 }
0257bb82
YZ
3484
3485 if (PageReadahead(page)) {
3486 page_cache_async_readahead(inode->i_mapping,
3487 ra, NULL, page, index,
3488 last_index + 1 - index);
3489 }
3490
5d4f98a2
YZ
3491 if (!PageUptodate(page)) {
3492 btrfs_readpage(NULL, page);
3493 lock_page(page);
3494 if (!PageUptodate(page)) {
3495 unlock_page(page);
09cbfeaf 3496 put_page(page);
691fa059 3497 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3498 PAGE_SIZE, true);
8b62f87b 3499 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3500 PAGE_SIZE);
5d4f98a2 3501 ret = -EIO;
efa56464 3502 goto out;
5d4f98a2
YZ
3503 }
3504 }
5d4f98a2 3505
4eee4fa4 3506 page_start = page_offset(page);
09cbfeaf 3507 page_end = page_start + PAGE_SIZE - 1;
0257bb82 3508
d0082371 3509 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
0257bb82 3510
5d4f98a2
YZ
3511 set_page_extent_mapped(page);
3512
0257bb82
YZ
3513 if (nr < cluster->nr &&
3514 page_start + offset == cluster->boundary[nr]) {
3515 set_extent_bits(&BTRFS_I(inode)->io_tree,
3516 page_start, page_end,
ceeb0ae7 3517 EXTENT_BOUNDARY);
0257bb82
YZ
3518 nr++;
3519 }
5d4f98a2 3520
765f3ceb 3521 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
330a5827 3522 NULL);
765f3ceb
NB
3523 if (ret) {
3524 unlock_page(page);
3525 put_page(page);
3526 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 3527 PAGE_SIZE, true);
765f3ceb 3528 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 3529 PAGE_SIZE);
765f3ceb
NB
3530
3531 clear_extent_bits(&BTRFS_I(inode)->io_tree,
3532 page_start, page_end,
3533 EXTENT_LOCKED | EXTENT_BOUNDARY);
3534 goto out;
3535
3536 }
5d4f98a2 3537 set_page_dirty(page);
5d4f98a2 3538
0257bb82 3539 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 3540 page_start, page_end);
5d4f98a2 3541 unlock_page(page);
09cbfeaf 3542 put_page(page);
0257bb82
YZ
3543
3544 index++;
8702ba93 3545 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
efa56464 3546 balance_dirty_pages_ratelimited(inode->i_mapping);
2ff7e61e 3547 btrfs_throttle(fs_info);
7f913c7c
QW
3548 if (btrfs_should_cancel_balance(fs_info)) {
3549 ret = -ECANCELED;
3550 goto out;
3551 }
5d4f98a2 3552 }
0257bb82 3553 WARN_ON(nr != cluster->nr);
efa56464 3554out:
5d4f98a2 3555 kfree(ra);
5d4f98a2
YZ
3556 return ret;
3557}
3558
3559static noinline_for_stack
0257bb82
YZ
3560int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3561 struct file_extent_cluster *cluster)
5d4f98a2 3562{
0257bb82 3563 int ret;
5d4f98a2 3564
0257bb82
YZ
3565 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3566 ret = relocate_file_extent_cluster(inode, cluster);
3567 if (ret)
3568 return ret;
3569 cluster->nr = 0;
5d4f98a2 3570 }
5d4f98a2 3571
0257bb82
YZ
3572 if (!cluster->nr)
3573 cluster->start = extent_key->objectid;
3574 else
3575 BUG_ON(cluster->nr >= MAX_EXTENTS);
3576 cluster->end = extent_key->objectid + extent_key->offset - 1;
3577 cluster->boundary[cluster->nr] = extent_key->objectid;
3578 cluster->nr++;
3579
3580 if (cluster->nr >= MAX_EXTENTS) {
3581 ret = relocate_file_extent_cluster(inode, cluster);
3582 if (ret)
3583 return ret;
3584 cluster->nr = 0;
3585 }
3586 return 0;
5d4f98a2
YZ
3587}
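/*
 * Editor's note, an illustrative sketch that is not part of the original
 * file: relocate_data_extent() batches contiguous data extents into the
 * file_extent_cluster.  A cluster is flushed through
 * relocate_file_extent_cluster() when the next extent is not adjacent
 * (objectid != cluster->end + 1) or when it already holds MAX_EXTENTS (128)
 * boundaries.  Roughly, the caller in relocate_block_group() does:
 *
 *	while (!find_next_extent(rc, path, &key))
 *		relocate_data_extent(rc->data_inode, &key, &rc->cluster);
 *	relocate_file_extent_cluster(rc->data_inode, &rc->cluster);
 *
 * (error handling and the MOVE_DATA_EXTENTS stage check omitted).
 */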
3588
5d4f98a2
YZ
3589/*
3590 * helper to add a tree block to the list.
3591 * the major work is getting the generation and level of the block
3592 */
3593static int add_tree_block(struct reloc_control *rc,
3594 struct btrfs_key *extent_key,
3595 struct btrfs_path *path,
3596 struct rb_root *blocks)
3597{
3598 struct extent_buffer *eb;
3599 struct btrfs_extent_item *ei;
3600 struct btrfs_tree_block_info *bi;
3601 struct tree_block *block;
3602 struct rb_node *rb_node;
3603 u32 item_size;
3604 int level = -1;
7fdf4b60 3605 u64 generation;
5d4f98a2
YZ
3606
3607 eb = path->nodes[0];
3608 item_size = btrfs_item_size_nr(eb, path->slots[0]);
3609
3173a18f
JB
3610 if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3611 item_size >= sizeof(*ei) + sizeof(*bi)) {
5d4f98a2
YZ
3612 ei = btrfs_item_ptr(eb, path->slots[0],
3613 struct btrfs_extent_item);
3173a18f
JB
3614 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3615 bi = (struct btrfs_tree_block_info *)(ei + 1);
3616 level = btrfs_tree_block_level(eb, bi);
3617 } else {
3618 level = (int)extent_key->offset;
3619 }
5d4f98a2 3620 generation = btrfs_extent_generation(eb, ei);
6d8ff4e4 3621 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
3622 btrfs_print_v0_err(eb->fs_info);
3623 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3624 return -EINVAL;
5d4f98a2 3625 } else {
5d4f98a2 3626 BUG();
5d4f98a2
YZ
3627 }
3628
b3b4aa74 3629 btrfs_release_path(path);
5d4f98a2
YZ
3630
3631 BUG_ON(level == -1);
3632
3633 block = kmalloc(sizeof(*block), GFP_NOFS);
3634 if (!block)
3635 return -ENOMEM;
3636
3637 block->bytenr = extent_key->objectid;
da17066c 3638 block->key.objectid = rc->extent_root->fs_info->nodesize;
5d4f98a2
YZ
3639 block->key.offset = generation;
3640 block->level = level;
3641 block->key_ready = 0;
3642
3643 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
43c04fb1
JM
3644 if (rb_node)
3645 backref_tree_panic(rb_node, -EEXIST, block->bytenr);
5d4f98a2
YZ
3646
3647 return 0;
3648}
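/*
 * Editor's note (not part of the original file): add_tree_block() accepts
 * both on-disk layouts for a tree block's extent item:
 *
 *	(bytenr, BTRFS_EXTENT_ITEM_KEY, nodesize)  level read from the
 *	                                           btrfs_tree_block_info
 *	(bytenr, BTRFS_METADATA_ITEM_KEY, level)   level encoded in the key
 *	                                           offset (SKINNY_METADATA)
 *
 * v0 extent items are rejected with -EINVAL.  Until the block is actually
 * read, block->key is reused to carry (nodesize, generation) and key_ready
 * stays 0.
 */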
3649
3650/*
3651 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3652 */
3653static int __add_tree_block(struct reloc_control *rc,
3654 u64 bytenr, u32 blocksize,
3655 struct rb_root *blocks)
3656{
0b246afa 3657 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3658 struct btrfs_path *path;
3659 struct btrfs_key key;
3660 int ret;
0b246afa 3661 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5d4f98a2 3662
7476dfda 3663 if (tree_block_processed(bytenr, rc))
5d4f98a2
YZ
3664 return 0;
3665
3666 if (tree_search(blocks, bytenr))
3667 return 0;
3668
3669 path = btrfs_alloc_path();
3670 if (!path)
3671 return -ENOMEM;
aee68ee5 3672again:
5d4f98a2 3673 key.objectid = bytenr;
aee68ee5
JB
3674 if (skinny) {
3675 key.type = BTRFS_METADATA_ITEM_KEY;
3676 key.offset = (u64)-1;
3677 } else {
3678 key.type = BTRFS_EXTENT_ITEM_KEY;
3679 key.offset = blocksize;
3680 }
5d4f98a2
YZ
3681
3682 path->search_commit_root = 1;
3683 path->skip_locking = 1;
3684 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3685 if (ret < 0)
3686 goto out;
5d4f98a2 3687
aee68ee5
JB
3688 if (ret > 0 && skinny) {
3689 if (path->slots[0]) {
3690 path->slots[0]--;
3691 btrfs_item_key_to_cpu(path->nodes[0], &key,
3692 path->slots[0]);
3693 if (key.objectid == bytenr &&
3694 (key.type == BTRFS_METADATA_ITEM_KEY ||
3695 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3696 key.offset == blocksize)))
3697 ret = 0;
3698 }
3699
3700 if (ret) {
3701 skinny = false;
3702 btrfs_release_path(path);
3703 goto again;
3704 }
3173a18f 3705 }
cdccee99
LB
3706 if (ret) {
3707 ASSERT(ret == 1);
3708 btrfs_print_leaf(path->nodes[0]);
3709 btrfs_err(fs_info,
3710 "tree block extent item (%llu) is not found in extent tree",
3711 bytenr);
3712 WARN_ON(1);
3713 ret = -EINVAL;
3714 goto out;
3715 }
3173a18f 3716
5d4f98a2
YZ
3717 ret = add_tree_block(rc, &key, path, blocks);
3718out:
3719 btrfs_free_path(path);
3720 return ret;
3721}
3722
0af3d00b 3723static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
32da5386 3724 struct btrfs_block_group *block_group,
1bbc621e
CM
3725 struct inode *inode,
3726 u64 ino)
0af3d00b
JB
3727{
3728 struct btrfs_key key;
0af3d00b
JB
3729 struct btrfs_root *root = fs_info->tree_root;
3730 struct btrfs_trans_handle *trans;
0af3d00b
JB
3731 int ret = 0;
3732
3733 if (inode)
3734 goto truncate;
3735
3736 key.objectid = ino;
3737 key.type = BTRFS_INODE_ITEM_KEY;
3738 key.offset = 0;
3739
4c66e0d4 3740 inode = btrfs_iget(fs_info->sb, &key, root);
2e19f1f9 3741 if (IS_ERR(inode))
0af3d00b 3742 return -ENOENT;
0af3d00b
JB
3743
3744truncate:
2ff7e61e 3745 ret = btrfs_check_trunc_cache_free_space(fs_info,
7b61cd92
MX
3746 &fs_info->global_block_rsv);
3747 if (ret)
3748 goto out;
3749
7a7eaa40 3750 trans = btrfs_join_transaction(root);
0af3d00b 3751 if (IS_ERR(trans)) {
3612b495 3752 ret = PTR_ERR(trans);
0af3d00b
JB
3753 goto out;
3754 }
3755
77ab86bf 3756 ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
0af3d00b 3757
3a45bb20 3758 btrfs_end_transaction(trans);
2ff7e61e 3759 btrfs_btree_balance_dirty(fs_info);
0af3d00b
JB
3760out:
3761 iput(inode);
3762 return ret;
3763}
3764
5d4f98a2 3765/*
19b546d7
QW
3766 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3767 * cache inode, to avoid free space cache data extent blocking data relocation.
5d4f98a2 3768 */
19b546d7
QW
3769static int delete_v1_space_cache(struct extent_buffer *leaf,
3770 struct btrfs_block_group *block_group,
3771 u64 data_bytenr)
5d4f98a2 3772{
19b546d7
QW
3773 u64 space_cache_ino;
3774 struct btrfs_file_extent_item *ei;
5d4f98a2 3775 struct btrfs_key key;
19b546d7
QW
3776 bool found = false;
3777 int i;
5d4f98a2
YZ
3778 int ret;
3779
19b546d7
QW
3780 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3781 return 0;
5d4f98a2 3782
19b546d7
QW
3783 for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3784 btrfs_item_key_to_cpu(leaf, &key, i);
3785 if (key.type != BTRFS_EXTENT_DATA_KEY)
3786 continue;
3787 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3788 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
3789 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3790 found = true;
3791 space_cache_ino = key.objectid;
5d4f98a2 3792 break;
5d4f98a2 3793 }
5d4f98a2 3794 }
19b546d7
QW
3795 if (!found)
3796 return -ENOENT;
3797 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3798 space_cache_ino);
3799 return ret;
5d4f98a2
YZ
3800}
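/*
 * Editor's note (not part of the original file): v1 free space cache inodes
 * live in the root tree, so the leaf owner check above filters out every
 * other tree cheaply.  When one of the leaf's EXTENT_DATA items points at
 * the data extent being relocated, the owning cache inode is truncated via
 * delete_block_group_cache() so its data extents do not block relocation.
 */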
3801
3802/*
2c016dc2 3803 * helper to find all tree blocks that reference a given data extent
5d4f98a2
YZ
3804 */
3805static noinline_for_stack
3806int add_data_references(struct reloc_control *rc,
3807 struct btrfs_key *extent_key,
3808 struct btrfs_path *path,
3809 struct rb_root *blocks)
3810{
19b546d7
QW
3811 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3812 struct ulist *leaves = NULL;
3813 struct ulist_iterator leaf_uiter;
3814 struct ulist_node *ref_node = NULL;
3815 const u32 blocksize = fs_info->nodesize;
647f63bd 3816 int ret = 0;
5d4f98a2 3817
19b546d7
QW
3818 btrfs_release_path(path);
3819 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3820 0, &leaves, NULL, true);
3821 if (ret < 0)
3822 return ret;
5d4f98a2 3823
19b546d7
QW
3824 ULIST_ITER_INIT(&leaf_uiter);
3825 while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3826 struct extent_buffer *eb;
5d4f98a2 3827
19b546d7
QW
3828 eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
3829 if (IS_ERR(eb)) {
3830 ret = PTR_ERR(eb);
5d4f98a2 3831 break;
5d4f98a2 3832 }
19b546d7
QW
3833 ret = delete_v1_space_cache(eb, rc->block_group,
3834 extent_key->objectid);
3835 free_extent_buffer(eb);
3836 if (ret < 0)
3837 break;
3838 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3839 if (ret < 0)
5d4f98a2 3840 break;
5d4f98a2 3841 }
19b546d7 3842 if (ret < 0)
5d4f98a2 3843 free_block_list(blocks);
19b546d7
QW
3844 ulist_free(leaves);
3845 return ret;
5d4f98a2
YZ
3846}
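/*
 * Editor's note (not part of the original file): for the UPDATE_DATA_PTRS
 * stage, add_data_references() uses the backref walker
 * (btrfs_find_all_leafs()) to collect every leaf containing a file extent
 * item that points at extent_key->objectid.  For each leaf, any v1 free
 * space cache referencing the extent is dropped via delete_v1_space_cache(),
 * then the leaf is queued through __add_tree_block() so its data pointers
 * can be rewritten.
 */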
3847
3848/*
2c016dc2 3849 * helper to find next unprocessed extent
5d4f98a2
YZ
3850 */
3851static noinline_for_stack
147d256e 3852int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3fd0a558 3853 struct btrfs_key *extent_key)
5d4f98a2 3854{
0b246afa 3855 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3856 struct btrfs_key key;
3857 struct extent_buffer *leaf;
3858 u64 start, end, last;
3859 int ret;
3860
b3470b5d 3861 last = rc->block_group->start + rc->block_group->length;
5d4f98a2
YZ
3862 while (1) {
3863 cond_resched();
3864 if (rc->search_start >= last) {
3865 ret = 1;
3866 break;
3867 }
3868
3869 key.objectid = rc->search_start;
3870 key.type = BTRFS_EXTENT_ITEM_KEY;
3871 key.offset = 0;
3872
3873 path->search_commit_root = 1;
3874 path->skip_locking = 1;
3875 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3876 0, 0);
3877 if (ret < 0)
3878 break;
3879next:
3880 leaf = path->nodes[0];
3881 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3882 ret = btrfs_next_leaf(rc->extent_root, path);
3883 if (ret != 0)
3884 break;
3885 leaf = path->nodes[0];
3886 }
3887
3888 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3889 if (key.objectid >= last) {
3890 ret = 1;
3891 break;
3892 }
3893
3173a18f
JB
3894 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3895 key.type != BTRFS_METADATA_ITEM_KEY) {
3896 path->slots[0]++;
3897 goto next;
3898 }
3899
3900 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5d4f98a2
YZ
3901 key.objectid + key.offset <= rc->search_start) {
3902 path->slots[0]++;
3903 goto next;
3904 }
3905
3173a18f 3906 if (key.type == BTRFS_METADATA_ITEM_KEY &&
0b246afa 3907 key.objectid + fs_info->nodesize <=
3173a18f
JB
3908 rc->search_start) {
3909 path->slots[0]++;
3910 goto next;
3911 }
3912
5d4f98a2
YZ
3913 ret = find_first_extent_bit(&rc->processed_blocks,
3914 key.objectid, &start, &end,
e6138876 3915 EXTENT_DIRTY, NULL);
5d4f98a2
YZ
3916
3917 if (ret == 0 && start <= key.objectid) {
b3b4aa74 3918 btrfs_release_path(path);
5d4f98a2
YZ
3919 rc->search_start = end + 1;
3920 } else {
3173a18f
JB
3921 if (key.type == BTRFS_EXTENT_ITEM_KEY)
3922 rc->search_start = key.objectid + key.offset;
3923 else
3924 rc->search_start = key.objectid +
0b246afa 3925 fs_info->nodesize;
3fd0a558 3926 memcpy(extent_key, &key, sizeof(key));
5d4f98a2
YZ
3927 return 0;
3928 }
3929 }
b3b4aa74 3930 btrfs_release_path(path);
5d4f98a2
YZ
3931 return ret;
3932}
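/*
 * Editor's note (not part of the original file): find_next_extent() scans
 * the extent tree commit root from rc->search_start up to the end of the
 * target block group.  Keys other than EXTENT_ITEM/METADATA_ITEM, extents
 * already covered by search_start, and ranges marked EXTENT_DIRTY in
 * rc->processed_blocks are skipped.  It returns 0 with the key of the next
 * candidate extent, 1 when the block group is exhausted, or a negative
 * error.
 */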
3933
3934static void set_reloc_control(struct reloc_control *rc)
3935{
3936 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3937
3938 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3939 fs_info->reloc_ctl = rc;
7585717f 3940 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3941}
3942
3943static void unset_reloc_control(struct reloc_control *rc)
3944{
3945 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3946
3947 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3948 fs_info->reloc_ctl = NULL;
7585717f 3949 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3950}
3951
3952static int check_extent_flags(u64 flags)
3953{
3954 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3955 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3956 return 1;
3957 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3958 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3959 return 1;
3960 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3961 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3962 return 1;
3963 return 0;
3964}
3965
3fd0a558
YZ
3966static noinline_for_stack
3967int prepare_to_relocate(struct reloc_control *rc)
3968{
3969 struct btrfs_trans_handle *trans;
ac2fabac 3970 int ret;
3fd0a558 3971
2ff7e61e 3972 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
66d8f3dd 3973 BTRFS_BLOCK_RSV_TEMP);
3fd0a558
YZ
3974 if (!rc->block_rsv)
3975 return -ENOMEM;
3976
3fd0a558 3977 memset(&rc->cluster, 0, sizeof(rc->cluster));
b3470b5d 3978 rc->search_start = rc->block_group->start;
3fd0a558
YZ
3979 rc->extents_found = 0;
3980 rc->nodes_relocated = 0;
3981 rc->merging_rsv_size = 0;
0647bf56 3982 rc->reserved_bytes = 0;
da17066c 3983 rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
0647bf56 3984 RELOCATION_RESERVED_NODES;
ac2fabac
JB
3985 ret = btrfs_block_rsv_refill(rc->extent_root,
3986 rc->block_rsv, rc->block_rsv->size,
3987 BTRFS_RESERVE_FLUSH_ALL);
3988 if (ret)
3989 return ret;
3fd0a558
YZ
3990
3991 rc->create_reloc_tree = 1;
3992 set_reloc_control(rc);
3993
7a7eaa40 3994 trans = btrfs_join_transaction(rc->extent_root);
28818947
LB
3995 if (IS_ERR(trans)) {
3996 unset_reloc_control(rc);
3997 /*
3998 * extent tree is not a ref_cow tree and has no reloc_root to
3999 * cleanup. And callers are responsible to free the above
4000 * block rsv.
4001 */
4002 return PTR_ERR(trans);
4003 }
3a45bb20 4004 btrfs_commit_transaction(trans);
3fd0a558
YZ
4005 return 0;
4006}
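/*
 * Editor's note, illustrative arithmetic that is not part of the original
 * file: the metadata reservation made in prepare_to_relocate() is
 *
 *	block_rsv->size = nodesize * RELOCATION_RESERVED_NODES
 *
 * e.g. 16KiB nodes * 256 = 4MiB reserved up front with FLUSH_ALL.  The
 * joined transaction is committed before the scan starts (the later extent
 * tree searches run against the commit root); on failure the caller frees
 * the block rsv, as the comment above notes.
 */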
76dda93c 4007
5d4f98a2
YZ
4008static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4009{
2ff7e61e 4010 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
4011 struct rb_root blocks = RB_ROOT;
4012 struct btrfs_key key;
4013 struct btrfs_trans_handle *trans = NULL;
4014 struct btrfs_path *path;
4015 struct btrfs_extent_item *ei;
5d4f98a2
YZ
4016 u64 flags;
4017 u32 item_size;
4018 int ret;
4019 int err = 0;
c87f08ca 4020 int progress = 0;
5d4f98a2
YZ
4021
4022 path = btrfs_alloc_path();
3fd0a558 4023 if (!path)
5d4f98a2 4024 return -ENOMEM;
e4058b54 4025 path->reada = READA_FORWARD;
5d4f98a2 4026
3fd0a558
YZ
4027 ret = prepare_to_relocate(rc);
4028 if (ret) {
4029 err = ret;
4030 goto out_free;
4031 }
5d4f98a2
YZ
4032
4033 while (1) {
0647bf56
WS
4034 rc->reserved_bytes = 0;
4035 ret = btrfs_block_rsv_refill(rc->extent_root,
4036 rc->block_rsv, rc->block_rsv->size,
4037 BTRFS_RESERVE_FLUSH_ALL);
4038 if (ret) {
4039 err = ret;
4040 break;
4041 }
c87f08ca 4042 progress++;
a22285a6 4043 trans = btrfs_start_transaction(rc->extent_root, 0);
0f788c58
LB
4044 if (IS_ERR(trans)) {
4045 err = PTR_ERR(trans);
4046 trans = NULL;
4047 break;
4048 }
c87f08ca 4049restart:
3fd0a558 4050 if (update_backref_cache(trans, &rc->backref_cache)) {
3a45bb20 4051 btrfs_end_transaction(trans);
42a657f5 4052 trans = NULL;
3fd0a558
YZ
4053 continue;
4054 }
4055
147d256e 4056 ret = find_next_extent(rc, path, &key);
5d4f98a2
YZ
4057 if (ret < 0)
4058 err = ret;
4059 if (ret != 0)
4060 break;
4061
4062 rc->extents_found++;
4063
4064 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4065 struct btrfs_extent_item);
3fd0a558 4066 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
5d4f98a2
YZ
4067 if (item_size >= sizeof(*ei)) {
4068 flags = btrfs_extent_flags(path->nodes[0], ei);
4069 ret = check_extent_flags(flags);
4070 BUG_ON(ret);
6d8ff4e4 4071 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
4072 err = -EINVAL;
4073 btrfs_print_v0_err(trans->fs_info);
4074 btrfs_abort_transaction(trans, err);
4075 break;
5d4f98a2 4076 } else {
5d4f98a2 4077 BUG();
5d4f98a2
YZ
4078 }
4079
4080 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4081 ret = add_tree_block(rc, &key, path, &blocks);
4082 } else if (rc->stage == UPDATE_DATA_PTRS &&
3fd0a558 4083 (flags & BTRFS_EXTENT_FLAG_DATA)) {
5d4f98a2
YZ
4084 ret = add_data_references(rc, &key, path, &blocks);
4085 } else {
b3b4aa74 4086 btrfs_release_path(path);
5d4f98a2
YZ
4087 ret = 0;
4088 }
4089 if (ret < 0) {
3fd0a558 4090 err = ret;
5d4f98a2
YZ
4091 break;
4092 }
4093
4094 if (!RB_EMPTY_ROOT(&blocks)) {
4095 ret = relocate_tree_blocks(trans, rc, &blocks);
4096 if (ret < 0) {
3fd0a558
YZ
4097 if (ret != -EAGAIN) {
4098 err = ret;
4099 break;
4100 }
4101 rc->extents_found--;
4102 rc->search_start = key.objectid;
4103 }
4104 }
4105
3a45bb20 4106 btrfs_end_transaction_throttle(trans);
2ff7e61e 4107 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 4108 trans = NULL;
5d4f98a2
YZ
4109
4110 if (rc->stage == MOVE_DATA_EXTENTS &&
4111 (flags & BTRFS_EXTENT_FLAG_DATA)) {
4112 rc->found_file_extent = 1;
0257bb82 4113 ret = relocate_data_extent(rc->data_inode,
3fd0a558 4114 &key, &rc->cluster);
5d4f98a2
YZ
4115 if (ret < 0) {
4116 err = ret;
4117 break;
4118 }
4119 }
f31ea088
QW
4120 if (btrfs_should_cancel_balance(fs_info)) {
4121 err = -ECANCELED;
4122 break;
4123 }
5d4f98a2 4124 }
c87f08ca 4125 if (trans && progress && err == -ENOSPC) {
43a7e99d 4126 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
9689457b 4127 if (ret == 1) {
c87f08ca
CM
4128 err = 0;
4129 progress = 0;
4130 goto restart;
4131 }
4132 }
3fd0a558 4133
b3b4aa74 4134 btrfs_release_path(path);
91166212 4135 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
5d4f98a2
YZ
4136
4137 if (trans) {
3a45bb20 4138 btrfs_end_transaction_throttle(trans);
2ff7e61e 4139 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
4140 }
4141
0257bb82 4142 if (!err) {
3fd0a558
YZ
4143 ret = relocate_file_extent_cluster(rc->data_inode,
4144 &rc->cluster);
0257bb82
YZ
4145 if (ret < 0)
4146 err = ret;
4147 }
4148
3fd0a558
YZ
4149 rc->create_reloc_tree = 0;
4150 set_reloc_control(rc);
0257bb82 4151
3fd0a558 4152 backref_cache_cleanup(&rc->backref_cache);
63f018be 4153 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2 4154
7f913c7c
QW
4155 /*
 4156 * Even in the case when the relocation is cancelled, we should still go
 4157 * through prepare_to_merge() and merge_reloc_roots().
 4158 *
 4159 * On error (including a cancelled balance), prepare_to_merge() will
 4160 * mark all reloc trees orphan, then queue them for cleanup in
 4161 * merge_reloc_roots().
4162 */
3fd0a558 4163 err = prepare_to_merge(rc, err);
5d4f98a2
YZ
4164
4165 merge_reloc_roots(rc);
4166
3fd0a558 4167 rc->merge_reloc_tree = 0;
5d4f98a2 4168 unset_reloc_control(rc);
63f018be 4169 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2
YZ
4170
4171 /* get rid of pinned extents */
7a7eaa40 4172 trans = btrfs_join_transaction(rc->extent_root);
62b99540 4173 if (IS_ERR(trans)) {
3612b495 4174 err = PTR_ERR(trans);
62b99540
QW
4175 goto out_free;
4176 }
3a45bb20 4177 btrfs_commit_transaction(trans);
6217b0fa 4178out_free:
d2311e69
QW
4179 ret = clean_dirty_subvols(rc);
4180 if (ret < 0 && !err)
4181 err = ret;
2ff7e61e 4182 btrfs_free_block_rsv(fs_info, rc->block_rsv);
3fd0a558 4183 btrfs_free_path(path);
5d4f98a2
YZ
4184 return err;
4185}
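/*
 * Editor's note (not part of the original file): per iteration,
 * relocate_block_group() refills the block rsv, starts a transaction,
 * fetches the next extent with find_next_extent(), queues tree blocks
 * (add_tree_block()) or referencing leaves (add_data_references()),
 * relocates the queued blocks and batches data extents into the cluster.
 * -ENOSPC triggers one btrfs_force_chunk_alloc() retry; errors and balance
 * cancellation still fall through prepare_to_merge() and
 * merge_reloc_roots() so the reloc trees get cleaned up.
 */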
4186
4187static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
0257bb82 4188 struct btrfs_root *root, u64 objectid)
5d4f98a2
YZ
4189{
4190 struct btrfs_path *path;
4191 struct btrfs_inode_item *item;
4192 struct extent_buffer *leaf;
4193 int ret;
4194
4195 path = btrfs_alloc_path();
4196 if (!path)
4197 return -ENOMEM;
4198
4199 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4200 if (ret)
4201 goto out;
4202
4203 leaf = path->nodes[0];
4204 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
b159fa28 4205 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
5d4f98a2 4206 btrfs_set_inode_generation(leaf, item, 1);
0257bb82 4207 btrfs_set_inode_size(leaf, item, 0);
5d4f98a2 4208 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3fd0a558
YZ
4209 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4210 BTRFS_INODE_PREALLOC);
5d4f98a2 4211 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
4212out:
4213 btrfs_free_path(path);
4214 return ret;
4215}
4216
4217/*
 4218 * helper to create an inode for data relocation.
 4219 * the inode is in the data relocation tree and its link count is 0
4220 */
3fd0a558
YZ
4221static noinline_for_stack
4222struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
32da5386 4223 struct btrfs_block_group *group)
5d4f98a2
YZ
4224{
4225 struct inode *inode = NULL;
4226 struct btrfs_trans_handle *trans;
4227 struct btrfs_root *root;
4228 struct btrfs_key key;
4624900d 4229 u64 objectid;
5d4f98a2
YZ
4230 int err = 0;
4231
4232 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4233 if (IS_ERR(root))
4234 return ERR_CAST(root);
4235
a22285a6 4236 trans = btrfs_start_transaction(root, 6);
76deacf0 4237 if (IS_ERR(trans)) {
00246528 4238 btrfs_put_root(root);
3fd0a558 4239 return ERR_CAST(trans);
76deacf0 4240 }
5d4f98a2 4241
581bb050 4242 err = btrfs_find_free_objectid(root, &objectid);
5d4f98a2
YZ
4243 if (err)
4244 goto out;
4245
0257bb82 4246 err = __insert_orphan_inode(trans, root, objectid);
5d4f98a2
YZ
4247 BUG_ON(err);
4248
4249 key.objectid = objectid;
4250 key.type = BTRFS_INODE_ITEM_KEY;
4251 key.offset = 0;
4c66e0d4 4252 inode = btrfs_iget(fs_info->sb, &key, root);
2e19f1f9 4253 BUG_ON(IS_ERR(inode));
b3470b5d 4254 BTRFS_I(inode)->index_cnt = group->start;
5d4f98a2 4255
73f2e545 4256 err = btrfs_orphan_add(trans, BTRFS_I(inode));
5d4f98a2 4257out:
00246528 4258 btrfs_put_root(root);
3a45bb20 4259 btrfs_end_transaction(trans);
2ff7e61e 4260 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
4261 if (err) {
4262 if (inode)
4263 iput(inode);
4264 inode = ERR_PTR(err);
4265 }
4266 return inode;
4267}
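/*
 * Editor's note (not part of the original file): the data reloc inode is an
 * orphan (link count 0) regular file in the data relocation tree with
 * NOCOMPRESS|PREALLOC set.  Its index_cnt is the block group start, so a
 * file offset maps directly to the old disk bytenr:
 *
 *	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt
 *
 * which is exactly how btrfs_reloc_clone_csums() below locates the old
 * checksums.
 */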
4268
c258d6e3 4269static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3fd0a558
YZ
4270{
4271 struct reloc_control *rc;
4272
4273 rc = kzalloc(sizeof(*rc), GFP_NOFS);
4274 if (!rc)
4275 return NULL;
4276
4277 INIT_LIST_HEAD(&rc->reloc_roots);
d2311e69 4278 INIT_LIST_HEAD(&rc->dirty_subvol_roots);
2433bea5 4279 backref_cache_init(fs_info, &rc->backref_cache, 1);
3fd0a558 4280 mapping_tree_init(&rc->reloc_root_tree);
43eb5f29
QW
4281 extent_io_tree_init(fs_info, &rc->processed_blocks,
4282 IO_TREE_RELOC_BLOCKS, NULL);
3fd0a558
YZ
4283 return rc;
4284}
4285
1a0afa0e
JB
4286static void free_reloc_control(struct reloc_control *rc)
4287{
4288 struct mapping_node *node, *tmp;
4289
4290 free_reloc_roots(&rc->reloc_roots);
4291 rbtree_postorder_for_each_entry_safe(node, tmp,
4292 &rc->reloc_root_tree.rb_root, rb_node)
4293 kfree(node);
4294
4295 kfree(rc);
4296}
4297
ebce0e01
AB
4298/*
4299 * Print the block group being relocated
4300 */
4301static void describe_relocation(struct btrfs_fs_info *fs_info,
32da5386 4302 struct btrfs_block_group *block_group)
ebce0e01 4303{
f89e09cf 4304 char buf[128] = {'\0'};
ebce0e01 4305
f89e09cf 4306 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
ebce0e01
AB
4307
4308 btrfs_info(fs_info,
4309 "relocating block group %llu flags %s",
b3470b5d 4310 block_group->start, buf);
ebce0e01
AB
4311}
4312
430640e3
QW
4313static const char *stage_to_string(int stage)
4314{
4315 if (stage == MOVE_DATA_EXTENTS)
4316 return "move data extents";
4317 if (stage == UPDATE_DATA_PTRS)
4318 return "update data pointers";
4319 return "unknown";
4320}
4321
5d4f98a2
YZ
4322/*
4323 * function to relocate all extents in a block group.
4324 */
6bccf3ab 4325int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
5d4f98a2 4326{
32da5386 4327 struct btrfs_block_group *bg;
6bccf3ab 4328 struct btrfs_root *extent_root = fs_info->extent_root;
5d4f98a2 4329 struct reloc_control *rc;
0af3d00b
JB
4330 struct inode *inode;
4331 struct btrfs_path *path;
5d4f98a2 4332 int ret;
f0486c68 4333 int rw = 0;
5d4f98a2
YZ
4334 int err = 0;
4335
eede2bf3
OS
4336 bg = btrfs_lookup_block_group(fs_info, group_start);
4337 if (!bg)
4338 return -ENOENT;
4339
4340 if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4341 btrfs_put_block_group(bg);
4342 return -ETXTBSY;
4343 }
4344
c258d6e3 4345 rc = alloc_reloc_control(fs_info);
eede2bf3
OS
4346 if (!rc) {
4347 btrfs_put_block_group(bg);
5d4f98a2 4348 return -ENOMEM;
eede2bf3 4349 }
5d4f98a2 4350
f0486c68 4351 rc->extent_root = extent_root;
eede2bf3 4352 rc->block_group = bg;
5d4f98a2 4353
b12de528 4354 ret = btrfs_inc_block_group_ro(rc->block_group, true);
868f401a
Z
4355 if (ret) {
4356 err = ret;
4357 goto out;
f0486c68 4358 }
868f401a 4359 rw = 1;
f0486c68 4360
0af3d00b
JB
4361 path = btrfs_alloc_path();
4362 if (!path) {
4363 err = -ENOMEM;
4364 goto out;
4365 }
4366
7949f339 4367 inode = lookup_free_space_inode(rc->block_group, path);
0af3d00b
JB
4368 btrfs_free_path(path);
4369
4370 if (!IS_ERR(inode))
1bbc621e 4371 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
0af3d00b
JB
4372 else
4373 ret = PTR_ERR(inode);
4374
4375 if (ret && ret != -ENOENT) {
4376 err = ret;
4377 goto out;
4378 }
4379
5d4f98a2
YZ
4380 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4381 if (IS_ERR(rc->data_inode)) {
4382 err = PTR_ERR(rc->data_inode);
4383 rc->data_inode = NULL;
4384 goto out;
4385 }
4386
0b246afa 4387 describe_relocation(fs_info, rc->block_group);
5d4f98a2 4388
9cfa3e34 4389 btrfs_wait_block_group_reservations(rc->block_group);
f78c436c 4390 btrfs_wait_nocow_writers(rc->block_group);
6374e57a 4391 btrfs_wait_ordered_roots(fs_info, U64_MAX,
b3470b5d
DS
4392 rc->block_group->start,
4393 rc->block_group->length);
5d4f98a2
YZ
4394
4395 while (1) {
430640e3
QW
4396 int finishes_stage;
4397
76dda93c 4398 mutex_lock(&fs_info->cleaner_mutex);
5d4f98a2 4399 ret = relocate_block_group(rc);
76dda93c 4400 mutex_unlock(&fs_info->cleaner_mutex);
ff612ba7 4401 if (ret < 0)
5d4f98a2 4402 err = ret;
5d4f98a2 4403
430640e3 4404 finishes_stage = rc->stage;
ff612ba7
JB
4405 /*
4406 * We may have gotten ENOSPC after we already dirtied some
4407 * extents. If writeout happens while we're relocating a
4408 * different block group we could end up hitting the
4409 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4410 * btrfs_reloc_cow_block. Make sure we write everything out
4411 * properly so we don't trip over this problem, and then break
4412 * out of the loop if we hit an error.
4413 */
5d4f98a2 4414 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
0ef8b726
JB
4415 ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4416 (u64)-1);
ff612ba7 4417 if (ret)
0ef8b726 4418 err = ret;
5d4f98a2
YZ
4419 invalidate_mapping_pages(rc->data_inode->i_mapping,
4420 0, -1);
4421 rc->stage = UPDATE_DATA_PTRS;
5d4f98a2 4422 }
ff612ba7
JB
4423
4424 if (err < 0)
4425 goto out;
4426
4427 if (rc->extents_found == 0)
4428 break;
4429
430640e3
QW
4430 btrfs_info(fs_info, "found %llu extents, stage: %s",
4431 rc->extents_found, stage_to_string(finishes_stage));
5d4f98a2
YZ
4432 }
4433
5d4f98a2
YZ
4434 WARN_ON(rc->block_group->pinned > 0);
4435 WARN_ON(rc->block_group->reserved > 0);
bf38be65 4436 WARN_ON(rc->block_group->used > 0);
5d4f98a2 4437out:
f0486c68 4438 if (err && rw)
2ff7e61e 4439 btrfs_dec_block_group_ro(rc->block_group);
5d4f98a2 4440 iput(rc->data_inode);
5d4f98a2 4441 btrfs_put_block_group(rc->block_group);
1a0afa0e 4442 free_reloc_control(rc);
5d4f98a2
YZ
4443 return err;
4444}
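/*
 * Editor's note (not part of the original file):
 * btrfs_relocate_block_group() is the per-block-group entry point: mark the
 * group read-only, drop its v1 free space cache, create the data reloc
 * inode, wait for reservations, nocow writers and ordered extents, then
 * loop relocate_block_group() through the MOVE_DATA_EXTENTS and
 * UPDATE_DATA_PTRS stages until no more extents are found.
 */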
4445
76dda93c
YZ
4446static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4447{
0b246afa 4448 struct btrfs_fs_info *fs_info = root->fs_info;
76dda93c 4449 struct btrfs_trans_handle *trans;
79787eaa 4450 int ret, err;
76dda93c 4451
0b246afa 4452 trans = btrfs_start_transaction(fs_info->tree_root, 0);
79787eaa
JM
4453 if (IS_ERR(trans))
4454 return PTR_ERR(trans);
76dda93c
YZ
4455
4456 memset(&root->root_item.drop_progress, 0,
4457 sizeof(root->root_item.drop_progress));
4458 root->root_item.drop_level = 0;
4459 btrfs_set_root_refs(&root->root_item, 0);
0b246afa 4460 ret = btrfs_update_root(trans, fs_info->tree_root,
76dda93c 4461 &root->root_key, &root->root_item);
76dda93c 4462
3a45bb20 4463 err = btrfs_end_transaction(trans);
79787eaa
JM
4464 if (err)
4465 return err;
4466 return ret;
76dda93c
YZ
4467}
4468
5d4f98a2
YZ
4469/*
 4470 * recover relocation interrupted by a system crash.
 4471 *
 4472 * this function resumes merging reloc trees with their corresponding fs trees.
 4473 * this is important for keeping the sharing of tree blocks
4474 */
4475int btrfs_recover_relocation(struct btrfs_root *root)
4476{
0b246afa 4477 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
4478 LIST_HEAD(reloc_roots);
4479 struct btrfs_key key;
4480 struct btrfs_root *fs_root;
4481 struct btrfs_root *reloc_root;
4482 struct btrfs_path *path;
4483 struct extent_buffer *leaf;
4484 struct reloc_control *rc = NULL;
4485 struct btrfs_trans_handle *trans;
4486 int ret;
4487 int err = 0;
4488
4489 path = btrfs_alloc_path();
4490 if (!path)
4491 return -ENOMEM;
e4058b54 4492 path->reada = READA_BACK;
5d4f98a2
YZ
4493
4494 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4495 key.type = BTRFS_ROOT_ITEM_KEY;
4496 key.offset = (u64)-1;
4497
4498 while (1) {
0b246afa 4499 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
5d4f98a2
YZ
4500 path, 0, 0);
4501 if (ret < 0) {
4502 err = ret;
4503 goto out;
4504 }
4505 if (ret > 0) {
4506 if (path->slots[0] == 0)
4507 break;
4508 path->slots[0]--;
4509 }
4510 leaf = path->nodes[0];
4511 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 4512 btrfs_release_path(path);
5d4f98a2
YZ
4513
4514 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4515 key.type != BTRFS_ROOT_ITEM_KEY)
4516 break;
4517
3dbf1738 4518 reloc_root = btrfs_read_tree_root(root, &key);
5d4f98a2
YZ
4519 if (IS_ERR(reloc_root)) {
4520 err = PTR_ERR(reloc_root);
4521 goto out;
4522 }
4523
3dbf1738 4524 set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
5d4f98a2
YZ
4525 list_add(&reloc_root->root_list, &reloc_roots);
4526
4527 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
0b246afa 4528 fs_root = read_fs_root(fs_info,
5d4f98a2
YZ
4529 reloc_root->root_key.offset);
4530 if (IS_ERR(fs_root)) {
76dda93c
YZ
4531 ret = PTR_ERR(fs_root);
4532 if (ret != -ENOENT) {
4533 err = ret;
4534 goto out;
4535 }
79787eaa
JM
4536 ret = mark_garbage_root(reloc_root);
4537 if (ret < 0) {
4538 err = ret;
4539 goto out;
4540 }
932fd26d 4541 } else {
00246528 4542 btrfs_put_root(fs_root);
5d4f98a2
YZ
4543 }
4544 }
4545
4546 if (key.offset == 0)
4547 break;
4548
4549 key.offset--;
4550 }
b3b4aa74 4551 btrfs_release_path(path);
5d4f98a2
YZ
4552
4553 if (list_empty(&reloc_roots))
4554 goto out;
4555
c258d6e3 4556 rc = alloc_reloc_control(fs_info);
5d4f98a2
YZ
4557 if (!rc) {
4558 err = -ENOMEM;
4559 goto out;
4560 }
4561
0b246afa 4562 rc->extent_root = fs_info->extent_root;
5d4f98a2
YZ
4563
4564 set_reloc_control(rc);
4565
7a7eaa40 4566 trans = btrfs_join_transaction(rc->extent_root);
3612b495 4567 if (IS_ERR(trans)) {
3612b495 4568 err = PTR_ERR(trans);
fb2d83ee 4569 goto out_unset;
3612b495 4570 }
3fd0a558
YZ
4571
4572 rc->merge_reloc_tree = 1;
4573
5d4f98a2
YZ
4574 while (!list_empty(&reloc_roots)) {
4575 reloc_root = list_entry(reloc_roots.next,
4576 struct btrfs_root, root_list);
4577 list_del(&reloc_root->root_list);
4578
4579 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4580 list_add_tail(&reloc_root->root_list,
4581 &rc->reloc_roots);
4582 continue;
4583 }
4584
0b246afa 4585 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
79787eaa
JM
4586 if (IS_ERR(fs_root)) {
4587 err = PTR_ERR(fs_root);
ca1aa281 4588 list_add_tail(&reloc_root->root_list, &reloc_roots);
1402d17d 4589 btrfs_end_transaction(trans);
fb2d83ee 4590 goto out_unset;
79787eaa 4591 }
5d4f98a2 4592
ffd7b339 4593 err = __add_reloc_root(reloc_root);
79787eaa 4594 BUG_ON(err < 0); /* -ENOMEM or logic error */
f44deb74 4595 fs_root->reloc_root = btrfs_grab_root(reloc_root);
00246528 4596 btrfs_put_root(fs_root);
5d4f98a2
YZ
4597 }
4598
3a45bb20 4599 err = btrfs_commit_transaction(trans);
79787eaa 4600 if (err)
fb2d83ee 4601 goto out_unset;
5d4f98a2
YZ
4602
4603 merge_reloc_roots(rc);
4604
4605 unset_reloc_control(rc);
4606
7a7eaa40 4607 trans = btrfs_join_transaction(rc->extent_root);
62b99540 4608 if (IS_ERR(trans)) {
3612b495 4609 err = PTR_ERR(trans);
6217b0fa 4610 goto out_clean;
62b99540 4611 }
3a45bb20 4612 err = btrfs_commit_transaction(trans);
6217b0fa 4613out_clean:
d2311e69
QW
4614 ret = clean_dirty_subvols(rc);
4615 if (ret < 0 && !err)
4616 err = ret;
fb2d83ee
JB
4617out_unset:
4618 unset_reloc_control(rc);
1a0afa0e 4619 free_reloc_control(rc);
3612b495 4620out:
aca1bba6
LB
4621 if (!list_empty(&reloc_roots))
4622 free_reloc_roots(&reloc_roots);
4623
5d4f98a2
YZ
4624 btrfs_free_path(path);
4625
4626 if (err == 0) {
4627 /* cleanup orphan inode in data relocation tree */
0b246afa 4628 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
932fd26d 4629 if (IS_ERR(fs_root)) {
5d4f98a2 4630 err = PTR_ERR(fs_root);
932fd26d 4631 } else {
9f583209 4632 err = btrfs_orphan_cleanup(fs_root);
00246528 4633 btrfs_put_root(fs_root);
932fd26d 4634 }
5d4f98a2
YZ
4635 }
4636 return err;
4637}
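/*
 * Editor's note (not part of the original file): recovery walks the tree
 * root backwards for BTRFS_TREE_RELOC_OBJECTID root items, re-reads every
 * reloc root, and either re-attaches it to its fs root or, when the fs root
 * is gone (-ENOENT), marks it as garbage with refs == 0 via
 * mark_garbage_root().  merge_reloc_roots() then finishes the interrupted
 * merge, and the orphan inode in the data relocation tree is cleaned up at
 * the end.
 */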
4638
4639/*
4640 * helper to add ordered checksum for data relocation.
4641 *
 4642 * cloning the checksums properly handles the nodatasum extents.
 4643 * it also saves the CPU time of re-calculating the checksums.
4644 */
4645int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4646{
0b246afa 4647 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5d4f98a2 4648 struct btrfs_ordered_sum *sums;
5d4f98a2 4649 struct btrfs_ordered_extent *ordered;
5d4f98a2
YZ
4650 int ret;
4651 u64 disk_bytenr;
4577b014 4652 u64 new_bytenr;
5d4f98a2
YZ
4653 LIST_HEAD(list);
4654
4655 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
bffe633e 4656 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
5d4f98a2
YZ
4657
4658 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
0b246afa 4659 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
a2de733c 4660 disk_bytenr + len - 1, &list, 0);
79787eaa
JM
4661 if (ret)
4662 goto out;
5d4f98a2
YZ
4663
4664 while (!list_empty(&list)) {
4665 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4666 list_del_init(&sums->list);
4667
4577b014
JB
4668 /*
4669 * We need to offset the new_bytenr based on where the csum is.
4670 * We need to do this because we will read in entire prealloc
4671 * extents but we may have written to say the middle of the
4672 * prealloc extent, so we need to make sure the csum goes with
4673 * the right disk offset.
4674 *
4675 * We can do this because the data reloc inode refers strictly
4676 * to the on disk bytes, so we don't have to worry about
4677 * disk_len vs real len like with real inodes since it's all
4678 * disk length.
4679 */
bffe633e 4680 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
4577b014 4681 sums->bytenr = new_bytenr;
5d4f98a2 4682
f9756261 4683 btrfs_add_ordered_sum(ordered, sums);
5d4f98a2 4684 }
79787eaa 4685out:
5d4f98a2 4686 btrfs_put_ordered_extent(ordered);
411fc6bc 4687 return ret;
5d4f98a2 4688}
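/*
 * Editor's note, illustrative arithmetic that is not part of the original
 * file: the csum rebase above is
 *
 *	new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
 *
 * e.g. if the old prealloc extent started at 0x100000, a csum entry sat at
 * 0x101000, and the relocated ordered extent starts at 0x800000, the entry
 * moves to 0x801000: the same offset inside the extent, at the new disk
 * location.
 */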
3fd0a558 4689
83d4cfd4
JB
4690int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4691 struct btrfs_root *root, struct extent_buffer *buf,
4692 struct extent_buffer *cow)
3fd0a558 4693{
0b246afa 4694 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 4695 struct reloc_control *rc;
a26195a5 4696 struct btrfs_backref_node *node;
3fd0a558
YZ
4697 int first_cow = 0;
4698 int level;
83d4cfd4 4699 int ret = 0;
3fd0a558 4700
0b246afa 4701 rc = fs_info->reloc_ctl;
3fd0a558 4702 if (!rc)
83d4cfd4 4703 return 0;
3fd0a558
YZ
4704
4705 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4706 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4707
4708 level = btrfs_header_level(buf);
4709 if (btrfs_header_generation(buf) <=
4710 btrfs_root_last_snapshot(&root->root_item))
4711 first_cow = 1;
4712
4713 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4714 rc->create_reloc_tree) {
4715 WARN_ON(!first_cow && level == 0);
4716
4717 node = rc->backref_cache.path[level];
4718 BUG_ON(node->bytenr != buf->start &&
4719 node->new_bytenr != buf->start);
4720
4721 drop_node_buffer(node);
67439dad 4722 atomic_inc(&cow->refs);
3fd0a558
YZ
4723 node->eb = cow;
4724 node->new_bytenr = cow->start;
4725
4726 if (!node->pending) {
4727 list_move_tail(&node->list,
4728 &rc->backref_cache.pending[level]);
4729 node->pending = 1;
4730 }
4731
4732 if (first_cow)
9569cc20 4733 mark_block_processed(rc, node);
3fd0a558
YZ
4734
4735 if (first_cow && level > 0)
4736 rc->nodes_relocated += buf->len;
4737 }
4738
83d4cfd4 4739 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
3fd0a558 4740 ret = replace_file_extents(trans, rc, root, cow);
83d4cfd4 4741 return ret;
3fd0a558
YZ
4742}
4743
4744/*
 4745 * called before creating a snapshot. it calculates the metadata reservation
01327610 4746 * required for relocating tree blocks in the snapshot
3fd0a558 4747 */
147d256e 4748void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
3fd0a558
YZ
4749 u64 *bytes_to_reserve)
4750{
10995c04
QW
4751 struct btrfs_root *root = pending->root;
4752 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558 4753
6282675e 4754 if (!rc || !have_reloc_root(root))
3fd0a558
YZ
4755 return;
4756
3fd0a558
YZ
4757 if (!rc->merge_reloc_tree)
4758 return;
4759
4760 root = root->reloc_root;
4761 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4762 /*
4763 * relocation is in the stage of merging trees. the space
4764 * used by merging a reloc tree is twice the size of
4765 * relocated tree nodes in the worst case. half for cowing
4766 * the reloc tree, half for cowing the fs tree. the space
4767 * used by cowing the reloc tree will be freed after the
 4768 * tree is dropped. if we create a snapshot, cowing the fs
 4769 * tree may use more space than it frees. so we need to
 4770 * reserve extra space.
4771 */
4772 *bytes_to_reserve += rc->nodes_relocated;
4773}
4774
4775/*
 4776 * called after a snapshot is created. migrate the block reservation
 4777 * and create a reloc root for the newly created snapshot
f44deb74
JB
4778 *
4779 * This is similar to btrfs_init_reloc_root(), we come out of here with two
4780 * references held on the reloc_root, one for root->reloc_root and one for
4781 * rc->reloc_roots.
3fd0a558 4782 */
49b25e05 4783int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3fd0a558
YZ
4784 struct btrfs_pending_snapshot *pending)
4785{
4786 struct btrfs_root *root = pending->root;
4787 struct btrfs_root *reloc_root;
4788 struct btrfs_root *new_root;
10995c04 4789 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558
YZ
4790 int ret;
4791
6282675e 4792 if (!rc || !have_reloc_root(root))
49b25e05 4793 return 0;
3fd0a558
YZ
4794
4795 rc = root->fs_info->reloc_ctl;
4796 rc->merging_rsv_size += rc->nodes_relocated;
4797
4798 if (rc->merge_reloc_tree) {
4799 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4800 rc->block_rsv,
3a584174 4801 rc->nodes_relocated, true);
49b25e05
JM
4802 if (ret)
4803 return ret;
3fd0a558
YZ
4804 }
4805
4806 new_root = pending->snap;
4807 reloc_root = create_reloc_root(trans, root->reloc_root,
4808 new_root->root_key.objectid);
49b25e05
JM
4809 if (IS_ERR(reloc_root))
4810 return PTR_ERR(reloc_root);
3fd0a558 4811
ffd7b339
JM
4812 ret = __add_reloc_root(reloc_root);
4813 BUG_ON(ret < 0);
f44deb74 4814 new_root->reloc_root = btrfs_grab_root(reloc_root);
3fd0a558 4815
49b25e05 4816 if (rc->create_reloc_tree)
3fd0a558 4817 ret = clone_backref_node(trans, rc, root, reloc_root);
49b25e05 4818 return ret;
3fd0a558 4819}
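/*
 * Editor's note (not part of the original file): after a snapshot is
 * created while relocation is running, btrfs_reloc_post_snapshot() migrates
 * the nodes_relocated bytes reserved by btrfs_reloc_pre_snapshot() from the
 * pending snapshot's rsv into rc->block_rsv, creates a reloc root for the
 * new root, and clones the cached backref nodes so the snapshot
 * participates in the ongoing relocation.
 */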