]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - fs/btrfs/relocation.c
btrfs: remove err variable from btrfs_delete_subvolume
[mirror_ubuntu-hirsute-kernel.git] / fs / btrfs / relocation.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
5d4f98a2
YZ
2/*
3 * Copyright (C) 2009 Oracle. All rights reserved.
5d4f98a2
YZ
4 */
5
6#include <linux/sched.h>
7#include <linux/pagemap.h>
8#include <linux/writeback.h>
9#include <linux/blkdev.h>
10#include <linux/rbtree.h>
5a0e3ad6 11#include <linux/slab.h>
726a3421 12#include <linux/error-injection.h>
5d4f98a2
YZ
13#include "ctree.h"
14#include "disk-io.h"
15#include "transaction.h"
16#include "volumes.h"
17#include "locking.h"
18#include "btrfs_inode.h"
19#include "async-thread.h"
0af3d00b 20#include "free-space-cache.h"
581bb050 21#include "inode-map.h"
62b99540 22#include "qgroup.h"
cdccee99 23#include "print-tree.h"
86736342 24#include "delalloc-space.h"
aac0023c 25#include "block-group.h"
19b546d7 26#include "backref.h"
e9a28dc5 27#include "misc.h"
5d4f98a2 28
0c891389
QW
29/*
30 * Relocation overview
31 *
32 * [What does relocation do]
33 *
34 * The objective of relocation is to relocate all extents of the target block
35 * group to other block groups.
36 * This is utilized by resize (shrink only), profile converting, compacting
37 * space, or balance routine to spread chunks over devices.
38 *
39 * Before | After
40 * ------------------------------------------------------------------
41 * BG A: 10 data extents | BG A: deleted
42 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
43 * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated)
44 *
45 * [How does relocation work]
46 *
47 * 1. Mark the target block group read-only
48 * New extents won't be allocated from the target block group.
49 *
50 * 2.1 Record each extent in the target block group
51 * To build a proper map of extents to be relocated.
52 *
53 * 2.2 Build data reloc tree and reloc trees
54 * Data reloc tree will contain an inode, recording all newly relocated
55 * data extents.
56 * There will be only one data reloc tree for one data block group.
57 *
58 * Reloc tree will be a special snapshot of its source tree, containing
59 * relocated tree blocks.
60 * Each tree referring to a tree block in target block group will get its
61 * reloc tree built.
62 *
63 * 2.3 Swap source tree with its corresponding reloc tree
64 * Each involved tree only refers to new extents after swap.
65 *
66 * 3. Cleanup reloc trees and data reloc tree.
67 * As old extents in the target block group are still referenced by reloc
68 * trees, we need to clean them up before really freeing the target block
69 * group.
70 *
71 * The main complexity is in steps 2.2 and 2.3.
72 *
73 * The entry point of relocation is relocate_block_group() function.
74 */
75
2a979612 76#define RELOCATION_RESERVED_NODES 256
5d4f98a2
YZ
77/*
78 * map address of tree root to tree
79 */
80struct mapping_node {
e9a28dc5
QW
81 struct {
82 struct rb_node rb_node;
83 u64 bytenr;
84 }; /* Use rb_simle_node for search/insert */
5d4f98a2
YZ
85 void *data;
86};
87
88struct mapping_tree {
89 struct rb_root rb_root;
90 spinlock_t lock;
91};
92
93/*
94 * present a tree block to process
95 */
96struct tree_block {
e9a28dc5
QW
97 struct {
98 struct rb_node rb_node;
99 u64 bytenr;
100 }; /* Use rb_simple_node for search/insert */
5d4f98a2
YZ
101 struct btrfs_key key;
102 unsigned int level:8;
103 unsigned int key_ready:1;
104};
105
0257bb82
YZ
106#define MAX_EXTENTS 128
107
108struct file_extent_cluster {
109 u64 start;
110 u64 end;
111 u64 boundary[MAX_EXTENTS];
112 unsigned int nr;
113};
114
5d4f98a2
YZ
115struct reloc_control {
116 /* block group to relocate */
32da5386 117 struct btrfs_block_group *block_group;
5d4f98a2
YZ
118 /* extent tree */
119 struct btrfs_root *extent_root;
120 /* inode for moving data */
121 struct inode *data_inode;
3fd0a558
YZ
122
123 struct btrfs_block_rsv *block_rsv;
124
a26195a5 125 struct btrfs_backref_cache backref_cache;
3fd0a558
YZ
126
127 struct file_extent_cluster cluster;
5d4f98a2
YZ
128 /* tree blocks have been processed */
129 struct extent_io_tree processed_blocks;
130 /* map start of tree root to corresponding reloc tree */
131 struct mapping_tree reloc_root_tree;
132 /* list of reloc trees */
133 struct list_head reloc_roots;
d2311e69
QW
134 /* list of subvolume trees that get relocated */
135 struct list_head dirty_subvol_roots;
3fd0a558
YZ
136 /* size of metadata reservation for merging reloc trees */
137 u64 merging_rsv_size;
138 /* size of relocated tree nodes */
139 u64 nodes_relocated;
0647bf56
WS
140 /* reserved size for block group relocation*/
141 u64 reserved_bytes;
3fd0a558 142
5d4f98a2
YZ
143 u64 search_start;
144 u64 extents_found;
3fd0a558 145
3fd0a558
YZ
146 unsigned int stage:8;
147 unsigned int create_reloc_tree:1;
148 unsigned int merge_reloc_tree:1;
5d4f98a2 149 unsigned int found_file_extent:1;
5d4f98a2
YZ
150};
151
152/* stages of data relocation */
153#define MOVE_DATA_EXTENTS 0
154#define UPDATE_DATA_PTRS 1
155
9569cc20 156static void mark_block_processed(struct reloc_control *rc,
a26195a5 157 struct btrfs_backref_node *node)
9569cc20
QW
158{
159 u32 blocksize;
160
161 if (node->level == 0 ||
162 in_range(node->bytenr, rc->block_group->start,
163 rc->block_group->length)) {
164 blocksize = rc->extent_root->fs_info->nodesize;
165 set_extent_bits(&rc->processed_blocks, node->bytenr,
166 node->bytenr + blocksize - 1, EXTENT_DIRTY);
167 }
168 node->processed = 1;
169}
170
5d4f98a2
YZ
171
172static void mapping_tree_init(struct mapping_tree *tree)
173{
6bef4d31 174 tree->rb_root = RB_ROOT;
5d4f98a2
YZ
175 spin_lock_init(&tree->lock);
176}
177
5d4f98a2
YZ
178/*
179 * walk up backref nodes until reach node presents tree root
180 */
a26195a5
QW
181static struct btrfs_backref_node *walk_up_backref(
182 struct btrfs_backref_node *node,
183 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 184{
a26195a5 185 struct btrfs_backref_edge *edge;
5d4f98a2
YZ
186 int idx = *index;
187
188 while (!list_empty(&node->upper)) {
189 edge = list_entry(node->upper.next,
a26195a5 190 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
191 edges[idx++] = edge;
192 node = edge->node[UPPER];
193 }
3fd0a558 194 BUG_ON(node->detached);
5d4f98a2
YZ
195 *index = idx;
196 return node;
197}
198
199/*
200 * walk down backref nodes to find start of next reference path
201 */
a26195a5
QW
202static struct btrfs_backref_node *walk_down_backref(
203 struct btrfs_backref_edge *edges[], int *index)
5d4f98a2 204{
a26195a5
QW
205 struct btrfs_backref_edge *edge;
206 struct btrfs_backref_node *lower;
5d4f98a2
YZ
207 int idx = *index;
208
209 while (idx > 0) {
210 edge = edges[idx - 1];
211 lower = edge->node[LOWER];
212 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
213 idx--;
214 continue;
215 }
216 edge = list_entry(edge->list[LOWER].next,
a26195a5 217 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
218 edges[idx - 1] = edge;
219 *index = idx;
220 return edge->node[UPPER];
221 }
222 *index = 0;
223 return NULL;
224}
5d4f98a2 225
a26195a5
QW
226static void update_backref_node(struct btrfs_backref_cache *cache,
227 struct btrfs_backref_node *node, u64 bytenr)
3fd0a558
YZ
228{
229 struct rb_node *rb_node;
230 rb_erase(&node->rb_node, &cache->rb_root);
231 node->bytenr = bytenr;
e9a28dc5 232 rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
43c04fb1 233 if (rb_node)
982c92cb 234 btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
3fd0a558
YZ
235}
236
237/*
238 * update backref cache after a transaction commit
239 */
240static int update_backref_cache(struct btrfs_trans_handle *trans,
a26195a5 241 struct btrfs_backref_cache *cache)
3fd0a558 242{
a26195a5 243 struct btrfs_backref_node *node;
3fd0a558
YZ
244 int level = 0;
245
246 if (cache->last_trans == 0) {
247 cache->last_trans = trans->transid;
248 return 0;
249 }
250
251 if (cache->last_trans == trans->transid)
252 return 0;
253
254 /*
255 * detached nodes are used to avoid unnecessary backref
256 * lookup. transaction commit changes the extent tree.
257 * so the detached nodes are no longer useful.
258 */
259 while (!list_empty(&cache->detached)) {
260 node = list_entry(cache->detached.next,
a26195a5 261 struct btrfs_backref_node, list);
023acb07 262 btrfs_backref_cleanup_node(cache, node);
3fd0a558
YZ
263 }
264
265 while (!list_empty(&cache->changed)) {
266 node = list_entry(cache->changed.next,
a26195a5 267 struct btrfs_backref_node, list);
3fd0a558
YZ
268 list_del_init(&node->list);
269 BUG_ON(node->pending);
270 update_backref_node(cache, node, node->new_bytenr);
271 }
272
273 /*
274 * some nodes can be left in the pending list if there were
275 * errors during processing the pending nodes.
276 */
277 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
278 list_for_each_entry(node, &cache->pending[level], list) {
279 BUG_ON(!node->pending);
280 if (node->bytenr == node->new_bytenr)
281 continue;
282 update_backref_node(cache, node, node->new_bytenr);
283 }
284 }
285
286 cache->last_trans = 0;
287 return 1;
288}
289
6282675e
QW
290static bool reloc_root_is_dead(struct btrfs_root *root)
291{
292 /*
293 * Pair with set_bit/clear_bit in clean_dirty_subvols and
294 * btrfs_update_reloc_root. We need to see the updated bit before
295 * trying to access reloc_root
296 */
297 smp_rmb();
298 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
299 return true;
300 return false;
301}
302
303/*
304 * Check if this subvolume tree has valid reloc tree.
305 *
306 * Reloc tree after swap is considered dead, thus not considered as valid.
307 * This is enough for most callers, as they don't distinguish dead reloc root
55465730
QW
308 * from no reloc root. But btrfs_should_ignore_reloc_root() below is a
309 * special case.
6282675e
QW
310 */
311static bool have_reloc_root(struct btrfs_root *root)
312{
313 if (reloc_root_is_dead(root))
314 return false;
315 if (!root->reloc_root)
316 return false;
317 return true;
318}
f2a97a9d 319
55465730 320int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
3fd0a558
YZ
321{
322 struct btrfs_root *reloc_root;
323
92a7cc42 324 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
3fd0a558
YZ
325 return 0;
326
6282675e
QW
327 /* This root has been merged with its reloc tree, we can ignore it */
328 if (reloc_root_is_dead(root))
329 return 1;
330
3fd0a558
YZ
331 reloc_root = root->reloc_root;
332 if (!reloc_root)
333 return 0;
334
4d4225fc
JB
335 if (btrfs_header_generation(reloc_root->commit_root) ==
336 root->fs_info->running_transaction->transid)
3fd0a558
YZ
337 return 0;
338 /*
339 * if there is reloc tree and it was created in previous
340 * transaction backref lookup can find the reloc tree,
341 * so backref node for the fs tree root is useless for
342 * relocation.
343 */
344 return 1;
345}
55465730 346
5d4f98a2
YZ
347/*
348 * find reloc tree by address of tree root
349 */
2433bea5 350struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
5d4f98a2 351{
2433bea5 352 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
353 struct rb_node *rb_node;
354 struct mapping_node *node;
355 struct btrfs_root *root = NULL;
356
2433bea5 357 ASSERT(rc);
5d4f98a2 358 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5 359 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
5d4f98a2
YZ
360 if (rb_node) {
361 node = rb_entry(rb_node, struct mapping_node, rb_node);
362 root = (struct btrfs_root *)node->data;
363 }
364 spin_unlock(&rc->reloc_root_tree.lock);
00246528 365 return btrfs_grab_root(root);
5d4f98a2
YZ
366}
367
29db137b
QW
368/*
369 * For useless nodes, do two major clean ups:
370 *
371 * - Cleanup the children edges and nodes
372 * If child node is also orphan (no parent) during cleanup, then the child
373 * node will also be cleaned up.
374 *
375 * - Freeing up leaves (level 0), keeps nodes detached
376 * For nodes, the node is still cached as "detached"
377 *
378 * Return false if @node is not in the @useless_nodes list.
379 * Return true if @node is in the @useless_nodes list.
380 */
381static bool handle_useless_nodes(struct reloc_control *rc,
a26195a5 382 struct btrfs_backref_node *node)
29db137b 383{
a26195a5 384 struct btrfs_backref_cache *cache = &rc->backref_cache;
29db137b
QW
385 struct list_head *useless_node = &cache->useless_node;
386 bool ret = false;
387
388 while (!list_empty(useless_node)) {
a26195a5 389 struct btrfs_backref_node *cur;
29db137b 390
a26195a5 391 cur = list_first_entry(useless_node, struct btrfs_backref_node,
29db137b
QW
392 list);
393 list_del_init(&cur->list);
394
395 /* Only tree root nodes can be added to @useless_nodes */
396 ASSERT(list_empty(&cur->upper));
397
398 if (cur == node)
399 ret = true;
400
401 /* The node is the lowest node */
402 if (cur->lowest) {
403 list_del_init(&cur->lower);
404 cur->lowest = 0;
405 }
406
407 /* Cleanup the lower edges */
408 while (!list_empty(&cur->lower)) {
a26195a5
QW
409 struct btrfs_backref_edge *edge;
410 struct btrfs_backref_node *lower;
29db137b
QW
411
412 edge = list_entry(cur->lower.next,
a26195a5 413 struct btrfs_backref_edge, list[UPPER]);
29db137b
QW
414 list_del(&edge->list[UPPER]);
415 list_del(&edge->list[LOWER]);
416 lower = edge->node[LOWER];
741188d3 417 btrfs_backref_free_edge(cache, edge);
29db137b
QW
418
419 /* Child node is also orphan, queue for cleanup */
420 if (list_empty(&lower->upper))
421 list_add(&lower->list, useless_node);
422 }
423 /* Mark this block processed for relocation */
424 mark_block_processed(rc, cur);
425
426 /*
427 * Backref nodes for tree leaves are deleted from the cache.
428 * Backref nodes for upper level tree blocks are left in the
429 * cache to avoid unnecessary backref lookup.
430 */
431 if (cur->level > 0) {
432 list_add(&cur->list, &cache->detached);
433 cur->detached = 1;
434 } else {
435 rb_erase(&cur->rb_node, &cache->rb_root);
741188d3 436 btrfs_backref_free_node(cache, cur);
29db137b
QW
437 }
438 }
439 return ret;
440}
441
e7d571c7
QW
442/*
443 * Build backref tree for a given tree block. Root of the backref tree
444 * corresponds the tree block, leaves of the backref tree correspond roots of
445 * b-trees that reference the tree block.
446 *
447 * The basic idea of this function is check backrefs of a given block to find
448 * upper level blocks that reference the block, and then check backrefs of
449 * these upper level blocks recursively. The recursion stops when tree root is
450 * reached or backrefs for the block is cached.
451 *
452 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
453 * all upper level blocks that directly/indirectly reference the block are also
454 * cached.
455 */
a26195a5 456static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
e7d571c7
QW
457 struct reloc_control *rc, struct btrfs_key *node_key,
458 int level, u64 bytenr)
459{
460 struct btrfs_backref_iter *iter;
a26195a5 461 struct btrfs_backref_cache *cache = &rc->backref_cache;
e7d571c7
QW
462 /* For searching parent of TREE_BLOCK_REF */
463 struct btrfs_path *path;
a26195a5 464 struct btrfs_backref_node *cur;
a26195a5
QW
465 struct btrfs_backref_node *node = NULL;
466 struct btrfs_backref_edge *edge;
e7d571c7
QW
467 int ret;
468 int err = 0;
5d4f98a2 469
e7d571c7
QW
470 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
471 if (!iter)
472 return ERR_PTR(-ENOMEM);
473 path = btrfs_alloc_path();
474 if (!path) {
475 err = -ENOMEM;
476 goto out;
477 }
478
b1818dab 479 node = btrfs_backref_alloc_node(cache, bytenr, level);
e7d571c7
QW
480 if (!node) {
481 err = -ENOMEM;
482 goto out;
5d4f98a2
YZ
483 }
484
e7d571c7
QW
485 node->lowest = 1;
486 cur = node;
487
488 /* Breadth-first search to build backref cache */
489 do {
1b60d2ec
QW
490 ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
491 cur);
e7d571c7
QW
492 if (ret < 0) {
493 err = ret;
494 goto out;
495 }
496 edge = list_first_entry_or_null(&cache->pending_edge,
a26195a5 497 struct btrfs_backref_edge, list[UPPER]);
e7d571c7
QW
498 /*
499 * The pending list isn't empty, take the first block to
500 * process
501 */
502 if (edge) {
503 list_del_init(&edge->list[UPPER]);
504 cur = edge->node[UPPER];
505 }
506 } while (edge);
507
1f872924 508 /* Finish the upper linkage of newly added edges/nodes */
fc997ed0 509 ret = btrfs_backref_finish_upper_links(cache, node);
1f872924
QW
510 if (ret < 0) {
511 err = ret;
512 goto out;
3fd0a558 513 }
5d4f98a2 514
29db137b
QW
515 if (handle_useless_nodes(rc, node))
516 node = NULL;
5d4f98a2 517out:
71f572a9
QW
518 btrfs_backref_iter_free(iter);
519 btrfs_free_path(path);
5d4f98a2 520 if (err) {
1b23ea18 521 btrfs_backref_error_cleanup(cache, node);
5d4f98a2
YZ
522 return ERR_PTR(err);
523 }
75bfb9af 524 ASSERT(!node || !node->detached);
84780289
QW
525 ASSERT(list_empty(&cache->useless_node) &&
526 list_empty(&cache->pending_edge));
5d4f98a2
YZ
527 return node;
528}
529
3fd0a558
YZ
530/*
531 * helper to add backref node for the newly created snapshot.
532 * the backref node is created by cloning backref node that
533 * corresponds to root of source tree
534 */
535static int clone_backref_node(struct btrfs_trans_handle *trans,
536 struct reloc_control *rc,
537 struct btrfs_root *src,
538 struct btrfs_root *dest)
539{
540 struct btrfs_root *reloc_root = src->reloc_root;
a26195a5
QW
541 struct btrfs_backref_cache *cache = &rc->backref_cache;
542 struct btrfs_backref_node *node = NULL;
543 struct btrfs_backref_node *new_node;
544 struct btrfs_backref_edge *edge;
545 struct btrfs_backref_edge *new_edge;
3fd0a558
YZ
546 struct rb_node *rb_node;
547
548 if (cache->last_trans > 0)
549 update_backref_cache(trans, cache);
550
e9a28dc5 551 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
3fd0a558 552 if (rb_node) {
a26195a5 553 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
3fd0a558
YZ
554 if (node->detached)
555 node = NULL;
556 else
557 BUG_ON(node->new_bytenr != reloc_root->node->start);
558 }
559
560 if (!node) {
e9a28dc5
QW
561 rb_node = rb_simple_search(&cache->rb_root,
562 reloc_root->commit_root->start);
3fd0a558 563 if (rb_node) {
a26195a5 564 node = rb_entry(rb_node, struct btrfs_backref_node,
3fd0a558
YZ
565 rb_node);
566 BUG_ON(node->detached);
567 }
568 }
569
570 if (!node)
571 return 0;
572
b1818dab
QW
573 new_node = btrfs_backref_alloc_node(cache, dest->node->start,
574 node->level);
3fd0a558
YZ
575 if (!new_node)
576 return -ENOMEM;
577
3fd0a558 578 new_node->lowest = node->lowest;
6848ad64 579 new_node->checked = 1;
00246528 580 new_node->root = btrfs_grab_root(dest);
0b530bc5 581 ASSERT(new_node->root);
3fd0a558
YZ
582
583 if (!node->lowest) {
584 list_for_each_entry(edge, &node->lower, list[UPPER]) {
47254d07 585 new_edge = btrfs_backref_alloc_edge(cache);
3fd0a558
YZ
586 if (!new_edge)
587 goto fail;
588
f39911e5
QW
589 btrfs_backref_link_edge(new_edge, edge->node[LOWER],
590 new_node, LINK_UPPER);
3fd0a558 591 }
76b9e23d
MX
592 } else {
593 list_add_tail(&new_node->lower, &cache->leaves);
3fd0a558
YZ
594 }
595
e9a28dc5
QW
596 rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
597 &new_node->rb_node);
43c04fb1 598 if (rb_node)
982c92cb 599 btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
3fd0a558
YZ
600
601 if (!new_node->lowest) {
602 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
603 list_add_tail(&new_edge->list[LOWER],
604 &new_edge->node[LOWER]->upper);
605 }
606 }
607 return 0;
608fail:
609 while (!list_empty(&new_node->lower)) {
610 new_edge = list_entry(new_node->lower.next,
a26195a5 611 struct btrfs_backref_edge, list[UPPER]);
3fd0a558 612 list_del(&new_edge->list[UPPER]);
741188d3 613 btrfs_backref_free_edge(cache, new_edge);
3fd0a558 614 }
741188d3 615 btrfs_backref_free_node(cache, new_node);
3fd0a558
YZ
616 return -ENOMEM;
617}
618
5d4f98a2
YZ
619/*
620 * helper to add 'address of tree root -> reloc tree' mapping
621 */
ffd7b339 622static int __must_check __add_reloc_root(struct btrfs_root *root)
5d4f98a2 623{
0b246afa 624 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
625 struct rb_node *rb_node;
626 struct mapping_node *node;
0b246afa 627 struct reloc_control *rc = fs_info->reloc_ctl;
5d4f98a2
YZ
628
629 node = kmalloc(sizeof(*node), GFP_NOFS);
ffd7b339
JM
630 if (!node)
631 return -ENOMEM;
5d4f98a2 632
ea287ab1 633 node->bytenr = root->commit_root->start;
5d4f98a2
YZ
634 node->data = root;
635
636 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
637 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
638 node->bytenr, &node->rb_node);
5d4f98a2 639 spin_unlock(&rc->reloc_root_tree.lock);
ffd7b339 640 if (rb_node) {
0b246afa 641 btrfs_panic(fs_info, -EEXIST,
5d163e0e
JM
642 "Duplicate root found for start=%llu while inserting into relocation tree",
643 node->bytenr);
ffd7b339 644 }
5d4f98a2
YZ
645
646 list_add_tail(&root->root_list, &rc->reloc_roots);
647 return 0;
648}
649
650/*
c974c464 651 * helper to delete the 'address of tree root -> reloc tree'
5d4f98a2
YZ
652 * mapping
653 */
c974c464 654static void __del_reloc_root(struct btrfs_root *root)
5d4f98a2 655{
0b246afa 656 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
657 struct rb_node *rb_node;
658 struct mapping_node *node = NULL;
0b246afa 659 struct reloc_control *rc = fs_info->reloc_ctl;
f44deb74 660 bool put_ref = false;
5d4f98a2 661
65c6e82b 662 if (rc && root->node) {
389305b2 663 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
664 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
665 root->commit_root->start);
389305b2
QW
666 if (rb_node) {
667 node = rb_entry(rb_node, struct mapping_node, rb_node);
668 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
ea287ab1 669 RB_CLEAR_NODE(&node->rb_node);
389305b2
QW
670 }
671 spin_unlock(&rc->reloc_root_tree.lock);
672 if (!node)
673 return;
674 BUG_ON((struct btrfs_root *)node->data != root);
5d4f98a2 675 }
5d4f98a2 676
f44deb74
JB
677 /*
678 * We only put the reloc root here if it's on the list. There's a lot
679 * of places where the pattern is to splice the rc->reloc_roots, process
680 * the reloc roots, and then add the reloc root back onto
681 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
682 * list we don't want the reference being dropped, because the guy
683 * messing with the list is in charge of the reference.
684 */
0b246afa 685 spin_lock(&fs_info->trans_lock);
f44deb74
JB
686 if (!list_empty(&root->root_list)) {
687 put_ref = true;
688 list_del_init(&root->root_list);
689 }
0b246afa 690 spin_unlock(&fs_info->trans_lock);
f44deb74
JB
691 if (put_ref)
692 btrfs_put_root(root);
c974c464
WS
693 kfree(node);
694}
695
696/*
697 * helper to update the 'address of tree root -> reloc tree'
698 * mapping
699 */
ea287ab1 700static int __update_reloc_root(struct btrfs_root *root)
c974c464 701{
0b246afa 702 struct btrfs_fs_info *fs_info = root->fs_info;
c974c464
WS
703 struct rb_node *rb_node;
704 struct mapping_node *node = NULL;
0b246afa 705 struct reloc_control *rc = fs_info->reloc_ctl;
c974c464
WS
706
707 spin_lock(&rc->reloc_root_tree.lock);
e9a28dc5
QW
708 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
709 root->commit_root->start);
c974c464
WS
710 if (rb_node) {
711 node = rb_entry(rb_node, struct mapping_node, rb_node);
712 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
5d4f98a2 713 }
c974c464
WS
714 spin_unlock(&rc->reloc_root_tree.lock);
715
716 if (!node)
717 return 0;
718 BUG_ON((struct btrfs_root *)node->data != root);
719
720 spin_lock(&rc->reloc_root_tree.lock);
ea287ab1 721 node->bytenr = root->node->start;
e9a28dc5
QW
722 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
723 node->bytenr, &node->rb_node);
c974c464
WS
724 spin_unlock(&rc->reloc_root_tree.lock);
725 if (rb_node)
982c92cb 726 btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
5d4f98a2
YZ
727 return 0;
728}
729
3fd0a558
YZ
730static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
731 struct btrfs_root *root, u64 objectid)
5d4f98a2 732{
0b246afa 733 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
734 struct btrfs_root *reloc_root;
735 struct extent_buffer *eb;
736 struct btrfs_root_item *root_item;
737 struct btrfs_key root_key;
738 int ret;
739
5d4f98a2
YZ
740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
741 BUG_ON(!root_item);
742
743 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
744 root_key.type = BTRFS_ROOT_ITEM_KEY;
3fd0a558 745 root_key.offset = objectid;
5d4f98a2 746
3fd0a558 747 if (root->root_key.objectid == objectid) {
054570a1
FM
748 u64 commit_root_gen;
749
3fd0a558
YZ
750 /* called by btrfs_init_reloc_root */
751 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
752 BTRFS_TREE_RELOC_OBJECTID);
753 BUG_ON(ret);
054570a1
FM
754 /*
755 * Set the last_snapshot field to the generation of the commit
756 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
757 * correctly (returns true) when the relocation root is created
758 * either inside the critical section of a transaction commit
759 * (through transaction.c:qgroup_account_snapshot()) and when
760 * it's created before the transaction commit is started.
761 */
762 commit_root_gen = btrfs_header_generation(root->commit_root);
763 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
3fd0a558
YZ
764 } else {
765 /*
766 * called by btrfs_reloc_post_snapshot_hook.
767 * the source tree is a reloc tree, all tree blocks
768 * modified after it was created have RELOC flag
769 * set in their headers. so it's OK to not update
770 * the 'last_snapshot'.
771 */
772 ret = btrfs_copy_root(trans, root, root->node, &eb,
773 BTRFS_TREE_RELOC_OBJECTID);
774 BUG_ON(ret);
775 }
5d4f98a2 776
5d4f98a2 777 memcpy(root_item, &root->root_item, sizeof(*root_item));
5d4f98a2
YZ
778 btrfs_set_root_bytenr(root_item, eb->start);
779 btrfs_set_root_level(root_item, btrfs_header_level(eb));
780 btrfs_set_root_generation(root_item, trans->transid);
3fd0a558
YZ
781
782 if (root->root_key.objectid == objectid) {
783 btrfs_set_root_refs(root_item, 0);
784 memset(&root_item->drop_progress, 0,
785 sizeof(struct btrfs_disk_key));
c8422684 786 btrfs_set_root_drop_level(root_item, 0);
3fd0a558 787 }
5d4f98a2
YZ
788
789 btrfs_tree_unlock(eb);
790 free_extent_buffer(eb);
791
0b246afa 792 ret = btrfs_insert_root(trans, fs_info->tree_root,
5d4f98a2
YZ
793 &root_key, root_item);
794 BUG_ON(ret);
795 kfree(root_item);
796
3dbf1738 797 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
5d4f98a2 798 BUG_ON(IS_ERR(reloc_root));
92a7cc42 799 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
5d4f98a2 800 reloc_root->last_trans = trans->transid;
3fd0a558
YZ
801 return reloc_root;
802}
803
804/*
805 * create reloc tree for a given fs tree. reloc tree is just a
806 * snapshot of the fs tree with special root objectid.
f44deb74
JB
807 *
808 * The reloc_root comes out of here with two references, one for
809 * root->reloc_root, and another for being on the rc->reloc_roots list.
3fd0a558
YZ
810 */
811int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
812 struct btrfs_root *root)
813{
0b246afa 814 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 815 struct btrfs_root *reloc_root;
0b246afa 816 struct reloc_control *rc = fs_info->reloc_ctl;
20dd2cbf 817 struct btrfs_block_rsv *rsv;
3fd0a558 818 int clear_rsv = 0;
ffd7b339 819 int ret;
3fd0a558 820
aec7db3b 821 if (!rc)
2abc726a
JB
822 return 0;
823
1fac4a54
QW
824 /*
825 * The subvolume has reloc tree but the swap is finished, no need to
826 * create/update the dead reloc tree
827 */
6282675e 828 if (reloc_root_is_dead(root))
1fac4a54
QW
829 return 0;
830
aec7db3b
JB
831 /*
832 * This is subtle but important. We do not do
833 * record_root_in_transaction for reloc roots, instead we record their
834 * corresponding fs root, and then here we update the last trans for the
835 * reloc root. This means that we have to do this for the entire life
836 * of the reloc root, regardless of which stage of the relocation we are
837 * in.
838 */
3fd0a558
YZ
839 if (root->reloc_root) {
840 reloc_root = root->reloc_root;
841 reloc_root->last_trans = trans->transid;
842 return 0;
843 }
844
aec7db3b
JB
845 /*
846 * We are merging reloc roots, we do not need new reloc trees. Also
847 * reloc trees never need their own reloc tree.
848 */
849 if (!rc->create_reloc_tree ||
850 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
851 return 0;
852
20dd2cbf
MX
853 if (!trans->reloc_reserved) {
854 rsv = trans->block_rsv;
3fd0a558
YZ
855 trans->block_rsv = rc->block_rsv;
856 clear_rsv = 1;
857 }
858 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
859 if (clear_rsv)
20dd2cbf 860 trans->block_rsv = rsv;
5d4f98a2 861
ffd7b339
JM
862 ret = __add_reloc_root(reloc_root);
863 BUG_ON(ret < 0);
f44deb74 864 root->reloc_root = btrfs_grab_root(reloc_root);
5d4f98a2
YZ
865 return 0;
866}
867
868/*
869 * update root item of reloc tree
870 */
871int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
872 struct btrfs_root *root)
873{
0b246afa 874 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
875 struct btrfs_root *reloc_root;
876 struct btrfs_root_item *root_item;
5d4f98a2
YZ
877 int ret;
878
6282675e 879 if (!have_reloc_root(root))
7585717f 880 goto out;
5d4f98a2
YZ
881
882 reloc_root = root->reloc_root;
883 root_item = &reloc_root->root_item;
884
f44deb74
JB
885 /*
886 * We are probably ok here, but __del_reloc_root() will drop its ref of
887 * the root. We have the ref for root->reloc_root, but just in case
888 * hold it while we update the reloc root.
889 */
890 btrfs_grab_root(reloc_root);
891
d2311e69 892 /* root->reloc_root will stay until current relocation finished */
0b246afa 893 if (fs_info->reloc_ctl->merge_reloc_tree &&
3fd0a558 894 btrfs_root_refs(root_item) == 0) {
d2311e69 895 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
6282675e
QW
896 /*
897 * Mark the tree as dead before we change reloc_root so
898 * have_reloc_root will not touch it from now on.
899 */
900 smp_wmb();
c974c464 901 __del_reloc_root(reloc_root);
5d4f98a2
YZ
902 }
903
5d4f98a2 904 if (reloc_root->commit_root != reloc_root->node) {
ea287ab1 905 __update_reloc_root(reloc_root);
5d4f98a2
YZ
906 btrfs_set_root_node(root_item, reloc_root->node);
907 free_extent_buffer(reloc_root->commit_root);
908 reloc_root->commit_root = btrfs_root_node(reloc_root);
909 }
910
0b246afa 911 ret = btrfs_update_root(trans, fs_info->tree_root,
5d4f98a2
YZ
912 &reloc_root->root_key, root_item);
913 BUG_ON(ret);
f44deb74 914 btrfs_put_root(reloc_root);
7585717f 915out:
5d4f98a2
YZ
916 return 0;
917}
918
919/*
920 * helper to find first cached inode with inode number >= objectid
921 * in a subvolume
922 */
923static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
924{
925 struct rb_node *node;
926 struct rb_node *prev;
927 struct btrfs_inode *entry;
928 struct inode *inode;
929
930 spin_lock(&root->inode_lock);
931again:
932 node = root->inode_tree.rb_node;
933 prev = NULL;
934 while (node) {
935 prev = node;
936 entry = rb_entry(node, struct btrfs_inode, rb_node);
937
4a0cc7ca 938 if (objectid < btrfs_ino(entry))
5d4f98a2 939 node = node->rb_left;
4a0cc7ca 940 else if (objectid > btrfs_ino(entry))
5d4f98a2
YZ
941 node = node->rb_right;
942 else
943 break;
944 }
945 if (!node) {
946 while (prev) {
947 entry = rb_entry(prev, struct btrfs_inode, rb_node);
4a0cc7ca 948 if (objectid <= btrfs_ino(entry)) {
5d4f98a2
YZ
949 node = prev;
950 break;
951 }
952 prev = rb_next(prev);
953 }
954 }
955 while (node) {
956 entry = rb_entry(node, struct btrfs_inode, rb_node);
957 inode = igrab(&entry->vfs_inode);
958 if (inode) {
959 spin_unlock(&root->inode_lock);
960 return inode;
961 }
962
4a0cc7ca 963 objectid = btrfs_ino(entry) + 1;
5d4f98a2
YZ
964 if (cond_resched_lock(&root->inode_lock))
965 goto again;
966
967 node = rb_next(node);
968 }
969 spin_unlock(&root->inode_lock);
970 return NULL;
971}
972
5d4f98a2
YZ
973/*
974 * get new location of data
975 */
976static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
977 u64 bytenr, u64 num_bytes)
978{
979 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
980 struct btrfs_path *path;
981 struct btrfs_file_extent_item *fi;
982 struct extent_buffer *leaf;
983 int ret;
984
985 path = btrfs_alloc_path();
986 if (!path)
987 return -ENOMEM;
988
989 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
f85b7379
DS
990 ret = btrfs_lookup_file_extent(NULL, root, path,
991 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
5d4f98a2
YZ
992 if (ret < 0)
993 goto out;
994 if (ret > 0) {
995 ret = -ENOENT;
996 goto out;
997 }
998
999 leaf = path->nodes[0];
1000 fi = btrfs_item_ptr(leaf, path->slots[0],
1001 struct btrfs_file_extent_item);
1002
1003 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1004 btrfs_file_extent_compression(leaf, fi) ||
1005 btrfs_file_extent_encryption(leaf, fi) ||
1006 btrfs_file_extent_other_encoding(leaf, fi));
1007
1008 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
83d4cfd4 1009 ret = -EINVAL;
5d4f98a2
YZ
1010 goto out;
1011 }
1012
3fd0a558 1013 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5d4f98a2
YZ
1014 ret = 0;
1015out:
1016 btrfs_free_path(path);
1017 return ret;
1018}
1019
1020/*
1021 * update file extent items in the tree leaf to point to
1022 * the new locations.
1023 */
3fd0a558
YZ
1024static noinline_for_stack
1025int replace_file_extents(struct btrfs_trans_handle *trans,
1026 struct reloc_control *rc,
1027 struct btrfs_root *root,
1028 struct extent_buffer *leaf)
5d4f98a2 1029{
0b246afa 1030 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1031 struct btrfs_key key;
1032 struct btrfs_file_extent_item *fi;
1033 struct inode *inode = NULL;
5d4f98a2
YZ
1034 u64 parent;
1035 u64 bytenr;
3fd0a558 1036 u64 new_bytenr = 0;
5d4f98a2
YZ
1037 u64 num_bytes;
1038 u64 end;
1039 u32 nritems;
1040 u32 i;
83d4cfd4 1041 int ret = 0;
5d4f98a2
YZ
1042 int first = 1;
1043 int dirty = 0;
1044
1045 if (rc->stage != UPDATE_DATA_PTRS)
1046 return 0;
1047
1048 /* reloc trees always use full backref */
1049 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1050 parent = leaf->start;
1051 else
1052 parent = 0;
1053
1054 nritems = btrfs_header_nritems(leaf);
1055 for (i = 0; i < nritems; i++) {
82fa113f
QW
1056 struct btrfs_ref ref = { 0 };
1057
5d4f98a2
YZ
1058 cond_resched();
1059 btrfs_item_key_to_cpu(leaf, &key, i);
1060 if (key.type != BTRFS_EXTENT_DATA_KEY)
1061 continue;
1062 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1063 if (btrfs_file_extent_type(leaf, fi) ==
1064 BTRFS_FILE_EXTENT_INLINE)
1065 continue;
1066 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1067 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1068 if (bytenr == 0)
1069 continue;
9569cc20
QW
1070 if (!in_range(bytenr, rc->block_group->start,
1071 rc->block_group->length))
5d4f98a2
YZ
1072 continue;
1073
1074 /*
1075 * if we are modifying block in fs tree, wait for readpage
1076 * to complete and drop the extent cache
1077 */
1078 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5d4f98a2
YZ
1079 if (first) {
1080 inode = find_next_inode(root, key.objectid);
5d4f98a2 1081 first = 0;
4a0cc7ca 1082 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
3fd0a558 1083 btrfs_add_delayed_iput(inode);
5d4f98a2 1084 inode = find_next_inode(root, key.objectid);
5d4f98a2 1085 }
4a0cc7ca 1086 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
5d4f98a2
YZ
1087 end = key.offset +
1088 btrfs_file_extent_num_bytes(leaf, fi);
1089 WARN_ON(!IS_ALIGNED(key.offset,
0b246afa
JM
1090 fs_info->sectorsize));
1091 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
1092 end--;
1093 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1094 key.offset, end);
5d4f98a2
YZ
1095 if (!ret)
1096 continue;
1097
dcdbc059
NB
1098 btrfs_drop_extent_cache(BTRFS_I(inode),
1099 key.offset, end, 1);
5d4f98a2 1100 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 1101 key.offset, end);
5d4f98a2
YZ
1102 }
1103 }
1104
1105 ret = get_new_location(rc->data_inode, &new_bytenr,
1106 bytenr, num_bytes);
83d4cfd4
JB
1107 if (ret) {
1108 /*
1109 * Don't have to abort since we've not changed anything
1110 * in the file extent yet.
1111 */
1112 break;
3fd0a558 1113 }
5d4f98a2
YZ
1114
1115 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1116 dirty = 1;
1117
1118 key.offset -= btrfs_file_extent_offset(leaf, fi);
82fa113f
QW
1119 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1120 num_bytes, parent);
1121 ref.real_root = root->root_key.objectid;
1122 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1123 key.objectid, key.offset);
1124 ret = btrfs_inc_extent_ref(trans, &ref);
83d4cfd4 1125 if (ret) {
66642832 1126 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1127 break;
1128 }
5d4f98a2 1129
ffd4bb2a
QW
1130 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1131 num_bytes, parent);
1132 ref.real_root = root->root_key.objectid;
1133 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1134 key.objectid, key.offset);
1135 ret = btrfs_free_extent(trans, &ref);
83d4cfd4 1136 if (ret) {
66642832 1137 btrfs_abort_transaction(trans, ret);
83d4cfd4
JB
1138 break;
1139 }
5d4f98a2
YZ
1140 }
1141 if (dirty)
1142 btrfs_mark_buffer_dirty(leaf);
3fd0a558
YZ
1143 if (inode)
1144 btrfs_add_delayed_iput(inode);
83d4cfd4 1145 return ret;
5d4f98a2
YZ
1146}
1147
1148static noinline_for_stack
1149int memcmp_node_keys(struct extent_buffer *eb, int slot,
1150 struct btrfs_path *path, int level)
1151{
1152 struct btrfs_disk_key key1;
1153 struct btrfs_disk_key key2;
1154 btrfs_node_key(eb, &key1, slot);
1155 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1156 return memcmp(&key1, &key2, sizeof(key1));
1157}
1158
1159/*
1160 * try to replace tree blocks in fs tree with the new blocks
1161 * in reloc tree. tree blocks haven't been modified since the
1162 * reloc tree was create can be replaced.
1163 *
1164 * if a block was replaced, level of the block + 1 is returned.
1165 * if no block got replaced, 0 is returned. if there are other
1166 * errors, a negative error number is returned.
1167 */
3fd0a558 1168static noinline_for_stack
3d0174f7 1169int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
3fd0a558
YZ
1170 struct btrfs_root *dest, struct btrfs_root *src,
1171 struct btrfs_path *path, struct btrfs_key *next_key,
1172 int lowest_level, int max_level)
5d4f98a2 1173{
0b246afa 1174 struct btrfs_fs_info *fs_info = dest->fs_info;
5d4f98a2
YZ
1175 struct extent_buffer *eb;
1176 struct extent_buffer *parent;
82fa113f 1177 struct btrfs_ref ref = { 0 };
5d4f98a2
YZ
1178 struct btrfs_key key;
1179 u64 old_bytenr;
1180 u64 new_bytenr;
1181 u64 old_ptr_gen;
1182 u64 new_ptr_gen;
1183 u64 last_snapshot;
1184 u32 blocksize;
3fd0a558 1185 int cow = 0;
5d4f98a2
YZ
1186 int level;
1187 int ret;
1188 int slot;
1189
1190 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1191 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
5d4f98a2
YZ
1192
1193 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
3fd0a558 1194again:
5d4f98a2
YZ
1195 slot = path->slots[lowest_level];
1196 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1197
1198 eb = btrfs_lock_root_node(dest);
5d4f98a2
YZ
1199 level = btrfs_header_level(eb);
1200
1201 if (level < lowest_level) {
1202 btrfs_tree_unlock(eb);
1203 free_extent_buffer(eb);
1204 return 0;
1205 }
1206
3fd0a558 1207 if (cow) {
9631e4cc
JB
1208 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
1209 BTRFS_NESTING_COW);
3fd0a558
YZ
1210 BUG_ON(ret);
1211 }
5d4f98a2
YZ
1212
1213 if (next_key) {
1214 next_key->objectid = (u64)-1;
1215 next_key->type = (u8)-1;
1216 next_key->offset = (u64)-1;
1217 }
1218
1219 parent = eb;
1220 while (1) {
1221 level = btrfs_header_level(parent);
1222 BUG_ON(level < lowest_level);
1223
e3b83361 1224 ret = btrfs_bin_search(parent, &key, &slot);
cbca7d59
FM
1225 if (ret < 0)
1226 break;
5d4f98a2
YZ
1227 if (ret && slot > 0)
1228 slot--;
1229
1230 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1231 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1232
1233 old_bytenr = btrfs_node_blockptr(parent, slot);
0b246afa 1234 blocksize = fs_info->nodesize;
5d4f98a2
YZ
1235 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1236
1237 if (level <= max_level) {
1238 eb = path->nodes[level];
1239 new_bytenr = btrfs_node_blockptr(eb,
1240 path->slots[level]);
1241 new_ptr_gen = btrfs_node_ptr_generation(eb,
1242 path->slots[level]);
1243 } else {
1244 new_bytenr = 0;
1245 new_ptr_gen = 0;
1246 }
1247
fae7f21c 1248 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
5d4f98a2
YZ
1249 ret = level;
1250 break;
1251 }
1252
1253 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1254 memcmp_node_keys(parent, slot, path, level)) {
3fd0a558 1255 if (level <= lowest_level) {
5d4f98a2
YZ
1256 ret = 0;
1257 break;
1258 }
1259
6b3426be 1260 eb = btrfs_read_node_slot(parent, slot);
64c043de
LB
1261 if (IS_ERR(eb)) {
1262 ret = PTR_ERR(eb);
264813ac 1263 break;
416bc658 1264 }
5d4f98a2 1265 btrfs_tree_lock(eb);
3fd0a558
YZ
1266 if (cow) {
1267 ret = btrfs_cow_block(trans, dest, eb, parent,
9631e4cc
JB
1268 slot, &eb,
1269 BTRFS_NESTING_COW);
3fd0a558 1270 BUG_ON(ret);
5d4f98a2
YZ
1271 }
1272
1273 btrfs_tree_unlock(parent);
1274 free_extent_buffer(parent);
1275
1276 parent = eb;
1277 continue;
1278 }
1279
3fd0a558
YZ
1280 if (!cow) {
1281 btrfs_tree_unlock(parent);
1282 free_extent_buffer(parent);
1283 cow = 1;
1284 goto again;
1285 }
1286
5d4f98a2
YZ
1287 btrfs_node_key_to_cpu(path->nodes[level], &key,
1288 path->slots[level]);
b3b4aa74 1289 btrfs_release_path(path);
5d4f98a2
YZ
1290
1291 path->lowest_level = level;
1292 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1293 path->lowest_level = 0;
1294 BUG_ON(ret);
1295
824d8dff
QW
1296 /*
1297 * Info qgroup to trace both subtrees.
1298 *
1299 * We must trace both trees.
1300 * 1) Tree reloc subtree
1301 * If not traced, we will leak data numbers
1302 * 2) Fs subtree
1303 * If not traced, we will double count old data
f616f5cd
QW
1304 *
1305 * We don't scan the subtree right now, but only record
1306 * the swapped tree blocks.
1307 * The real subtree rescan is delayed until we have new
1308 * CoW on the subtree root node before transaction commit.
824d8dff 1309 */
370a11b8
QW
1310 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1311 rc->block_group, parent, slot,
1312 path->nodes[level], path->slots[level],
1313 last_snapshot);
1314 if (ret < 0)
1315 break;
5d4f98a2
YZ
1316 /*
1317 * swap blocks in fs tree and reloc tree.
1318 */
1319 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1320 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1321 btrfs_mark_buffer_dirty(parent);
1322
1323 btrfs_set_node_blockptr(path->nodes[level],
1324 path->slots[level], old_bytenr);
1325 btrfs_set_node_ptr_generation(path->nodes[level],
1326 path->slots[level], old_ptr_gen);
1327 btrfs_mark_buffer_dirty(path->nodes[level]);
1328
82fa113f
QW
1329 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1330 blocksize, path->nodes[level]->start);
1331 ref.skip_qgroup = true;
1332 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1333 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2 1334 BUG_ON(ret);
82fa113f
QW
1335 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1336 blocksize, 0);
1337 ref.skip_qgroup = true;
1338 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1339 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
1340 BUG_ON(ret);
1341
ffd4bb2a
QW
1342 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1343 blocksize, path->nodes[level]->start);
1344 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1345 ref.skip_qgroup = true;
1346 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
1347 BUG_ON(ret);
1348
ffd4bb2a
QW
1349 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1350 blocksize, 0);
1351 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1352 ref.skip_qgroup = true;
1353 ret = btrfs_free_extent(trans, &ref);
5d4f98a2
YZ
1354 BUG_ON(ret);
1355
1356 btrfs_unlock_up_safe(path, 0);
1357
1358 ret = level;
1359 break;
1360 }
1361 btrfs_tree_unlock(parent);
1362 free_extent_buffer(parent);
1363 return ret;
1364}
1365
1366/*
1367 * helper to find next relocated block in reloc tree
1368 */
1369static noinline_for_stack
1370int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1371 int *level)
1372{
1373 struct extent_buffer *eb;
1374 int i;
1375 u64 last_snapshot;
1376 u32 nritems;
1377
1378 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1379
1380 for (i = 0; i < *level; i++) {
1381 free_extent_buffer(path->nodes[i]);
1382 path->nodes[i] = NULL;
1383 }
1384
1385 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1386 eb = path->nodes[i];
1387 nritems = btrfs_header_nritems(eb);
1388 while (path->slots[i] + 1 < nritems) {
1389 path->slots[i]++;
1390 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1391 last_snapshot)
1392 continue;
1393
1394 *level = i;
1395 return 0;
1396 }
1397 free_extent_buffer(path->nodes[i]);
1398 path->nodes[i] = NULL;
1399 }
1400 return 1;
1401}
1402
1403/*
1404 * walk down reloc tree to find relocated block of lowest level
1405 */
1406static noinline_for_stack
1407int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1408 int *level)
1409{
1410 struct extent_buffer *eb = NULL;
1411 int i;
5d4f98a2
YZ
1412 u64 ptr_gen = 0;
1413 u64 last_snapshot;
5d4f98a2
YZ
1414 u32 nritems;
1415
1416 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1417
1418 for (i = *level; i > 0; i--) {
1419 eb = path->nodes[i];
1420 nritems = btrfs_header_nritems(eb);
1421 while (path->slots[i] < nritems) {
1422 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1423 if (ptr_gen > last_snapshot)
1424 break;
1425 path->slots[i]++;
1426 }
1427 if (path->slots[i] >= nritems) {
1428 if (i == *level)
1429 break;
1430 *level = i + 1;
1431 return 0;
1432 }
1433 if (i == 1) {
1434 *level = i;
1435 return 0;
1436 }
1437
8ef385bb
JB
1438 eb = btrfs_read_node_slot(eb, path->slots[i]);
1439 if (IS_ERR(eb))
64c043de 1440 return PTR_ERR(eb);
5d4f98a2
YZ
1441 BUG_ON(btrfs_header_level(eb) != i - 1);
1442 path->nodes[i - 1] = eb;
1443 path->slots[i - 1] = 0;
1444 }
1445 return 1;
1446}
1447
1448/*
1449 * invalidate extent cache for file extents whose key in range of
1450 * [min_key, max_key)
1451 */
1452static int invalidate_extent_cache(struct btrfs_root *root,
1453 struct btrfs_key *min_key,
1454 struct btrfs_key *max_key)
1455{
0b246afa 1456 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
1457 struct inode *inode = NULL;
1458 u64 objectid;
1459 u64 start, end;
33345d01 1460 u64 ino;
5d4f98a2
YZ
1461
1462 objectid = min_key->objectid;
1463 while (1) {
1464 cond_resched();
1465 iput(inode);
1466
1467 if (objectid > max_key->objectid)
1468 break;
1469
1470 inode = find_next_inode(root, objectid);
1471 if (!inode)
1472 break;
4a0cc7ca 1473 ino = btrfs_ino(BTRFS_I(inode));
5d4f98a2 1474
33345d01 1475 if (ino > max_key->objectid) {
5d4f98a2
YZ
1476 iput(inode);
1477 break;
1478 }
1479
33345d01 1480 objectid = ino + 1;
5d4f98a2
YZ
1481 if (!S_ISREG(inode->i_mode))
1482 continue;
1483
33345d01 1484 if (unlikely(min_key->objectid == ino)) {
5d4f98a2
YZ
1485 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1486 continue;
1487 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1488 start = 0;
1489 else {
1490 start = min_key->offset;
0b246afa 1491 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
5d4f98a2
YZ
1492 }
1493 } else {
1494 start = 0;
1495 }
1496
33345d01 1497 if (unlikely(max_key->objectid == ino)) {
5d4f98a2
YZ
1498 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1499 continue;
1500 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1501 end = (u64)-1;
1502 } else {
1503 if (max_key->offset == 0)
1504 continue;
1505 end = max_key->offset;
0b246afa 1506 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
5d4f98a2
YZ
1507 end--;
1508 }
1509 } else {
1510 end = (u64)-1;
1511 }
1512
1513 /* the lock_extent waits for readpage to complete */
d0082371 1514 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
dcdbc059 1515 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
d0082371 1516 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
5d4f98a2
YZ
1517 }
1518 return 0;
1519}
1520
1521static int find_next_key(struct btrfs_path *path, int level,
1522 struct btrfs_key *key)
1523
1524{
1525 while (level < BTRFS_MAX_LEVEL) {
1526 if (!path->nodes[level])
1527 break;
1528 if (path->slots[level] + 1 <
1529 btrfs_header_nritems(path->nodes[level])) {
1530 btrfs_node_key_to_cpu(path->nodes[level], key,
1531 path->slots[level] + 1);
1532 return 0;
1533 }
1534 level++;
1535 }
1536 return 1;
1537}
1538
d2311e69
QW
1539/*
1540 * Insert current subvolume into reloc_control::dirty_subvol_roots
1541 */
1542static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
1543 struct reloc_control *rc,
1544 struct btrfs_root *root)
1545{
1546 struct btrfs_root *reloc_root = root->reloc_root;
1547 struct btrfs_root_item *reloc_root_item;
1548
1549 /* @root must be a subvolume tree root with a valid reloc tree */
1550 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1551 ASSERT(reloc_root);
1552
1553 reloc_root_item = &reloc_root->root_item;
1554 memset(&reloc_root_item->drop_progress, 0,
1555 sizeof(reloc_root_item->drop_progress));
c8422684 1556 btrfs_set_root_drop_level(reloc_root_item, 0);
d2311e69
QW
1557 btrfs_set_root_refs(reloc_root_item, 0);
1558 btrfs_update_reloc_root(trans, root);
1559
1560 if (list_empty(&root->reloc_dirty_list)) {
00246528 1561 btrfs_grab_root(root);
d2311e69
QW
1562 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
1563 }
1564}
1565
1566static int clean_dirty_subvols(struct reloc_control *rc)
1567{
1568 struct btrfs_root *root;
1569 struct btrfs_root *next;
1570 int ret = 0;
30d40577 1571 int ret2;
d2311e69
QW
1572
1573 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1574 reloc_dirty_list) {
30d40577
QW
1575 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1576 /* Merged subvolume, cleanup its reloc root */
1577 struct btrfs_root *reloc_root = root->reloc_root;
d2311e69 1578
30d40577
QW
1579 list_del_init(&root->reloc_dirty_list);
1580 root->reloc_root = NULL;
6282675e
QW
1581 /*
1582 * Need barrier to ensure clear_bit() only happens after
1583 * root->reloc_root = NULL. Pairs with have_reloc_root.
1584 */
1585 smp_wmb();
1fac4a54 1586 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
f28de8d8 1587 if (reloc_root) {
f44deb74
JB
1588 /*
1589 * btrfs_drop_snapshot drops our ref we hold for
1590 * ->reloc_root. If it fails however we must
1591 * drop the ref ourselves.
1592 */
f28de8d8 1593 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
f44deb74
JB
1594 if (ret2 < 0) {
1595 btrfs_put_root(reloc_root);
1596 if (!ret)
1597 ret = ret2;
1598 }
f28de8d8 1599 }
00246528 1600 btrfs_put_root(root);
30d40577
QW
1601 } else {
1602 /* Orphan reloc tree, just clean it up */
0078a9f9 1603 ret2 = btrfs_drop_snapshot(root, 0, 1);
f44deb74
JB
1604 if (ret2 < 0) {
1605 btrfs_put_root(root);
1606 if (!ret)
1607 ret = ret2;
1608 }
d2311e69 1609 }
d2311e69
QW
1610 }
1611 return ret;
1612}
1613
5d4f98a2
YZ
1614/*
1615 * merge the relocated tree blocks in reloc tree with corresponding
1616 * fs tree.
1617 */
1618static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1619 struct btrfs_root *root)
1620{
0b246afa 1621 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
1622 struct btrfs_key key;
1623 struct btrfs_key next_key;
9e6a0c52 1624 struct btrfs_trans_handle *trans = NULL;
5d4f98a2
YZ
1625 struct btrfs_root *reloc_root;
1626 struct btrfs_root_item *root_item;
1627 struct btrfs_path *path;
3fd0a558 1628 struct extent_buffer *leaf;
fca3a45d 1629 int reserve_level;
5d4f98a2
YZ
1630 int level;
1631 int max_level;
1632 int replaced = 0;
1633 int ret;
1634 int err = 0;
3fd0a558 1635 u32 min_reserved;
5d4f98a2
YZ
1636
1637 path = btrfs_alloc_path();
1638 if (!path)
1639 return -ENOMEM;
e4058b54 1640 path->reada = READA_FORWARD;
5d4f98a2
YZ
1641
1642 reloc_root = root->reloc_root;
1643 root_item = &reloc_root->root_item;
1644
1645 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1646 level = btrfs_root_level(root_item);
67439dad 1647 atomic_inc(&reloc_root->node->refs);
5d4f98a2
YZ
1648 path->nodes[level] = reloc_root->node;
1649 path->slots[level] = 0;
1650 } else {
1651 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1652
c8422684 1653 level = btrfs_root_drop_level(root_item);
5d4f98a2
YZ
1654 BUG_ON(level == 0);
1655 path->lowest_level = level;
1656 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
33c66f43 1657 path->lowest_level = 0;
5d4f98a2
YZ
1658 if (ret < 0) {
1659 btrfs_free_path(path);
1660 return ret;
1661 }
1662
1663 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1664 path->slots[level]);
1665 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1666
1667 btrfs_unlock_up_safe(path, 0);
1668 }
1669
44d354ab
QW
1670 /*
1671 * In merge_reloc_root(), we modify the upper level pointer to swap the
1672 * tree blocks between reloc tree and subvolume tree. Thus for tree
1673 * block COW, we COW at most from level 1 to root level for each tree.
1674 *
1675 * Thus the needed metadata size is at most root_level * nodesize,
1676 * and * 2 since we have two trees to COW.
1677 */
fca3a45d
JB
1678 reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1679 min_reserved = fs_info->nodesize * reserve_level * 2;
3fd0a558 1680 memset(&next_key, 0, sizeof(next_key));
5d4f98a2 1681
3fd0a558 1682 while (1) {
08e007d2 1683 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
44d354ab 1684 BTRFS_RESERVE_FLUSH_LIMIT);
3fd0a558 1685 if (ret) {
9e6a0c52
JB
1686 err = ret;
1687 goto out;
5d4f98a2 1688 }
9e6a0c52
JB
1689 trans = btrfs_start_transaction(root, 0);
1690 if (IS_ERR(trans)) {
1691 err = PTR_ERR(trans);
1692 trans = NULL;
1693 goto out;
1694 }
2abc726a
JB
1695
1696 /*
1697 * At this point we no longer have a reloc_control, so we can't
1698 * depend on btrfs_init_reloc_root to update our last_trans.
1699 *
1700 * But that's ok, we started the trans handle on our
1701 * corresponding fs_root, which means it's been added to the
1702 * dirty list. At commit time we'll still call
1703 * btrfs_update_reloc_root() and update our root item
1704 * appropriately.
1705 */
1706 reloc_root->last_trans = trans->transid;
9e6a0c52 1707 trans->block_rsv = rc->block_rsv;
5d4f98a2 1708
5d4f98a2 1709 replaced = 0;
5d4f98a2
YZ
1710 max_level = level;
1711
1712 ret = walk_down_reloc_tree(reloc_root, path, &level);
1713 if (ret < 0) {
1714 err = ret;
1715 goto out;
1716 }
1717 if (ret > 0)
1718 break;
1719
1720 if (!find_next_key(path, level, &key) &&
1721 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1722 ret = 0;
5d4f98a2 1723 } else {
3d0174f7 1724 ret = replace_path(trans, rc, root, reloc_root, path,
3fd0a558 1725 &next_key, level, max_level);
5d4f98a2
YZ
1726 }
1727 if (ret < 0) {
1728 err = ret;
1729 goto out;
1730 }
1731
1732 if (ret > 0) {
1733 level = ret;
1734 btrfs_node_key_to_cpu(path->nodes[level], &key,
1735 path->slots[level]);
1736 replaced = 1;
5d4f98a2
YZ
1737 }
1738
1739 ret = walk_up_reloc_tree(reloc_root, path, &level);
1740 if (ret > 0)
1741 break;
1742
1743 BUG_ON(level == 0);
1744 /*
1745 * save the merging progress in the drop_progress.
1746 * this is OK since root refs == 1 in this case.
1747 */
1748 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1749 path->slots[level]);
c8422684 1750 btrfs_set_root_drop_level(root_item, level);
5d4f98a2 1751
3a45bb20 1752 btrfs_end_transaction_throttle(trans);
9e6a0c52 1753 trans = NULL;
5d4f98a2 1754
2ff7e61e 1755 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
1756
1757 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1758 invalidate_extent_cache(root, &key, &next_key);
1759 }
1760
1761 /*
1762 * handle the case only one block in the fs tree need to be
1763 * relocated and the block is tree root.
1764 */
1765 leaf = btrfs_lock_root_node(root);
9631e4cc
JB
1766 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1767 BTRFS_NESTING_COW);
5d4f98a2
YZ
1768 btrfs_tree_unlock(leaf);
1769 free_extent_buffer(leaf);
1770 if (ret < 0)
1771 err = ret;
1772out:
1773 btrfs_free_path(path);
1774
d2311e69
QW
1775 if (err == 0)
1776 insert_dirty_subvol(trans, rc, root);
5d4f98a2 1777
9e6a0c52 1778 if (trans)
3a45bb20 1779 btrfs_end_transaction_throttle(trans);
5d4f98a2 1780
2ff7e61e 1781 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 1782
5d4f98a2
YZ
1783 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1784 invalidate_extent_cache(root, &key, &next_key);
1785
1786 return err;
1787}
1788
3fd0a558
YZ
1789static noinline_for_stack
1790int prepare_to_merge(struct reloc_control *rc, int err)
5d4f98a2 1791{
3fd0a558 1792 struct btrfs_root *root = rc->extent_root;
0b246afa 1793 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2 1794 struct btrfs_root *reloc_root;
3fd0a558
YZ
1795 struct btrfs_trans_handle *trans;
1796 LIST_HEAD(reloc_roots);
1797 u64 num_bytes = 0;
1798 int ret;
3fd0a558 1799
0b246afa
JM
1800 mutex_lock(&fs_info->reloc_mutex);
1801 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
3fd0a558 1802 rc->merging_rsv_size += rc->nodes_relocated * 2;
0b246afa 1803 mutex_unlock(&fs_info->reloc_mutex);
7585717f 1804
3fd0a558
YZ
1805again:
1806 if (!err) {
1807 num_bytes = rc->merging_rsv_size;
08e007d2
MX
1808 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
1809 BTRFS_RESERVE_FLUSH_ALL);
3fd0a558
YZ
1810 if (ret)
1811 err = ret;
1812 }
1813
7a7eaa40 1814 trans = btrfs_join_transaction(rc->extent_root);
3612b495
TI
1815 if (IS_ERR(trans)) {
1816 if (!err)
2ff7e61e 1817 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 1818 num_bytes, NULL);
3612b495
TI
1819 return PTR_ERR(trans);
1820 }
3fd0a558
YZ
1821
1822 if (!err) {
1823 if (num_bytes != rc->merging_rsv_size) {
3a45bb20 1824 btrfs_end_transaction(trans);
2ff7e61e 1825 btrfs_block_rsv_release(fs_info, rc->block_rsv,
63f018be 1826 num_bytes, NULL);
3fd0a558
YZ
1827 goto again;
1828 }
1829 }
5d4f98a2 1830
3fd0a558
YZ
1831 rc->merge_reloc_tree = 1;
1832
1833 while (!list_empty(&rc->reloc_roots)) {
1834 reloc_root = list_entry(rc->reloc_roots.next,
1835 struct btrfs_root, root_list);
1836 list_del_init(&reloc_root->root_list);
5d4f98a2 1837
a820feb5
DS
1838 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1839 false);
5d4f98a2
YZ
1840 BUG_ON(IS_ERR(root));
1841 BUG_ON(root->reloc_root != reloc_root);
1842
3fd0a558
YZ
1843 /*
1844 * set reference count to 1, so btrfs_recover_relocation
1845 * knows it should resumes merging
1846 */
1847 if (!err)
1848 btrfs_set_root_refs(&reloc_root->root_item, 1);
5d4f98a2 1849 btrfs_update_reloc_root(trans, root);
5d4f98a2 1850
3fd0a558 1851 list_add(&reloc_root->root_list, &reloc_roots);
00246528 1852 btrfs_put_root(root);
3fd0a558 1853 }
5d4f98a2 1854
3fd0a558 1855 list_splice(&reloc_roots, &rc->reloc_roots);
5d4f98a2 1856
3fd0a558 1857 if (!err)
3a45bb20 1858 btrfs_commit_transaction(trans);
3fd0a558 1859 else
3a45bb20 1860 btrfs_end_transaction(trans);
3fd0a558 1861 return err;
5d4f98a2
YZ
1862}
1863
aca1bba6
LB
1864static noinline_for_stack
1865void free_reloc_roots(struct list_head *list)
1866{
a7571232 1867 struct btrfs_root *reloc_root, *tmp;
aca1bba6 1868
a7571232 1869 list_for_each_entry_safe(reloc_root, tmp, list, root_list)
bb166d72 1870 __del_reloc_root(reloc_root);
aca1bba6
LB
1871}
1872
3fd0a558 1873static noinline_for_stack
94404e82 1874void merge_reloc_roots(struct reloc_control *rc)
5d4f98a2 1875{
0b246afa 1876 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2 1877 struct btrfs_root *root;
3fd0a558
YZ
1878 struct btrfs_root *reloc_root;
1879 LIST_HEAD(reloc_roots);
1880 int found = 0;
aca1bba6 1881 int ret = 0;
3fd0a558
YZ
1882again:
1883 root = rc->extent_root;
7585717f
CM
1884
1885 /*
1886 * this serializes us with btrfs_record_root_in_transaction,
1887 * we have to make sure nobody is in the middle of
1888 * adding their roots to the list while we are
1889 * doing this splice
1890 */
0b246afa 1891 mutex_lock(&fs_info->reloc_mutex);
3fd0a558 1892 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 1893 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2 1894
3fd0a558
YZ
1895 while (!list_empty(&reloc_roots)) {
1896 found = 1;
1897 reloc_root = list_entry(reloc_roots.next,
1898 struct btrfs_root, root_list);
5d4f98a2 1899
a820feb5
DS
1900 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1901 false);
3fd0a558 1902 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3fd0a558
YZ
1903 BUG_ON(IS_ERR(root));
1904 BUG_ON(root->reloc_root != reloc_root);
3fd0a558 1905 ret = merge_reloc_root(rc, root);
00246528 1906 btrfs_put_root(root);
b37b39cd 1907 if (ret) {
25e293c2
WS
1908 if (list_empty(&reloc_root->root_list))
1909 list_add_tail(&reloc_root->root_list,
1910 &reloc_roots);
aca1bba6 1911 goto out;
b37b39cd 1912 }
3fd0a558 1913 } else {
51415b6c
QW
1914 if (!IS_ERR(root)) {
1915 if (root->reloc_root == reloc_root) {
1916 root->reloc_root = NULL;
1917 btrfs_put_root(reloc_root);
1918 }
1dae7e0e
QW
1919 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
1920 &root->state);
51415b6c
QW
1921 btrfs_put_root(root);
1922 }
1923
3fd0a558 1924 list_del_init(&reloc_root->root_list);
30d40577
QW
1925 /* Don't forget to queue this reloc root for cleanup */
1926 list_add_tail(&reloc_root->reloc_dirty_list,
1927 &rc->dirty_subvol_roots);
3fd0a558 1928 }
5d4f98a2
YZ
1929 }
1930
3fd0a558
YZ
1931 if (found) {
1932 found = 0;
1933 goto again;
1934 }
aca1bba6
LB
1935out:
1936 if (ret) {
0b246afa 1937 btrfs_handle_fs_error(fs_info, ret, NULL);
a7571232 1938 free_reloc_roots(&reloc_roots);
467bb1d2
WS
1939
1940 /* new reloc root may be added */
0b246afa 1941 mutex_lock(&fs_info->reloc_mutex);
467bb1d2 1942 list_splice_init(&rc->reloc_roots, &reloc_roots);
0b246afa 1943 mutex_unlock(&fs_info->reloc_mutex);
a7571232 1944 free_reloc_roots(&reloc_roots);
aca1bba6
LB
1945 }
1946
7b7b7431
JB
1947 /*
1948 * We used to have
1949 *
1950 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1951 *
1952 * here, but it's wrong. If we fail to start the transaction in
1953 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
1954 * have actually been removed from the reloc_root_tree rb tree. This is
1955 * fine because we're bailing here, and we hold a reference on the root
1956 * for the list that holds it, so these roots will be cleaned up when we
1957 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
1958 * will be cleaned up on unmount.
1959 *
1960 * The remaining nodes will be cleaned up by free_reloc_control.
1961 */
5d4f98a2
YZ
1962}
1963
1964static void free_block_list(struct rb_root *blocks)
1965{
1966 struct tree_block *block;
1967 struct rb_node *rb_node;
1968 while ((rb_node = rb_first(blocks))) {
1969 block = rb_entry(rb_node, struct tree_block, rb_node);
1970 rb_erase(rb_node, blocks);
1971 kfree(block);
1972 }
1973}
1974
1975static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *reloc_root)
1977{
0b246afa 1978 struct btrfs_fs_info *fs_info = reloc_root->fs_info;
5d4f98a2 1979 struct btrfs_root *root;
442b1ac5 1980 int ret;
5d4f98a2
YZ
1981
1982 if (reloc_root->last_trans == trans->transid)
1983 return 0;
1984
a820feb5 1985 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
5d4f98a2
YZ
1986 BUG_ON(IS_ERR(root));
1987 BUG_ON(root->reloc_root != reloc_root);
442b1ac5 1988 ret = btrfs_record_root_in_trans(trans, root);
00246528 1989 btrfs_put_root(root);
5d4f98a2 1990
442b1ac5 1991 return ret;
5d4f98a2
YZ
1992}
1993
3fd0a558
YZ
1994static noinline_for_stack
1995struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1996 struct reloc_control *rc,
a26195a5
QW
1997 struct btrfs_backref_node *node,
1998 struct btrfs_backref_edge *edges[])
5d4f98a2 1999{
a26195a5 2000 struct btrfs_backref_node *next;
5d4f98a2 2001 struct btrfs_root *root;
3fd0a558
YZ
2002 int index = 0;
2003
5d4f98a2
YZ
2004 next = node;
2005 while (1) {
2006 cond_resched();
2007 next = walk_up_backref(next, edges, &index);
2008 root = next->root;
3fd0a558 2009 BUG_ON(!root);
92a7cc42 2010 BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
5d4f98a2
YZ
2011
2012 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2013 record_reloc_root_in_trans(trans, root);
2014 break;
2015 }
2016
3fd0a558
YZ
2017 btrfs_record_root_in_trans(trans, root);
2018 root = root->reloc_root;
2019
2020 if (next->new_bytenr != root->node->start) {
2021 BUG_ON(next->new_bytenr);
2022 BUG_ON(!list_empty(&next->list));
2023 next->new_bytenr = root->node->start;
00246528
JB
2024 btrfs_put_root(next->root);
2025 next->root = btrfs_grab_root(root);
0b530bc5 2026 ASSERT(next->root);
3fd0a558
YZ
2027 list_add_tail(&next->list,
2028 &rc->backref_cache.changed);
9569cc20 2029 mark_block_processed(rc, next);
5d4f98a2
YZ
2030 break;
2031 }
2032
3fd0a558 2033 WARN_ON(1);
5d4f98a2
YZ
2034 root = NULL;
2035 next = walk_down_backref(edges, &index);
2036 if (!next || next->level <= node->level)
2037 break;
2038 }
3fd0a558
YZ
2039 if (!root)
2040 return NULL;
5d4f98a2 2041
3fd0a558
YZ
2042 next = node;
2043 /* setup backref node path for btrfs_reloc_cow_block */
2044 while (1) {
2045 rc->backref_cache.path[next->level] = next;
2046 if (--index < 0)
2047 break;
2048 next = edges[index]->node[UPPER];
5d4f98a2 2049 }
5d4f98a2
YZ
2050 return root;
2051}
2052
3fd0a558 2053/*
92a7cc42
QW
2054 * Select a tree root for relocation.
2055 *
2056 * Return NULL if the block is not shareable. We should use do_relocation() in
2057 * this case.
2058 *
2059 * Return a tree root pointer if the block is shareable.
2060 * Return -ENOENT if the block is root of reloc tree.
3fd0a558 2061 */
5d4f98a2 2062static noinline_for_stack
a26195a5 2063struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
5d4f98a2 2064{
a26195a5 2065 struct btrfs_backref_node *next;
3fd0a558
YZ
2066 struct btrfs_root *root;
2067 struct btrfs_root *fs_root = NULL;
a26195a5 2068 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2069 int index = 0;
2070
2071 next = node;
2072 while (1) {
2073 cond_resched();
2074 next = walk_up_backref(next, edges, &index);
2075 root = next->root;
2076 BUG_ON(!root);
2077
92a7cc42
QW
2078 /* No other choice for non-shareable tree */
2079 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
3fd0a558
YZ
2080 return root;
2081
2082 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2083 fs_root = root;
2084
2085 if (next != node)
2086 return NULL;
2087
2088 next = walk_down_backref(edges, &index);
2089 if (!next || next->level <= node->level)
2090 break;
2091 }
2092
2093 if (!fs_root)
2094 return ERR_PTR(-ENOENT);
2095 return fs_root;
5d4f98a2
YZ
2096}
2097
2098static noinline_for_stack
3fd0a558 2099u64 calcu_metadata_size(struct reloc_control *rc,
a26195a5 2100 struct btrfs_backref_node *node, int reserve)
5d4f98a2 2101{
0b246afa 2102 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5
QW
2103 struct btrfs_backref_node *next = node;
2104 struct btrfs_backref_edge *edge;
2105 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3fd0a558
YZ
2106 u64 num_bytes = 0;
2107 int index = 0;
2108
2109 BUG_ON(reserve && node->processed);
2110
2111 while (next) {
2112 cond_resched();
2113 while (1) {
2114 if (next->processed && (reserve || next != node))
2115 break;
2116
0b246afa 2117 num_bytes += fs_info->nodesize;
3fd0a558
YZ
2118
2119 if (list_empty(&next->upper))
2120 break;
2121
2122 edge = list_entry(next->upper.next,
a26195a5 2123 struct btrfs_backref_edge, list[LOWER]);
3fd0a558
YZ
2124 edges[index++] = edge;
2125 next = edge->node[UPPER];
2126 }
2127 next = walk_down_backref(edges, &index);
2128 }
2129 return num_bytes;
5d4f98a2
YZ
2130}
2131
3fd0a558
YZ
2132static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2133 struct reloc_control *rc,
a26195a5 2134 struct btrfs_backref_node *node)
5d4f98a2 2135{
3fd0a558 2136 struct btrfs_root *root = rc->extent_root;
da17066c 2137 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558
YZ
2138 u64 num_bytes;
2139 int ret;
0647bf56 2140 u64 tmp;
3fd0a558
YZ
2141
2142 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
5d4f98a2 2143
3fd0a558 2144 trans->block_rsv = rc->block_rsv;
0647bf56 2145 rc->reserved_bytes += num_bytes;
8ca17f0f
JB
2146
2147 /*
2148 * We are under a transaction here so we can only do limited flushing.
2149 * If we get an enospc just kick back -EAGAIN so we know to drop the
2150 * transaction and try to refill when we can flush all the things.
2151 */
0647bf56 2152 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
8ca17f0f 2153 BTRFS_RESERVE_FLUSH_LIMIT);
3fd0a558 2154 if (ret) {
da17066c 2155 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
8ca17f0f
JB
2156 while (tmp <= rc->reserved_bytes)
2157 tmp <<= 1;
2158 /*
2159 * only one thread can access block_rsv at this point,
2160 * so we don't need hold lock to protect block_rsv.
2161 * we expand more reservation size here to allow enough
52042d8e 2162 * space for relocation and we will return earlier in
8ca17f0f
JB
2163 * enospc case.
2164 */
da17066c
JM
2165 rc->block_rsv->size = tmp + fs_info->nodesize *
2166 RELOCATION_RESERVED_NODES;
8ca17f0f 2167 return -EAGAIN;
5d4f98a2 2168 }
3fd0a558 2169
3fd0a558
YZ
2170 return 0;
2171}
2172
5d4f98a2
YZ
2173/*
2174 * relocate a block tree, and then update pointers in upper level
2175 * blocks that reference the block to point to the new location.
2176 *
2177 * if called by link_to_upper, the block has already been relocated.
2178 * in that case this function just updates pointers.
2179 */
2180static int do_relocation(struct btrfs_trans_handle *trans,
3fd0a558 2181 struct reloc_control *rc,
a26195a5 2182 struct btrfs_backref_node *node,
5d4f98a2
YZ
2183 struct btrfs_key *key,
2184 struct btrfs_path *path, int lowest)
2185{
a26195a5
QW
2186 struct btrfs_backref_node *upper;
2187 struct btrfs_backref_edge *edge;
2188 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
2189 struct btrfs_root *root;
2190 struct extent_buffer *eb;
2191 u32 blocksize;
2192 u64 bytenr;
5d4f98a2
YZ
2193 int slot;
2194 int ret;
2195 int err = 0;
2196
2197 BUG_ON(lowest && node->eb);
2198
2199 path->lowest_level = node->level + 1;
3fd0a558 2200 rc->backref_cache.path[node->level] = node;
5d4f98a2 2201 list_for_each_entry(edge, &node->upper, list[LOWER]) {
82fa113f 2202 struct btrfs_ref ref = { 0 };
581c1760 2203
5d4f98a2 2204 cond_resched();
5d4f98a2
YZ
2205
2206 upper = edge->node[UPPER];
dc4103f9 2207 root = select_reloc_root(trans, rc, upper, edges);
3fd0a558
YZ
2208 BUG_ON(!root);
2209
2210 if (upper->eb && !upper->locked) {
2211 if (!lowest) {
e3b83361 2212 ret = btrfs_bin_search(upper->eb, key, &slot);
cbca7d59
FM
2213 if (ret < 0) {
2214 err = ret;
2215 goto next;
2216 }
3fd0a558
YZ
2217 BUG_ON(ret);
2218 bytenr = btrfs_node_blockptr(upper->eb, slot);
2219 if (node->eb->start == bytenr)
2220 goto next;
2221 }
b0fe7078 2222 btrfs_backref_drop_node_buffer(upper);
3fd0a558 2223 }
5d4f98a2
YZ
2224
2225 if (!upper->eb) {
2226 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3561b9db
LB
2227 if (ret) {
2228 if (ret < 0)
2229 err = ret;
2230 else
2231 err = -ENOENT;
2232
2233 btrfs_release_path(path);
5d4f98a2
YZ
2234 break;
2235 }
5d4f98a2 2236
3fd0a558
YZ
2237 if (!upper->eb) {
2238 upper->eb = path->nodes[upper->level];
2239 path->nodes[upper->level] = NULL;
2240 } else {
2241 BUG_ON(upper->eb != path->nodes[upper->level]);
2242 }
5d4f98a2 2243
3fd0a558
YZ
2244 upper->locked = 1;
2245 path->locks[upper->level] = 0;
5d4f98a2 2246
3fd0a558 2247 slot = path->slots[upper->level];
b3b4aa74 2248 btrfs_release_path(path);
5d4f98a2 2249 } else {
e3b83361 2250 ret = btrfs_bin_search(upper->eb, key, &slot);
cbca7d59
FM
2251 if (ret < 0) {
2252 err = ret;
2253 goto next;
2254 }
5d4f98a2
YZ
2255 BUG_ON(ret);
2256 }
2257
2258 bytenr = btrfs_node_blockptr(upper->eb, slot);
3fd0a558 2259 if (lowest) {
4547f4d8
LB
2260 if (bytenr != node->bytenr) {
2261 btrfs_err(root->fs_info,
2262 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2263 bytenr, node->bytenr, slot,
2264 upper->eb->start);
2265 err = -EIO;
2266 goto next;
2267 }
5d4f98a2 2268 } else {
3fd0a558
YZ
2269 if (node->eb->start == bytenr)
2270 goto next;
5d4f98a2
YZ
2271 }
2272
da17066c 2273 blocksize = root->fs_info->nodesize;
c9752536 2274 eb = btrfs_read_node_slot(upper->eb, slot);
64c043de
LB
2275 if (IS_ERR(eb)) {
2276 err = PTR_ERR(eb);
2277 goto next;
97d9a8a4 2278 }
5d4f98a2 2279 btrfs_tree_lock(eb);
5d4f98a2
YZ
2280
2281 if (!node->eb) {
2282 ret = btrfs_cow_block(trans, root, eb, upper->eb,
9631e4cc 2283 slot, &eb, BTRFS_NESTING_COW);
3fd0a558
YZ
2284 btrfs_tree_unlock(eb);
2285 free_extent_buffer(eb);
5d4f98a2
YZ
2286 if (ret < 0) {
2287 err = ret;
3fd0a558 2288 goto next;
5d4f98a2 2289 }
3fd0a558 2290 BUG_ON(node->eb != eb);
5d4f98a2
YZ
2291 } else {
2292 btrfs_set_node_blockptr(upper->eb, slot,
2293 node->eb->start);
2294 btrfs_set_node_ptr_generation(upper->eb, slot,
2295 trans->transid);
2296 btrfs_mark_buffer_dirty(upper->eb);
2297
82fa113f
QW
2298 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2299 node->eb->start, blocksize,
2300 upper->eb->start);
2301 ref.real_root = root->root_key.objectid;
2302 btrfs_init_tree_ref(&ref, node->level,
2303 btrfs_header_owner(upper->eb));
2304 ret = btrfs_inc_extent_ref(trans, &ref);
5d4f98a2
YZ
2305 BUG_ON(ret);
2306
2307 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2308 BUG_ON(ret);
5d4f98a2 2309 }
3fd0a558
YZ
2310next:
2311 if (!upper->pending)
b0fe7078 2312 btrfs_backref_drop_node_buffer(upper);
3fd0a558 2313 else
b0fe7078 2314 btrfs_backref_unlock_node_buffer(upper);
3fd0a558
YZ
2315 if (err)
2316 break;
5d4f98a2 2317 }
3fd0a558
YZ
2318
2319 if (!err && node->pending) {
b0fe7078 2320 btrfs_backref_drop_node_buffer(node);
3fd0a558
YZ
2321 list_move_tail(&node->list, &rc->backref_cache.changed);
2322 node->pending = 0;
2323 }
2324
5d4f98a2 2325 path->lowest_level = 0;
3fd0a558 2326 BUG_ON(err == -ENOSPC);
5d4f98a2
YZ
2327 return err;
2328}
2329
2330static int link_to_upper(struct btrfs_trans_handle *trans,
3fd0a558 2331 struct reloc_control *rc,
a26195a5 2332 struct btrfs_backref_node *node,
5d4f98a2
YZ
2333 struct btrfs_path *path)
2334{
2335 struct btrfs_key key;
5d4f98a2
YZ
2336
2337 btrfs_node_key_to_cpu(node->eb, &key, 0);
3fd0a558 2338 return do_relocation(trans, rc, node, &key, path, 0);
5d4f98a2
YZ
2339}
2340
2341static int finish_pending_nodes(struct btrfs_trans_handle *trans,
3fd0a558
YZ
2342 struct reloc_control *rc,
2343 struct btrfs_path *path, int err)
5d4f98a2 2344{
3fd0a558 2345 LIST_HEAD(list);
a26195a5
QW
2346 struct btrfs_backref_cache *cache = &rc->backref_cache;
2347 struct btrfs_backref_node *node;
5d4f98a2
YZ
2348 int level;
2349 int ret;
5d4f98a2
YZ
2350
2351 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2352 while (!list_empty(&cache->pending[level])) {
2353 node = list_entry(cache->pending[level].next,
a26195a5 2354 struct btrfs_backref_node, list);
3fd0a558
YZ
2355 list_move_tail(&node->list, &list);
2356 BUG_ON(!node->pending);
5d4f98a2 2357
3fd0a558
YZ
2358 if (!err) {
2359 ret = link_to_upper(trans, rc, node, path);
2360 if (ret < 0)
2361 err = ret;
2362 }
5d4f98a2 2363 }
3fd0a558 2364 list_splice_init(&list, &cache->pending[level]);
5d4f98a2 2365 }
5d4f98a2
YZ
2366 return err;
2367}
2368
5d4f98a2
YZ
2369/*
2370 * mark a block and all blocks directly/indirectly reference the block
2371 * as processed.
2372 */
2373static void update_processed_blocks(struct reloc_control *rc,
a26195a5 2374 struct btrfs_backref_node *node)
5d4f98a2 2375{
a26195a5
QW
2376 struct btrfs_backref_node *next = node;
2377 struct btrfs_backref_edge *edge;
2378 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
5d4f98a2
YZ
2379 int index = 0;
2380
2381 while (next) {
2382 cond_resched();
2383 while (1) {
2384 if (next->processed)
2385 break;
2386
9569cc20 2387 mark_block_processed(rc, next);
5d4f98a2
YZ
2388
2389 if (list_empty(&next->upper))
2390 break;
2391
2392 edge = list_entry(next->upper.next,
a26195a5 2393 struct btrfs_backref_edge, list[LOWER]);
5d4f98a2
YZ
2394 edges[index++] = edge;
2395 next = edge->node[UPPER];
2396 }
2397 next = walk_down_backref(edges, &index);
2398 }
2399}
2400
7476dfda 2401static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
3fd0a558 2402{
da17066c 2403 u32 blocksize = rc->extent_root->fs_info->nodesize;
7476dfda 2404
3fd0a558
YZ
2405 if (test_range_bit(&rc->processed_blocks, bytenr,
2406 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2407 return 1;
2408 return 0;
5d4f98a2
YZ
2409}
2410
2ff7e61e 2411static int get_tree_block_key(struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
2412 struct tree_block *block)
2413{
2414 struct extent_buffer *eb;
2415
1b7ec85e 2416 eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset,
581c1760 2417 block->level, NULL);
64c043de
LB
2418 if (IS_ERR(eb)) {
2419 return PTR_ERR(eb);
2420 } else if (!extent_buffer_uptodate(eb)) {
416bc658
JB
2421 free_extent_buffer(eb);
2422 return -EIO;
2423 }
5d4f98a2
YZ
2424 if (block->level == 0)
2425 btrfs_item_key_to_cpu(eb, &block->key, 0);
2426 else
2427 btrfs_node_key_to_cpu(eb, &block->key, 0);
2428 free_extent_buffer(eb);
2429 block->key_ready = 1;
2430 return 0;
2431}
2432
5d4f98a2
YZ
2433/*
2434 * helper function to relocate a tree block
2435 */
2436static int relocate_tree_block(struct btrfs_trans_handle *trans,
2437 struct reloc_control *rc,
a26195a5 2438 struct btrfs_backref_node *node,
5d4f98a2
YZ
2439 struct btrfs_key *key,
2440 struct btrfs_path *path)
2441{
2442 struct btrfs_root *root;
3fd0a558
YZ
2443 int ret = 0;
2444
2445 if (!node)
2446 return 0;
5d4f98a2 2447
5f6b2e5c
JB
2448 /*
2449 * If we fail here we want to drop our backref_node because we are going
2450 * to start over and regenerate the tree for it.
2451 */
2452 ret = reserve_metadata_space(trans, rc, node);
2453 if (ret)
2454 goto out;
2455
3fd0a558 2456 BUG_ON(node->processed);
147d256e 2457 root = select_one_root(node);
3fd0a558 2458 if (root == ERR_PTR(-ENOENT)) {
5d4f98a2 2459 update_processed_blocks(rc, node);
3fd0a558 2460 goto out;
5d4f98a2
YZ
2461 }
2462
3fd0a558 2463 if (root) {
92a7cc42 2464 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
3fd0a558
YZ
2465 BUG_ON(node->new_bytenr);
2466 BUG_ON(!list_empty(&node->list));
2467 btrfs_record_root_in_trans(trans, root);
2468 root = root->reloc_root;
2469 node->new_bytenr = root->node->start;
00246528
JB
2470 btrfs_put_root(node->root);
2471 node->root = btrfs_grab_root(root);
0b530bc5 2472 ASSERT(node->root);
3fd0a558
YZ
2473 list_add_tail(&node->list, &rc->backref_cache.changed);
2474 } else {
2475 path->lowest_level = node->level;
2476 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
b3b4aa74 2477 btrfs_release_path(path);
3fd0a558
YZ
2478 if (ret > 0)
2479 ret = 0;
2480 }
2481 if (!ret)
2482 update_processed_blocks(rc, node);
2483 } else {
2484 ret = do_relocation(trans, rc, node, key, path, 1);
2485 }
5d4f98a2 2486out:
0647bf56 2487 if (ret || node->level == 0 || node->cowonly)
023acb07 2488 btrfs_backref_cleanup_node(&rc->backref_cache, node);
5d4f98a2
YZ
2489 return ret;
2490}
2491
2492/*
2493 * relocate a list of blocks
2494 */
2495static noinline_for_stack
2496int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2497 struct reloc_control *rc, struct rb_root *blocks)
2498{
2ff7e61e 2499 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
a26195a5 2500 struct btrfs_backref_node *node;
5d4f98a2
YZ
2501 struct btrfs_path *path;
2502 struct tree_block *block;
98ff7b94 2503 struct tree_block *next;
5d4f98a2
YZ
2504 int ret;
2505 int err = 0;
2506
2507 path = btrfs_alloc_path();
e1a12670
LB
2508 if (!path) {
2509 err = -ENOMEM;
34c2b290 2510 goto out_free_blocks;
e1a12670 2511 }
5d4f98a2 2512
98ff7b94
QW
2513 /* Kick in readahead for tree blocks with missing keys */
2514 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
5d4f98a2 2515 if (!block->key_ready)
3fbaf258
JB
2516 btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0,
2517 block->level);
5d4f98a2
YZ
2518 }
2519
98ff7b94
QW
2520 /* Get first keys */
2521 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
34c2b290 2522 if (!block->key_ready) {
2ff7e61e 2523 err = get_tree_block_key(fs_info, block);
34c2b290
DS
2524 if (err)
2525 goto out_free_path;
2526 }
5d4f98a2
YZ
2527 }
2528
98ff7b94
QW
2529 /* Do tree relocation */
2530 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3fd0a558 2531 node = build_backref_tree(rc, &block->key,
5d4f98a2
YZ
2532 block->level, block->bytenr);
2533 if (IS_ERR(node)) {
2534 err = PTR_ERR(node);
2535 goto out;
2536 }
2537
2538 ret = relocate_tree_block(trans, rc, node, &block->key,
2539 path);
2540 if (ret < 0) {
50dbbb71
JB
2541 err = ret;
2542 break;
5d4f98a2 2543 }
5d4f98a2 2544 }
5d4f98a2 2545out:
3fd0a558 2546 err = finish_pending_nodes(trans, rc, path, err);
5d4f98a2 2547
34c2b290 2548out_free_path:
5d4f98a2 2549 btrfs_free_path(path);
34c2b290 2550out_free_blocks:
e1a12670 2551 free_block_list(blocks);
5d4f98a2
YZ
2552 return err;
2553}
2554
056d9bec
NB
2555static noinline_for_stack int prealloc_file_extent_cluster(
2556 struct btrfs_inode *inode,
2557 struct file_extent_cluster *cluster)
efa56464
YZ
2558{
2559 u64 alloc_hint = 0;
2560 u64 start;
2561 u64 end;
056d9bec 2562 u64 offset = inode->index_cnt;
efa56464 2563 u64 num_bytes;
4e9d0d01 2564 int nr;
efa56464 2565 int ret = 0;
dcb40c19
WX
2566 u64 prealloc_start = cluster->start - offset;
2567 u64 prealloc_end = cluster->end - offset;
214e61d0 2568 u64 cur_offset = prealloc_start;
efa56464
YZ
2569
2570 BUG_ON(cluster->start != cluster->boundary[0]);
056d9bec 2571 ret = btrfs_alloc_data_chunk_ondemand(inode,
a89ef455 2572 prealloc_end + 1 - prealloc_start);
efa56464 2573 if (ret)
214e61d0 2574 return ret;
efa56464 2575
056d9bec 2576 inode_lock(&inode->vfs_inode);
4e9d0d01 2577 for (nr = 0; nr < cluster->nr; nr++) {
efa56464
YZ
2578 start = cluster->boundary[nr] - offset;
2579 if (nr + 1 < cluster->nr)
2580 end = cluster->boundary[nr + 1] - 1 - offset;
2581 else
2582 end = cluster->end - offset;
2583
056d9bec 2584 lock_extent(&inode->io_tree, start, end);
efa56464 2585 num_bytes = end + 1 - start;
056d9bec 2586 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
efa56464
YZ
2587 num_bytes, num_bytes,
2588 end + 1, &alloc_hint);
18513091 2589 cur_offset = end + 1;
056d9bec 2590 unlock_extent(&inode->io_tree, start, end);
efa56464
YZ
2591 if (ret)
2592 break;
efa56464 2593 }
056d9bec 2594 inode_unlock(&inode->vfs_inode);
214e61d0 2595
18513091 2596 if (cur_offset < prealloc_end)
056d9bec 2597 btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
a89ef455 2598 prealloc_end + 1 - cur_offset);
efa56464
YZ
2599 return ret;
2600}
2601
5d4f98a2 2602static noinline_for_stack
0257bb82
YZ
2603int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2604 u64 block_start)
2605{
0257bb82
YZ
2606 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2607 struct extent_map *em;
2608 int ret = 0;
2609
172ddd60 2610 em = alloc_extent_map();
0257bb82
YZ
2611 if (!em)
2612 return -ENOMEM;
2613
2614 em->start = start;
2615 em->len = end + 1 - start;
2616 em->block_len = em->len;
2617 em->block_start = block_start;
0257bb82
YZ
2618 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2619
d0082371 2620 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
2621 while (1) {
2622 write_lock(&em_tree->lock);
09a2a8f9 2623 ret = add_extent_mapping(em_tree, em, 0);
0257bb82
YZ
2624 write_unlock(&em_tree->lock);
2625 if (ret != -EEXIST) {
2626 free_extent_map(em);
2627 break;
2628 }
dcdbc059 2629 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
0257bb82 2630 }
d0082371 2631 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
0257bb82
YZ
2632 return ret;
2633}
2634
726a3421
QW
2635/*
2636 * Allow error injection to test balance cancellation
2637 */
2638int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
2639{
5cb502f4
QW
2640 return atomic_read(&fs_info->balance_cancel_req) ||
2641 fatal_signal_pending(current);
726a3421
QW
2642}
2643ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2644
0257bb82
YZ
2645static int relocate_file_extent_cluster(struct inode *inode,
2646 struct file_extent_cluster *cluster)
5d4f98a2 2647{
2ff7e61e 2648 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5d4f98a2
YZ
2649 u64 page_start;
2650 u64 page_end;
0257bb82
YZ
2651 u64 offset = BTRFS_I(inode)->index_cnt;
2652 unsigned long index;
5d4f98a2 2653 unsigned long last_index;
5d4f98a2
YZ
2654 struct page *page;
2655 struct file_ra_state *ra;
3b16a4e3 2656 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
0257bb82 2657 int nr = 0;
5d4f98a2
YZ
2658 int ret = 0;
2659
0257bb82
YZ
2660 if (!cluster->nr)
2661 return 0;
2662
5d4f98a2
YZ
2663 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2664 if (!ra)
2665 return -ENOMEM;
2666
056d9bec 2667 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
efa56464
YZ
2668 if (ret)
2669 goto out;
0257bb82 2670
efa56464 2671 file_ra_state_init(ra, inode->i_mapping);
5d4f98a2 2672
0257bb82
YZ
2673 ret = setup_extent_mapping(inode, cluster->start - offset,
2674 cluster->end - offset, cluster->start);
5d4f98a2 2675 if (ret)
efa56464 2676 goto out;
5d4f98a2 2677
09cbfeaf
KS
2678 index = (cluster->start - offset) >> PAGE_SHIFT;
2679 last_index = (cluster->end - offset) >> PAGE_SHIFT;
0257bb82 2680 while (index <= last_index) {
9f3db423
NB
2681 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2682 PAGE_SIZE);
efa56464
YZ
2683 if (ret)
2684 goto out;
2685
0257bb82 2686 page = find_lock_page(inode->i_mapping, index);
5d4f98a2 2687 if (!page) {
0257bb82
YZ
2688 page_cache_sync_readahead(inode->i_mapping,
2689 ra, NULL, index,
2690 last_index + 1 - index);
a94733d0 2691 page = find_or_create_page(inode->i_mapping, index,
3b16a4e3 2692 mask);
0257bb82 2693 if (!page) {
691fa059 2694 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 2695 PAGE_SIZE, true);
44db1216 2696 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 2697 PAGE_SIZE);
0257bb82 2698 ret = -ENOMEM;
efa56464 2699 goto out;
0257bb82 2700 }
5d4f98a2 2701 }
0257bb82
YZ
2702
2703 if (PageReadahead(page)) {
2704 page_cache_async_readahead(inode->i_mapping,
2705 ra, NULL, page, index,
2706 last_index + 1 - index);
2707 }
2708
5d4f98a2
YZ
2709 if (!PageUptodate(page)) {
2710 btrfs_readpage(NULL, page);
2711 lock_page(page);
2712 if (!PageUptodate(page)) {
2713 unlock_page(page);
09cbfeaf 2714 put_page(page);
691fa059 2715 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 2716 PAGE_SIZE, true);
8b62f87b 2717 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 2718 PAGE_SIZE);
5d4f98a2 2719 ret = -EIO;
efa56464 2720 goto out;
5d4f98a2
YZ
2721 }
2722 }
5d4f98a2 2723
4eee4fa4 2724 page_start = page_offset(page);
09cbfeaf 2725 page_end = page_start + PAGE_SIZE - 1;
0257bb82 2726
d0082371 2727 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
0257bb82 2728
5d4f98a2
YZ
2729 set_page_extent_mapped(page);
2730
0257bb82
YZ
2731 if (nr < cluster->nr &&
2732 page_start + offset == cluster->boundary[nr]) {
2733 set_extent_bits(&BTRFS_I(inode)->io_tree,
2734 page_start, page_end,
ceeb0ae7 2735 EXTENT_BOUNDARY);
0257bb82
YZ
2736 nr++;
2737 }
5d4f98a2 2738
c2566f22
NB
2739 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
2740 page_end, 0, NULL);
765f3ceb
NB
2741 if (ret) {
2742 unlock_page(page);
2743 put_page(page);
2744 btrfs_delalloc_release_metadata(BTRFS_I(inode),
43b18595 2745 PAGE_SIZE, true);
765f3ceb 2746 btrfs_delalloc_release_extents(BTRFS_I(inode),
8702ba93 2747 PAGE_SIZE);
765f3ceb
NB
2748
2749 clear_extent_bits(&BTRFS_I(inode)->io_tree,
2750 page_start, page_end,
2751 EXTENT_LOCKED | EXTENT_BOUNDARY);
2752 goto out;
2753
2754 }
5d4f98a2 2755 set_page_dirty(page);
5d4f98a2 2756
0257bb82 2757 unlock_extent(&BTRFS_I(inode)->io_tree,
d0082371 2758 page_start, page_end);
5d4f98a2 2759 unlock_page(page);
09cbfeaf 2760 put_page(page);
0257bb82
YZ
2761
2762 index++;
8702ba93 2763 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
efa56464 2764 balance_dirty_pages_ratelimited(inode->i_mapping);
2ff7e61e 2765 btrfs_throttle(fs_info);
7f913c7c
QW
2766 if (btrfs_should_cancel_balance(fs_info)) {
2767 ret = -ECANCELED;
2768 goto out;
2769 }
5d4f98a2 2770 }
0257bb82 2771 WARN_ON(nr != cluster->nr);
efa56464 2772out:
5d4f98a2 2773 kfree(ra);
5d4f98a2
YZ
2774 return ret;
2775}
2776
2777static noinline_for_stack
0257bb82
YZ
2778int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2779 struct file_extent_cluster *cluster)
5d4f98a2 2780{
0257bb82 2781 int ret;
5d4f98a2 2782
0257bb82
YZ
2783 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2784 ret = relocate_file_extent_cluster(inode, cluster);
2785 if (ret)
2786 return ret;
2787 cluster->nr = 0;
5d4f98a2 2788 }
5d4f98a2 2789
0257bb82
YZ
2790 if (!cluster->nr)
2791 cluster->start = extent_key->objectid;
2792 else
2793 BUG_ON(cluster->nr >= MAX_EXTENTS);
2794 cluster->end = extent_key->objectid + extent_key->offset - 1;
2795 cluster->boundary[cluster->nr] = extent_key->objectid;
2796 cluster->nr++;
2797
2798 if (cluster->nr >= MAX_EXTENTS) {
2799 ret = relocate_file_extent_cluster(inode, cluster);
2800 if (ret)
2801 return ret;
2802 cluster->nr = 0;
2803 }
2804 return 0;
5d4f98a2
YZ
2805}
2806
5d4f98a2
YZ
2807/*
2808 * helper to add a tree block to the list.
2809 * the major work is getting the generation and level of the block
2810 */
2811static int add_tree_block(struct reloc_control *rc,
2812 struct btrfs_key *extent_key,
2813 struct btrfs_path *path,
2814 struct rb_root *blocks)
2815{
2816 struct extent_buffer *eb;
2817 struct btrfs_extent_item *ei;
2818 struct btrfs_tree_block_info *bi;
2819 struct tree_block *block;
2820 struct rb_node *rb_node;
2821 u32 item_size;
2822 int level = -1;
7fdf4b60 2823 u64 generation;
5d4f98a2
YZ
2824
2825 eb = path->nodes[0];
2826 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2827
3173a18f
JB
2828 if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
2829 item_size >= sizeof(*ei) + sizeof(*bi)) {
5d4f98a2
YZ
2830 ei = btrfs_item_ptr(eb, path->slots[0],
2831 struct btrfs_extent_item);
3173a18f
JB
2832 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
2833 bi = (struct btrfs_tree_block_info *)(ei + 1);
2834 level = btrfs_tree_block_level(eb, bi);
2835 } else {
2836 level = (int)extent_key->offset;
2837 }
5d4f98a2 2838 generation = btrfs_extent_generation(eb, ei);
6d8ff4e4 2839 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
2840 btrfs_print_v0_err(eb->fs_info);
2841 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
2842 return -EINVAL;
5d4f98a2 2843 } else {
5d4f98a2 2844 BUG();
5d4f98a2
YZ
2845 }
2846
b3b4aa74 2847 btrfs_release_path(path);
5d4f98a2
YZ
2848
2849 BUG_ON(level == -1);
2850
2851 block = kmalloc(sizeof(*block), GFP_NOFS);
2852 if (!block)
2853 return -ENOMEM;
2854
2855 block->bytenr = extent_key->objectid;
da17066c 2856 block->key.objectid = rc->extent_root->fs_info->nodesize;
5d4f98a2
YZ
2857 block->key.offset = generation;
2858 block->level = level;
2859 block->key_ready = 0;
2860
e9a28dc5 2861 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
43c04fb1 2862 if (rb_node)
982c92cb
QW
2863 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
2864 -EEXIST);
5d4f98a2
YZ
2865
2866 return 0;
2867}
2868
2869/*
2870 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2871 */
2872static int __add_tree_block(struct reloc_control *rc,
2873 u64 bytenr, u32 blocksize,
2874 struct rb_root *blocks)
2875{
0b246afa 2876 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
2877 struct btrfs_path *path;
2878 struct btrfs_key key;
2879 int ret;
0b246afa 2880 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5d4f98a2 2881
7476dfda 2882 if (tree_block_processed(bytenr, rc))
5d4f98a2
YZ
2883 return 0;
2884
e9a28dc5 2885 if (rb_simple_search(blocks, bytenr))
5d4f98a2
YZ
2886 return 0;
2887
2888 path = btrfs_alloc_path();
2889 if (!path)
2890 return -ENOMEM;
aee68ee5 2891again:
5d4f98a2 2892 key.objectid = bytenr;
aee68ee5
JB
2893 if (skinny) {
2894 key.type = BTRFS_METADATA_ITEM_KEY;
2895 key.offset = (u64)-1;
2896 } else {
2897 key.type = BTRFS_EXTENT_ITEM_KEY;
2898 key.offset = blocksize;
2899 }
5d4f98a2
YZ
2900
2901 path->search_commit_root = 1;
2902 path->skip_locking = 1;
2903 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2904 if (ret < 0)
2905 goto out;
5d4f98a2 2906
aee68ee5
JB
2907 if (ret > 0 && skinny) {
2908 if (path->slots[0]) {
2909 path->slots[0]--;
2910 btrfs_item_key_to_cpu(path->nodes[0], &key,
2911 path->slots[0]);
2912 if (key.objectid == bytenr &&
2913 (key.type == BTRFS_METADATA_ITEM_KEY ||
2914 (key.type == BTRFS_EXTENT_ITEM_KEY &&
2915 key.offset == blocksize)))
2916 ret = 0;
2917 }
2918
2919 if (ret) {
2920 skinny = false;
2921 btrfs_release_path(path);
2922 goto again;
2923 }
3173a18f 2924 }
cdccee99
LB
2925 if (ret) {
2926 ASSERT(ret == 1);
2927 btrfs_print_leaf(path->nodes[0]);
2928 btrfs_err(fs_info,
2929 "tree block extent item (%llu) is not found in extent tree",
2930 bytenr);
2931 WARN_ON(1);
2932 ret = -EINVAL;
2933 goto out;
2934 }
3173a18f 2935
5d4f98a2
YZ
2936 ret = add_tree_block(rc, &key, path, blocks);
2937out:
2938 btrfs_free_path(path);
2939 return ret;
2940}
2941
0af3d00b 2942static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
32da5386 2943 struct btrfs_block_group *block_group,
1bbc621e
CM
2944 struct inode *inode,
2945 u64 ino)
0af3d00b 2946{
0af3d00b
JB
2947 struct btrfs_root *root = fs_info->tree_root;
2948 struct btrfs_trans_handle *trans;
0af3d00b
JB
2949 int ret = 0;
2950
2951 if (inode)
2952 goto truncate;
2953
0202e83f 2954 inode = btrfs_iget(fs_info->sb, ino, root);
2e19f1f9 2955 if (IS_ERR(inode))
0af3d00b 2956 return -ENOENT;
0af3d00b
JB
2957
2958truncate:
2ff7e61e 2959 ret = btrfs_check_trunc_cache_free_space(fs_info,
7b61cd92
MX
2960 &fs_info->global_block_rsv);
2961 if (ret)
2962 goto out;
2963
7a7eaa40 2964 trans = btrfs_join_transaction(root);
0af3d00b 2965 if (IS_ERR(trans)) {
3612b495 2966 ret = PTR_ERR(trans);
0af3d00b
JB
2967 goto out;
2968 }
2969
77ab86bf 2970 ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
0af3d00b 2971
3a45bb20 2972 btrfs_end_transaction(trans);
2ff7e61e 2973 btrfs_btree_balance_dirty(fs_info);
0af3d00b
JB
2974out:
2975 iput(inode);
2976 return ret;
2977}
2978
5d4f98a2 2979/*
19b546d7
QW
2980 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
2981 * cache inode, to avoid free space cache data extent blocking data relocation.
5d4f98a2 2982 */
19b546d7
QW
2983static int delete_v1_space_cache(struct extent_buffer *leaf,
2984 struct btrfs_block_group *block_group,
2985 u64 data_bytenr)
5d4f98a2 2986{
19b546d7
QW
2987 u64 space_cache_ino;
2988 struct btrfs_file_extent_item *ei;
5d4f98a2 2989 struct btrfs_key key;
19b546d7
QW
2990 bool found = false;
2991 int i;
5d4f98a2
YZ
2992 int ret;
2993
19b546d7
QW
2994 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
2995 return 0;
5d4f98a2 2996
19b546d7
QW
2997 for (i = 0; i < btrfs_header_nritems(leaf); i++) {
2998 btrfs_item_key_to_cpu(leaf, &key, i);
2999 if (key.type != BTRFS_EXTENT_DATA_KEY)
3000 continue;
3001 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3002 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
3003 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3004 found = true;
3005 space_cache_ino = key.objectid;
5d4f98a2 3006 break;
5d4f98a2 3007 }
5d4f98a2 3008 }
19b546d7
QW
3009 if (!found)
3010 return -ENOENT;
3011 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3012 space_cache_ino);
3013 return ret;
5d4f98a2
YZ
3014}
3015
3016/*
2c016dc2 3017 * helper to find all tree blocks that reference a given data extent
5d4f98a2
YZ
3018 */
3019static noinline_for_stack
3020int add_data_references(struct reloc_control *rc,
3021 struct btrfs_key *extent_key,
3022 struct btrfs_path *path,
3023 struct rb_root *blocks)
3024{
19b546d7
QW
3025 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3026 struct ulist *leaves = NULL;
3027 struct ulist_iterator leaf_uiter;
3028 struct ulist_node *ref_node = NULL;
3029 const u32 blocksize = fs_info->nodesize;
647f63bd 3030 int ret = 0;
5d4f98a2 3031
19b546d7
QW
3032 btrfs_release_path(path);
3033 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3034 0, &leaves, NULL, true);
3035 if (ret < 0)
3036 return ret;
5d4f98a2 3037
19b546d7
QW
3038 ULIST_ITER_INIT(&leaf_uiter);
3039 while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3040 struct extent_buffer *eb;
5d4f98a2 3041
1b7ec85e 3042 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
19b546d7
QW
3043 if (IS_ERR(eb)) {
3044 ret = PTR_ERR(eb);
5d4f98a2 3045 break;
5d4f98a2 3046 }
19b546d7
QW
3047 ret = delete_v1_space_cache(eb, rc->block_group,
3048 extent_key->objectid);
3049 free_extent_buffer(eb);
3050 if (ret < 0)
3051 break;
3052 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3053 if (ret < 0)
5d4f98a2 3054 break;
5d4f98a2 3055 }
19b546d7 3056 if (ret < 0)
5d4f98a2 3057 free_block_list(blocks);
19b546d7
QW
3058 ulist_free(leaves);
3059 return ret;
5d4f98a2
YZ
3060}
3061
3062/*
2c016dc2 3063 * helper to find next unprocessed extent
5d4f98a2
YZ
3064 */
3065static noinline_for_stack
147d256e 3066int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3fd0a558 3067 struct btrfs_key *extent_key)
5d4f98a2 3068{
0b246afa 3069 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3070 struct btrfs_key key;
3071 struct extent_buffer *leaf;
3072 u64 start, end, last;
3073 int ret;
3074
b3470b5d 3075 last = rc->block_group->start + rc->block_group->length;
5d4f98a2
YZ
3076 while (1) {
3077 cond_resched();
3078 if (rc->search_start >= last) {
3079 ret = 1;
3080 break;
3081 }
3082
3083 key.objectid = rc->search_start;
3084 key.type = BTRFS_EXTENT_ITEM_KEY;
3085 key.offset = 0;
3086
3087 path->search_commit_root = 1;
3088 path->skip_locking = 1;
3089 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3090 0, 0);
3091 if (ret < 0)
3092 break;
3093next:
3094 leaf = path->nodes[0];
3095 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3096 ret = btrfs_next_leaf(rc->extent_root, path);
3097 if (ret != 0)
3098 break;
3099 leaf = path->nodes[0];
3100 }
3101
3102 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3103 if (key.objectid >= last) {
3104 ret = 1;
3105 break;
3106 }
3107
3173a18f
JB
3108 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3109 key.type != BTRFS_METADATA_ITEM_KEY) {
3110 path->slots[0]++;
3111 goto next;
3112 }
3113
3114 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5d4f98a2
YZ
3115 key.objectid + key.offset <= rc->search_start) {
3116 path->slots[0]++;
3117 goto next;
3118 }
3119
3173a18f 3120 if (key.type == BTRFS_METADATA_ITEM_KEY &&
0b246afa 3121 key.objectid + fs_info->nodesize <=
3173a18f
JB
3122 rc->search_start) {
3123 path->slots[0]++;
3124 goto next;
3125 }
3126
5d4f98a2
YZ
3127 ret = find_first_extent_bit(&rc->processed_blocks,
3128 key.objectid, &start, &end,
e6138876 3129 EXTENT_DIRTY, NULL);
5d4f98a2
YZ
3130
3131 if (ret == 0 && start <= key.objectid) {
b3b4aa74 3132 btrfs_release_path(path);
5d4f98a2
YZ
3133 rc->search_start = end + 1;
3134 } else {
3173a18f
JB
3135 if (key.type == BTRFS_EXTENT_ITEM_KEY)
3136 rc->search_start = key.objectid + key.offset;
3137 else
3138 rc->search_start = key.objectid +
0b246afa 3139 fs_info->nodesize;
3fd0a558 3140 memcpy(extent_key, &key, sizeof(key));
5d4f98a2
YZ
3141 return 0;
3142 }
3143 }
b3b4aa74 3144 btrfs_release_path(path);
5d4f98a2
YZ
3145 return ret;
3146}
3147
3148static void set_reloc_control(struct reloc_control *rc)
3149{
3150 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3151
3152 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3153 fs_info->reloc_ctl = rc;
7585717f 3154 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3155}
3156
3157static void unset_reloc_control(struct reloc_control *rc)
3158{
3159 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
7585717f
CM
3160
3161 mutex_lock(&fs_info->reloc_mutex);
5d4f98a2 3162 fs_info->reloc_ctl = NULL;
7585717f 3163 mutex_unlock(&fs_info->reloc_mutex);
5d4f98a2
YZ
3164}
3165
3166static int check_extent_flags(u64 flags)
3167{
3168 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3169 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3170 return 1;
3171 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3172 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3173 return 1;
3174 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3175 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3176 return 1;
3177 return 0;
3178}
3179
3fd0a558
YZ
3180static noinline_for_stack
3181int prepare_to_relocate(struct reloc_control *rc)
3182{
3183 struct btrfs_trans_handle *trans;
ac2fabac 3184 int ret;
3fd0a558 3185
2ff7e61e 3186 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
66d8f3dd 3187 BTRFS_BLOCK_RSV_TEMP);
3fd0a558
YZ
3188 if (!rc->block_rsv)
3189 return -ENOMEM;
3190
3fd0a558 3191 memset(&rc->cluster, 0, sizeof(rc->cluster));
b3470b5d 3192 rc->search_start = rc->block_group->start;
3fd0a558
YZ
3193 rc->extents_found = 0;
3194 rc->nodes_relocated = 0;
3195 rc->merging_rsv_size = 0;
0647bf56 3196 rc->reserved_bytes = 0;
da17066c 3197 rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
0647bf56 3198 RELOCATION_RESERVED_NODES;
ac2fabac
JB
3199 ret = btrfs_block_rsv_refill(rc->extent_root,
3200 rc->block_rsv, rc->block_rsv->size,
3201 BTRFS_RESERVE_FLUSH_ALL);
3202 if (ret)
3203 return ret;
3fd0a558
YZ
3204
3205 rc->create_reloc_tree = 1;
3206 set_reloc_control(rc);
3207
7a7eaa40 3208 trans = btrfs_join_transaction(rc->extent_root);
28818947
LB
3209 if (IS_ERR(trans)) {
3210 unset_reloc_control(rc);
3211 /*
3212 * extent tree is not a ref_cow tree and has no reloc_root to
3213 * cleanup. And callers are responsible to free the above
3214 * block rsv.
3215 */
3216 return PTR_ERR(trans);
3217 }
3a45bb20 3218 btrfs_commit_transaction(trans);
3fd0a558
YZ
3219 return 0;
3220}
76dda93c 3221
5d4f98a2
YZ
3222static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3223{
2ff7e61e 3224 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
5d4f98a2
YZ
3225 struct rb_root blocks = RB_ROOT;
3226 struct btrfs_key key;
3227 struct btrfs_trans_handle *trans = NULL;
3228 struct btrfs_path *path;
3229 struct btrfs_extent_item *ei;
5d4f98a2
YZ
3230 u64 flags;
3231 u32 item_size;
3232 int ret;
3233 int err = 0;
c87f08ca 3234 int progress = 0;
5d4f98a2
YZ
3235
3236 path = btrfs_alloc_path();
3fd0a558 3237 if (!path)
5d4f98a2 3238 return -ENOMEM;
e4058b54 3239 path->reada = READA_FORWARD;
5d4f98a2 3240
3fd0a558
YZ
3241 ret = prepare_to_relocate(rc);
3242 if (ret) {
3243 err = ret;
3244 goto out_free;
3245 }
5d4f98a2
YZ
3246
3247 while (1) {
0647bf56
WS
3248 rc->reserved_bytes = 0;
3249 ret = btrfs_block_rsv_refill(rc->extent_root,
3250 rc->block_rsv, rc->block_rsv->size,
3251 BTRFS_RESERVE_FLUSH_ALL);
3252 if (ret) {
3253 err = ret;
3254 break;
3255 }
c87f08ca 3256 progress++;
a22285a6 3257 trans = btrfs_start_transaction(rc->extent_root, 0);
0f788c58
LB
3258 if (IS_ERR(trans)) {
3259 err = PTR_ERR(trans);
3260 trans = NULL;
3261 break;
3262 }
c87f08ca 3263restart:
3fd0a558 3264 if (update_backref_cache(trans, &rc->backref_cache)) {
3a45bb20 3265 btrfs_end_transaction(trans);
42a657f5 3266 trans = NULL;
3fd0a558
YZ
3267 continue;
3268 }
3269
147d256e 3270 ret = find_next_extent(rc, path, &key);
5d4f98a2
YZ
3271 if (ret < 0)
3272 err = ret;
3273 if (ret != 0)
3274 break;
3275
3276 rc->extents_found++;
3277
3278 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3279 struct btrfs_extent_item);
3fd0a558 3280 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
5d4f98a2
YZ
3281 if (item_size >= sizeof(*ei)) {
3282 flags = btrfs_extent_flags(path->nodes[0], ei);
3283 ret = check_extent_flags(flags);
3284 BUG_ON(ret);
6d8ff4e4 3285 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
ba3c2b19
NB
3286 err = -EINVAL;
3287 btrfs_print_v0_err(trans->fs_info);
3288 btrfs_abort_transaction(trans, err);
3289 break;
5d4f98a2 3290 } else {
5d4f98a2 3291 BUG();
5d4f98a2
YZ
3292 }
3293
3294 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3295 ret = add_tree_block(rc, &key, path, &blocks);
3296 } else if (rc->stage == UPDATE_DATA_PTRS &&
3fd0a558 3297 (flags & BTRFS_EXTENT_FLAG_DATA)) {
5d4f98a2
YZ
3298 ret = add_data_references(rc, &key, path, &blocks);
3299 } else {
b3b4aa74 3300 btrfs_release_path(path);
5d4f98a2
YZ
3301 ret = 0;
3302 }
3303 if (ret < 0) {
3fd0a558 3304 err = ret;
5d4f98a2
YZ
3305 break;
3306 }
3307
3308 if (!RB_EMPTY_ROOT(&blocks)) {
3309 ret = relocate_tree_blocks(trans, rc, &blocks);
3310 if (ret < 0) {
3fd0a558
YZ
3311 if (ret != -EAGAIN) {
3312 err = ret;
3313 break;
3314 }
3315 rc->extents_found--;
3316 rc->search_start = key.objectid;
3317 }
3318 }
3319
3a45bb20 3320 btrfs_end_transaction_throttle(trans);
2ff7e61e 3321 btrfs_btree_balance_dirty(fs_info);
5d4f98a2 3322 trans = NULL;
5d4f98a2
YZ
3323
3324 if (rc->stage == MOVE_DATA_EXTENTS &&
3325 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3326 rc->found_file_extent = 1;
0257bb82 3327 ret = relocate_data_extent(rc->data_inode,
3fd0a558 3328 &key, &rc->cluster);
5d4f98a2
YZ
3329 if (ret < 0) {
3330 err = ret;
3331 break;
3332 }
3333 }
f31ea088
QW
3334 if (btrfs_should_cancel_balance(fs_info)) {
3335 err = -ECANCELED;
3336 break;
3337 }
5d4f98a2 3338 }
c87f08ca 3339 if (trans && progress && err == -ENOSPC) {
43a7e99d 3340 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
9689457b 3341 if (ret == 1) {
c87f08ca
CM
3342 err = 0;
3343 progress = 0;
3344 goto restart;
3345 }
3346 }
3fd0a558 3347
b3b4aa74 3348 btrfs_release_path(path);
91166212 3349 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
5d4f98a2
YZ
3350
3351 if (trans) {
3a45bb20 3352 btrfs_end_transaction_throttle(trans);
2ff7e61e 3353 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
3354 }
3355
0257bb82 3356 if (!err) {
3fd0a558
YZ
3357 ret = relocate_file_extent_cluster(rc->data_inode,
3358 &rc->cluster);
0257bb82
YZ
3359 if (ret < 0)
3360 err = ret;
3361 }
3362
3fd0a558
YZ
3363 rc->create_reloc_tree = 0;
3364 set_reloc_control(rc);
0257bb82 3365
13fe1bdb 3366 btrfs_backref_release_cache(&rc->backref_cache);
63f018be 3367 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2 3368
7f913c7c
QW
3369 /*
3370 * Even in the case when the relocation is cancelled, we should all go
3371 * through prepare_to_merge() and merge_reloc_roots().
3372 *
3373 * For error (including cancelled balance), prepare_to_merge() will
3374 * mark all reloc trees orphan, then queue them for cleanup in
3375 * merge_reloc_roots()
3376 */
3fd0a558 3377 err = prepare_to_merge(rc, err);
5d4f98a2
YZ
3378
3379 merge_reloc_roots(rc);
3380
3fd0a558 3381 rc->merge_reloc_tree = 0;
5d4f98a2 3382 unset_reloc_control(rc);
63f018be 3383 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
5d4f98a2
YZ
3384
3385 /* get rid of pinned extents */
7a7eaa40 3386 trans = btrfs_join_transaction(rc->extent_root);
62b99540 3387 if (IS_ERR(trans)) {
3612b495 3388 err = PTR_ERR(trans);
62b99540
QW
3389 goto out_free;
3390 }
3a45bb20 3391 btrfs_commit_transaction(trans);
6217b0fa 3392out_free:
d2311e69
QW
3393 ret = clean_dirty_subvols(rc);
3394 if (ret < 0 && !err)
3395 err = ret;
2ff7e61e 3396 btrfs_free_block_rsv(fs_info, rc->block_rsv);
3fd0a558 3397 btrfs_free_path(path);
5d4f98a2
YZ
3398 return err;
3399}
3400
3401static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
0257bb82 3402 struct btrfs_root *root, u64 objectid)
5d4f98a2
YZ
3403{
3404 struct btrfs_path *path;
3405 struct btrfs_inode_item *item;
3406 struct extent_buffer *leaf;
3407 int ret;
3408
3409 path = btrfs_alloc_path();
3410 if (!path)
3411 return -ENOMEM;
3412
3413 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3414 if (ret)
3415 goto out;
3416
3417 leaf = path->nodes[0];
3418 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
b159fa28 3419 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
5d4f98a2 3420 btrfs_set_inode_generation(leaf, item, 1);
0257bb82 3421 btrfs_set_inode_size(leaf, item, 0);
5d4f98a2 3422 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3fd0a558
YZ
3423 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3424 BTRFS_INODE_PREALLOC);
5d4f98a2 3425 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
3426out:
3427 btrfs_free_path(path);
3428 return ret;
3429}
3430
3431/*
3432 * helper to create inode for data relocation.
3433 * the inode is in data relocation tree and its link count is 0
3434 */
3fd0a558
YZ
3435static noinline_for_stack
3436struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
32da5386 3437 struct btrfs_block_group *group)
5d4f98a2
YZ
3438{
3439 struct inode *inode = NULL;
3440 struct btrfs_trans_handle *trans;
3441 struct btrfs_root *root;
4624900d 3442 u64 objectid;
5d4f98a2
YZ
3443 int err = 0;
3444
aeb935a4 3445 root = btrfs_grab_root(fs_info->data_reloc_root);
a22285a6 3446 trans = btrfs_start_transaction(root, 6);
76deacf0 3447 if (IS_ERR(trans)) {
00246528 3448 btrfs_put_root(root);
3fd0a558 3449 return ERR_CAST(trans);
76deacf0 3450 }
5d4f98a2 3451
581bb050 3452 err = btrfs_find_free_objectid(root, &objectid);
5d4f98a2
YZ
3453 if (err)
3454 goto out;
3455
0257bb82 3456 err = __insert_orphan_inode(trans, root, objectid);
5d4f98a2
YZ
3457 BUG_ON(err);
3458
0202e83f 3459 inode = btrfs_iget(fs_info->sb, objectid, root);
2e19f1f9 3460 BUG_ON(IS_ERR(inode));
b3470b5d 3461 BTRFS_I(inode)->index_cnt = group->start;
5d4f98a2 3462
73f2e545 3463 err = btrfs_orphan_add(trans, BTRFS_I(inode));
5d4f98a2 3464out:
00246528 3465 btrfs_put_root(root);
3a45bb20 3466 btrfs_end_transaction(trans);
2ff7e61e 3467 btrfs_btree_balance_dirty(fs_info);
5d4f98a2
YZ
3468 if (err) {
3469 if (inode)
3470 iput(inode);
3471 inode = ERR_PTR(err);
3472 }
3473 return inode;
3474}
3475
c258d6e3 3476static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3fd0a558
YZ
3477{
3478 struct reloc_control *rc;
3479
3480 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3481 if (!rc)
3482 return NULL;
3483
3484 INIT_LIST_HEAD(&rc->reloc_roots);
d2311e69 3485 INIT_LIST_HEAD(&rc->dirty_subvol_roots);
584fb121 3486 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3fd0a558 3487 mapping_tree_init(&rc->reloc_root_tree);
43eb5f29
QW
3488 extent_io_tree_init(fs_info, &rc->processed_blocks,
3489 IO_TREE_RELOC_BLOCKS, NULL);
3fd0a558
YZ
3490 return rc;
3491}
3492
1a0afa0e
JB
3493static void free_reloc_control(struct reloc_control *rc)
3494{
3495 struct mapping_node *node, *tmp;
3496
3497 free_reloc_roots(&rc->reloc_roots);
3498 rbtree_postorder_for_each_entry_safe(node, tmp,
3499 &rc->reloc_root_tree.rb_root, rb_node)
3500 kfree(node);
3501
3502 kfree(rc);
3503}
3504
ebce0e01
AB
3505/*
3506 * Print the block group being relocated
3507 */
3508static void describe_relocation(struct btrfs_fs_info *fs_info,
32da5386 3509 struct btrfs_block_group *block_group)
ebce0e01 3510{
f89e09cf 3511 char buf[128] = {'\0'};
ebce0e01 3512
f89e09cf 3513 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
ebce0e01
AB
3514
3515 btrfs_info(fs_info,
3516 "relocating block group %llu flags %s",
b3470b5d 3517 block_group->start, buf);
ebce0e01
AB
3518}
3519
430640e3
QW
3520static const char *stage_to_string(int stage)
3521{
3522 if (stage == MOVE_DATA_EXTENTS)
3523 return "move data extents";
3524 if (stage == UPDATE_DATA_PTRS)
3525 return "update data pointers";
3526 return "unknown";
3527}
3528
5d4f98a2
YZ
3529/*
3530 * function to relocate all extents in a block group.
3531 */
6bccf3ab 3532int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
5d4f98a2 3533{
32da5386 3534 struct btrfs_block_group *bg;
6bccf3ab 3535 struct btrfs_root *extent_root = fs_info->extent_root;
5d4f98a2 3536 struct reloc_control *rc;
0af3d00b
JB
3537 struct inode *inode;
3538 struct btrfs_path *path;
5d4f98a2 3539 int ret;
f0486c68 3540 int rw = 0;
5d4f98a2
YZ
3541 int err = 0;
3542
eede2bf3
OS
3543 bg = btrfs_lookup_block_group(fs_info, group_start);
3544 if (!bg)
3545 return -ENOENT;
3546
3547 if (btrfs_pinned_by_swapfile(fs_info, bg)) {
3548 btrfs_put_block_group(bg);
3549 return -ETXTBSY;
3550 }
3551
c258d6e3 3552 rc = alloc_reloc_control(fs_info);
eede2bf3
OS
3553 if (!rc) {
3554 btrfs_put_block_group(bg);
5d4f98a2 3555 return -ENOMEM;
eede2bf3 3556 }
5d4f98a2 3557
f0486c68 3558 rc->extent_root = extent_root;
eede2bf3 3559 rc->block_group = bg;
5d4f98a2 3560
b12de528 3561 ret = btrfs_inc_block_group_ro(rc->block_group, true);
868f401a
Z
3562 if (ret) {
3563 err = ret;
3564 goto out;
f0486c68 3565 }
868f401a 3566 rw = 1;
f0486c68 3567
0af3d00b
JB
3568 path = btrfs_alloc_path();
3569 if (!path) {
3570 err = -ENOMEM;
3571 goto out;
3572 }
3573
7949f339 3574 inode = lookup_free_space_inode(rc->block_group, path);
0af3d00b
JB
3575 btrfs_free_path(path);
3576
3577 if (!IS_ERR(inode))
1bbc621e 3578 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
0af3d00b
JB
3579 else
3580 ret = PTR_ERR(inode);
3581
3582 if (ret && ret != -ENOENT) {
3583 err = ret;
3584 goto out;
3585 }
3586
5d4f98a2
YZ
3587 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3588 if (IS_ERR(rc->data_inode)) {
3589 err = PTR_ERR(rc->data_inode);
3590 rc->data_inode = NULL;
3591 goto out;
3592 }
3593
0b246afa 3594 describe_relocation(fs_info, rc->block_group);
5d4f98a2 3595
9cfa3e34 3596 btrfs_wait_block_group_reservations(rc->block_group);
f78c436c 3597 btrfs_wait_nocow_writers(rc->block_group);
6374e57a 3598 btrfs_wait_ordered_roots(fs_info, U64_MAX,
b3470b5d
DS
3599 rc->block_group->start,
3600 rc->block_group->length);
5d4f98a2
YZ
3601
3602 while (1) {
430640e3
QW
3603 int finishes_stage;
3604
76dda93c 3605 mutex_lock(&fs_info->cleaner_mutex);
5d4f98a2 3606 ret = relocate_block_group(rc);
76dda93c 3607 mutex_unlock(&fs_info->cleaner_mutex);
ff612ba7 3608 if (ret < 0)
5d4f98a2 3609 err = ret;
5d4f98a2 3610
430640e3 3611 finishes_stage = rc->stage;
ff612ba7
JB
3612 /*
3613 * We may have gotten ENOSPC after we already dirtied some
3614 * extents. If writeout happens while we're relocating a
3615 * different block group we could end up hitting the
3616 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
3617 * btrfs_reloc_cow_block. Make sure we write everything out
3618 * properly so we don't trip over this problem, and then break
3619 * out of the loop if we hit an error.
3620 */
5d4f98a2 3621 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
0ef8b726
JB
3622 ret = btrfs_wait_ordered_range(rc->data_inode, 0,
3623 (u64)-1);
ff612ba7 3624 if (ret)
0ef8b726 3625 err = ret;
5d4f98a2
YZ
3626 invalidate_mapping_pages(rc->data_inode->i_mapping,
3627 0, -1);
3628 rc->stage = UPDATE_DATA_PTRS;
5d4f98a2 3629 }
ff612ba7
JB
3630
3631 if (err < 0)
3632 goto out;
3633
3634 if (rc->extents_found == 0)
3635 break;
3636
430640e3
QW
3637 btrfs_info(fs_info, "found %llu extents, stage: %s",
3638 rc->extents_found, stage_to_string(finishes_stage));
5d4f98a2
YZ
3639 }
3640
5d4f98a2
YZ
3641 WARN_ON(rc->block_group->pinned > 0);
3642 WARN_ON(rc->block_group->reserved > 0);
bf38be65 3643 WARN_ON(rc->block_group->used > 0);
5d4f98a2 3644out:
f0486c68 3645 if (err && rw)
2ff7e61e 3646 btrfs_dec_block_group_ro(rc->block_group);
5d4f98a2 3647 iput(rc->data_inode);
5d4f98a2 3648 btrfs_put_block_group(rc->block_group);
1a0afa0e 3649 free_reloc_control(rc);
5d4f98a2
YZ
3650 return err;
3651}
3652
76dda93c
YZ
3653static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3654{
0b246afa 3655 struct btrfs_fs_info *fs_info = root->fs_info;
76dda93c 3656 struct btrfs_trans_handle *trans;
79787eaa 3657 int ret, err;
76dda93c 3658
0b246afa 3659 trans = btrfs_start_transaction(fs_info->tree_root, 0);
79787eaa
JM
3660 if (IS_ERR(trans))
3661 return PTR_ERR(trans);
76dda93c
YZ
3662
3663 memset(&root->root_item.drop_progress, 0,
3664 sizeof(root->root_item.drop_progress));
c8422684 3665 btrfs_set_root_drop_level(&root->root_item, 0);
76dda93c 3666 btrfs_set_root_refs(&root->root_item, 0);
0b246afa 3667 ret = btrfs_update_root(trans, fs_info->tree_root,
76dda93c 3668 &root->root_key, &root->root_item);
76dda93c 3669
3a45bb20 3670 err = btrfs_end_transaction(trans);
79787eaa
JM
3671 if (err)
3672 return err;
3673 return ret;
76dda93c
YZ
3674}
3675
5d4f98a2
YZ
3676/*
3677 * recover relocation interrupted by system crash.
3678 *
3679 * this function resumes merging reloc trees with corresponding fs trees.
3680 * this is important for keeping the sharing of tree blocks
3681 */
3682int btrfs_recover_relocation(struct btrfs_root *root)
3683{
0b246afa 3684 struct btrfs_fs_info *fs_info = root->fs_info;
5d4f98a2
YZ
3685 LIST_HEAD(reloc_roots);
3686 struct btrfs_key key;
3687 struct btrfs_root *fs_root;
3688 struct btrfs_root *reloc_root;
3689 struct btrfs_path *path;
3690 struct extent_buffer *leaf;
3691 struct reloc_control *rc = NULL;
3692 struct btrfs_trans_handle *trans;
3693 int ret;
3694 int err = 0;
3695
3696 path = btrfs_alloc_path();
3697 if (!path)
3698 return -ENOMEM;
e4058b54 3699 path->reada = READA_BACK;
5d4f98a2
YZ
3700
3701 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3702 key.type = BTRFS_ROOT_ITEM_KEY;
3703 key.offset = (u64)-1;
3704
3705 while (1) {
0b246afa 3706 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
5d4f98a2
YZ
3707 path, 0, 0);
3708 if (ret < 0) {
3709 err = ret;
3710 goto out;
3711 }
3712 if (ret > 0) {
3713 if (path->slots[0] == 0)
3714 break;
3715 path->slots[0]--;
3716 }
3717 leaf = path->nodes[0];
3718 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
b3b4aa74 3719 btrfs_release_path(path);
5d4f98a2
YZ
3720
3721 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3722 key.type != BTRFS_ROOT_ITEM_KEY)
3723 break;
3724
3dbf1738 3725 reloc_root = btrfs_read_tree_root(root, &key);
5d4f98a2
YZ
3726 if (IS_ERR(reloc_root)) {
3727 err = PTR_ERR(reloc_root);
3728 goto out;
3729 }
3730
92a7cc42 3731 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
5d4f98a2
YZ
3732 list_add(&reloc_root->root_list, &reloc_roots);
3733
3734 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
a820feb5
DS
3735 fs_root = btrfs_get_fs_root(fs_info,
3736 reloc_root->root_key.offset, false);
5d4f98a2 3737 if (IS_ERR(fs_root)) {
76dda93c
YZ
3738 ret = PTR_ERR(fs_root);
3739 if (ret != -ENOENT) {
3740 err = ret;
3741 goto out;
3742 }
79787eaa
JM
3743 ret = mark_garbage_root(reloc_root);
3744 if (ret < 0) {
3745 err = ret;
3746 goto out;
3747 }
932fd26d 3748 } else {
00246528 3749 btrfs_put_root(fs_root);
5d4f98a2
YZ
3750 }
3751 }
3752
3753 if (key.offset == 0)
3754 break;
3755
3756 key.offset--;
3757 }
b3b4aa74 3758 btrfs_release_path(path);
5d4f98a2
YZ
3759
3760 if (list_empty(&reloc_roots))
3761 goto out;
3762
c258d6e3 3763 rc = alloc_reloc_control(fs_info);
5d4f98a2
YZ
3764 if (!rc) {
3765 err = -ENOMEM;
3766 goto out;
3767 }
3768
0b246afa 3769 rc->extent_root = fs_info->extent_root;
5d4f98a2
YZ
3770
3771 set_reloc_control(rc);
3772
7a7eaa40 3773 trans = btrfs_join_transaction(rc->extent_root);
3612b495 3774 if (IS_ERR(trans)) {
3612b495 3775 err = PTR_ERR(trans);
fb2d83ee 3776 goto out_unset;
3612b495 3777 }
3fd0a558
YZ
3778
3779 rc->merge_reloc_tree = 1;
3780
5d4f98a2
YZ
3781 while (!list_empty(&reloc_roots)) {
3782 reloc_root = list_entry(reloc_roots.next,
3783 struct btrfs_root, root_list);
3784 list_del(&reloc_root->root_list);
3785
3786 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3787 list_add_tail(&reloc_root->root_list,
3788 &rc->reloc_roots);
3789 continue;
3790 }
3791
a820feb5
DS
3792 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
3793 false);
79787eaa
JM
3794 if (IS_ERR(fs_root)) {
3795 err = PTR_ERR(fs_root);
ca1aa281 3796 list_add_tail(&reloc_root->root_list, &reloc_roots);
1402d17d 3797 btrfs_end_transaction(trans);
fb2d83ee 3798 goto out_unset;
79787eaa 3799 }
5d4f98a2 3800
ffd7b339 3801 err = __add_reloc_root(reloc_root);
79787eaa 3802 BUG_ON(err < 0); /* -ENOMEM or logic error */
f44deb74 3803 fs_root->reloc_root = btrfs_grab_root(reloc_root);
00246528 3804 btrfs_put_root(fs_root);
5d4f98a2
YZ
3805 }
3806
3a45bb20 3807 err = btrfs_commit_transaction(trans);
79787eaa 3808 if (err)
fb2d83ee 3809 goto out_unset;
5d4f98a2
YZ
3810
3811 merge_reloc_roots(rc);
3812
3813 unset_reloc_control(rc);
3814
7a7eaa40 3815 trans = btrfs_join_transaction(rc->extent_root);
62b99540 3816 if (IS_ERR(trans)) {
3612b495 3817 err = PTR_ERR(trans);
6217b0fa 3818 goto out_clean;
62b99540 3819 }
3a45bb20 3820 err = btrfs_commit_transaction(trans);
6217b0fa 3821out_clean:
d2311e69
QW
3822 ret = clean_dirty_subvols(rc);
3823 if (ret < 0 && !err)
3824 err = ret;
fb2d83ee
JB
3825out_unset:
3826 unset_reloc_control(rc);
1a0afa0e 3827 free_reloc_control(rc);
3612b495 3828out:
a7571232 3829 free_reloc_roots(&reloc_roots);
aca1bba6 3830
5d4f98a2
YZ
3831 btrfs_free_path(path);
3832
3833 if (err == 0) {
3834 /* cleanup orphan inode in data relocation tree */
aeb935a4
QW
3835 fs_root = btrfs_grab_root(fs_info->data_reloc_root);
3836 ASSERT(fs_root);
3837 err = btrfs_orphan_cleanup(fs_root);
3838 btrfs_put_root(fs_root);
5d4f98a2
YZ
3839 }
3840 return err;
3841}
3842
3843/*
3844 * helper to add ordered checksum for data relocation.
3845 *
3846 * cloning checksum properly handles the nodatasum extents.
3847 * it also saves CPU time to re-calculate the checksum.
3848 */
7bfa9535 3849int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
5d4f98a2 3850{
7bfa9535 3851 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5d4f98a2 3852 struct btrfs_ordered_sum *sums;
5d4f98a2 3853 struct btrfs_ordered_extent *ordered;
5d4f98a2
YZ
3854 int ret;
3855 u64 disk_bytenr;
4577b014 3856 u64 new_bytenr;
5d4f98a2
YZ
3857 LIST_HEAD(list);
3858
7bfa9535 3859 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
bffe633e 3860 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
5d4f98a2 3861
7bfa9535 3862 disk_bytenr = file_pos + inode->index_cnt;
0b246afa 3863 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
a2de733c 3864 disk_bytenr + len - 1, &list, 0);
79787eaa
JM
3865 if (ret)
3866 goto out;
5d4f98a2
YZ
3867
3868 while (!list_empty(&list)) {
3869 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3870 list_del_init(&sums->list);
3871
4577b014
JB
3872 /*
3873 * We need to offset the new_bytenr based on where the csum is.
3874 * We need to do this because we will read in entire prealloc
3875 * extents but we may have written to say the middle of the
3876 * prealloc extent, so we need to make sure the csum goes with
3877 * the right disk offset.
3878 *
3879 * We can do this because the data reloc inode refers strictly
3880 * to the on disk bytes, so we don't have to worry about
3881 * disk_len vs real len like with real inodes since it's all
3882 * disk length.
3883 */
bffe633e 3884 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
4577b014 3885 sums->bytenr = new_bytenr;
5d4f98a2 3886
f9756261 3887 btrfs_add_ordered_sum(ordered, sums);
5d4f98a2 3888 }
79787eaa 3889out:
5d4f98a2 3890 btrfs_put_ordered_extent(ordered);
411fc6bc 3891 return ret;
5d4f98a2 3892}
3fd0a558 3893
83d4cfd4
JB
3894int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
3895 struct btrfs_root *root, struct extent_buffer *buf,
3896 struct extent_buffer *cow)
3fd0a558 3897{
0b246afa 3898 struct btrfs_fs_info *fs_info = root->fs_info;
3fd0a558 3899 struct reloc_control *rc;
a26195a5 3900 struct btrfs_backref_node *node;
3fd0a558
YZ
3901 int first_cow = 0;
3902 int level;
83d4cfd4 3903 int ret = 0;
3fd0a558 3904
0b246afa 3905 rc = fs_info->reloc_ctl;
3fd0a558 3906 if (!rc)
83d4cfd4 3907 return 0;
3fd0a558
YZ
3908
3909 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
3910 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
3911
3912 level = btrfs_header_level(buf);
3913 if (btrfs_header_generation(buf) <=
3914 btrfs_root_last_snapshot(&root->root_item))
3915 first_cow = 1;
3916
3917 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
3918 rc->create_reloc_tree) {
3919 WARN_ON(!first_cow && level == 0);
3920
3921 node = rc->backref_cache.path[level];
3922 BUG_ON(node->bytenr != buf->start &&
3923 node->new_bytenr != buf->start);
3924
b0fe7078 3925 btrfs_backref_drop_node_buffer(node);
67439dad 3926 atomic_inc(&cow->refs);
3fd0a558
YZ
3927 node->eb = cow;
3928 node->new_bytenr = cow->start;
3929
3930 if (!node->pending) {
3931 list_move_tail(&node->list,
3932 &rc->backref_cache.pending[level]);
3933 node->pending = 1;
3934 }
3935
3936 if (first_cow)
9569cc20 3937 mark_block_processed(rc, node);
3fd0a558
YZ
3938
3939 if (first_cow && level > 0)
3940 rc->nodes_relocated += buf->len;
3941 }
3942
83d4cfd4 3943 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
3fd0a558 3944 ret = replace_file_extents(trans, rc, root, cow);
83d4cfd4 3945 return ret;
3fd0a558
YZ
3946}
3947
3948/*
3949 * called before creating snapshot. it calculates metadata reservation
01327610 3950 * required for relocating tree blocks in the snapshot
3fd0a558 3951 */
147d256e 3952void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
3fd0a558
YZ
3953 u64 *bytes_to_reserve)
3954{
10995c04
QW
3955 struct btrfs_root *root = pending->root;
3956 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558 3957
6282675e 3958 if (!rc || !have_reloc_root(root))
3fd0a558
YZ
3959 return;
3960
3fd0a558
YZ
3961 if (!rc->merge_reloc_tree)
3962 return;
3963
3964 root = root->reloc_root;
3965 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
3966 /*
3967 * relocation is in the stage of merging trees. the space
3968 * used by merging a reloc tree is twice the size of
3969 * relocated tree nodes in the worst case. half for cowing
3970 * the reloc tree, half for cowing the fs tree. the space
3971 * used by cowing the reloc tree will be freed after the
3972 * tree is dropped. if we create snapshot, cowing the fs
3973 * tree may use more space than it frees. so we need
3974 * reserve extra space.
3975 */
3976 *bytes_to_reserve += rc->nodes_relocated;
3977}
3978
3979/*
3980 * called after snapshot is created. migrate block reservation
3981 * and create reloc root for the newly created snapshot
f44deb74
JB
3982 *
3983 * This is similar to btrfs_init_reloc_root(), we come out of here with two
3984 * references held on the reloc_root, one for root->reloc_root and one for
3985 * rc->reloc_roots.
3fd0a558 3986 */
49b25e05 3987int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3fd0a558
YZ
3988 struct btrfs_pending_snapshot *pending)
3989{
3990 struct btrfs_root *root = pending->root;
3991 struct btrfs_root *reloc_root;
3992 struct btrfs_root *new_root;
10995c04 3993 struct reloc_control *rc = root->fs_info->reloc_ctl;
3fd0a558
YZ
3994 int ret;
3995
6282675e 3996 if (!rc || !have_reloc_root(root))
49b25e05 3997 return 0;
3fd0a558
YZ
3998
3999 rc = root->fs_info->reloc_ctl;
4000 rc->merging_rsv_size += rc->nodes_relocated;
4001
4002 if (rc->merge_reloc_tree) {
4003 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4004 rc->block_rsv,
3a584174 4005 rc->nodes_relocated, true);
49b25e05
JM
4006 if (ret)
4007 return ret;
3fd0a558
YZ
4008 }
4009
4010 new_root = pending->snap;
4011 reloc_root = create_reloc_root(trans, root->reloc_root,
4012 new_root->root_key.objectid);
49b25e05
JM
4013 if (IS_ERR(reloc_root))
4014 return PTR_ERR(reloc_root);
3fd0a558 4015
ffd7b339
JM
4016 ret = __add_reloc_root(reloc_root);
4017 BUG_ON(ret < 0);
f44deb74 4018 new_root->reloc_root = btrfs_grab_root(reloc_root);
3fd0a558 4019
49b25e05 4020 if (rc->create_reloc_tree)
3fd0a558 4021 ret = clone_backref_node(trans, rc, root, reloc_root);
49b25e05 4022 return ret;
3fd0a558 4023}