// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
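
/*
 * A rough orientation sketch of the call flow, naming only functions defined
 * in this file (simplified; error handling and several intermediate helpers
 * omitted):
 *
 *   relocate_block_group()
 *     -> build_backref_tree()        cache backrefs for each candidate block
 *     -> prepare_to_merge()          reserve metadata, pin reloc root refs
 *     -> merge_reloc_roots()
 *          -> merge_reloc_root()     walk each reloc tree
 *               -> replace_path()    swap blocks between fs and reloc tree
 *     -> clean_dirty_subvols()       drop the now-merged reloc trees
 */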

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represent a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

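/*
 * A cluster of adjacent data extents that get copied into the data reloc
 * inode in one pass; boundary[] records the start offset of each extent
 * within [start, end].
 */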
struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

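/*
 * Mark the block of a backref node as processed.  Leaves and blocks that
 * fall inside the block group being relocated are also recorded in
 * rc->processed_blocks; the node itself is always flagged as processed.
 */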
static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bits(&rc->processed_blocks, node->bytenr,
				node->bytenr + blocksize - 1, EXTENT_DIRTY);
	}
	node->processed = 1;
}

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

/*
 * walk up backref nodes until we reach the node that represents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

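/*
 * Re-key a cached backref node after its block has moved: remove it from
 * the cache's rb-tree, update the bytenr and re-insert.  A collision on
 * re-insert indicates cache corruption and is fatal.
 */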
static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;

	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * Detached nodes are used to avoid unnecessary backref lookups.
	 * A transaction commit changes the extent tree, so the detached
	 * nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * Some nodes can be left in the pending list if there were
	 * errors during processing of the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pairs with set_bit/clear_bit in clean_dirty_subvols() and
	 * btrfs_update_reloc_root().  We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, then backref lookup can find the reloc tree.
	 * In that case the backref node for the fs tree root is useless
	 * for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping nodes detached
 *   For non-leaf nodes, the node is still cached as "detached".
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block.  Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively.  The recursion stops when the
 * tree root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
						  cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * Helper to add a backref node for the newly created snapshot.
 * The backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook.
		 * The source tree is a reloc tree, and all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers, so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * Create a reloc tree for a given fs tree.  The reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root.  This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees.  Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root.  We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * helper to find first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * If we are modifying a block in the fs tree, wait for
		 * readpage to complete and drop the extent cache.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end);
				if (!ret)
					continue;

				btrfs_drop_extent_cache(BTRFS_I(inode),
						key.offset, end, 1);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

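/*
 * Compare the node key at @slot of @eb with the key at the current
 * position of @path at @level; returns 0 if the two keys are equal.
 */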
static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in fs tree with the new blocks
 * in reloc tree.  Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * If a block was replaced, level of the block + 1 is returned.
 * If no block got replaced, 0 is returned.  If there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		BUG_ON(ret);
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				BUG_ON(ret);
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		path->lowest_level = 0;
		BUG_ON(ret);

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose keys are in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   struct btrfs_key *min_key,
				   struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for readpage to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
	}
	return 0;
}

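/*
 * Find the key of the next sibling slot at @level or above.  Returns 0 and
 * fills @key on success, 1 if the path is already at the last slot of
 * every level.
 */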
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	btrfs_update_reloc_root(trans, root);

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}
}

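/*
 * Drop the reloc trees queued on reloc_control::dirty_subvol_roots.  For a
 * merged subvolume this detaches root->reloc_root and clears the
 * DEAD_RELOC_TREE bit before dropping; orphan reloc trees are dropped
 * directly.
 */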
static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL.  Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root.  If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * doubled since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list.  At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * Save the merging progress in the drop_progress.
		 * This is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * Handle the case where only one block in the fs tree needs to be
	 * relocated and that block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0)
		insert_dirty_subvol(trans, rc, root);

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

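/*
 * Reserve metadata for merging the reloc trees and set the root refs of
 * every reloc root to 1 so that btrfs_recover_relocation() can resume the
 * merge after an interruption.
 */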
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		BUG_ON(IS_ERR(root));
		BUG_ON(root->reloc_root != reloc_root);

		/*
		 * Set the reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging.
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		btrfs_update_reloc_root(trans, root);

		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

1871 static noinline_for_stack
1872 void free_reloc_roots(struct list_head *list)
1873 {
1874 struct btrfs_root *reloc_root, *tmp;
1875
1876 list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1877 __del_reloc_root(reloc_root);
1878 }
1879
1880 static noinline_for_stack
1881 void merge_reloc_roots(struct reloc_control *rc)
1882 {
1883 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1884 struct btrfs_root *root;
1885 struct btrfs_root *reloc_root;
1886 LIST_HEAD(reloc_roots);
1887 int found = 0;
1888 int ret = 0;
1889 again:
1890 root = rc->extent_root;
1891
1892 /*
1893 * this serializes us with btrfs_record_root_in_transaction,
1894 * we have to make sure nobody is in the middle of
1895 * adding their roots to the list while we are
1896 * doing this splice
1897 */
1898 mutex_lock(&fs_info->reloc_mutex);
1899 list_splice_init(&rc->reloc_roots, &reloc_roots);
1900 mutex_unlock(&fs_info->reloc_mutex);
1901
1902 while (!list_empty(&reloc_roots)) {
1903 found = 1;
1904 reloc_root = list_entry(reloc_roots.next,
1905 struct btrfs_root, root_list);
1906
1907 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1908 false);
1909 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1910 BUG_ON(IS_ERR(root));
1911 BUG_ON(root->reloc_root != reloc_root);
1912 ret = merge_reloc_root(rc, root);
1913 btrfs_put_root(root);
1914 if (ret) {
1915 if (list_empty(&reloc_root->root_list))
1916 list_add_tail(&reloc_root->root_list,
1917 &reloc_roots);
1918 goto out;
1919 }
1920 } else {
1921 if (!IS_ERR(root)) {
1922 if (root->reloc_root == reloc_root) {
1923 root->reloc_root = NULL;
1924 btrfs_put_root(reloc_root);
1925 }
1926 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
1927 &root->state);
1928 btrfs_put_root(root);
1929 }
1930
1931 list_del_init(&reloc_root->root_list);
1932 /* Don't forget to queue this reloc root for cleanup */
1933 list_add_tail(&reloc_root->reloc_dirty_list,
1934 &rc->dirty_subvol_roots);
1935 }
1936 }
1937
1938 if (found) {
1939 found = 0;
1940 goto again;
1941 }
1942 out:
1943 if (ret) {
1944 btrfs_handle_fs_error(fs_info, ret, NULL);
1945 free_reloc_roots(&reloc_roots);
1946
1947 /* new reloc root may be added */
1948 mutex_lock(&fs_info->reloc_mutex);
1949 list_splice_init(&rc->reloc_roots, &reloc_roots);
1950 mutex_unlock(&fs_info->reloc_mutex);
1951 free_reloc_roots(&reloc_roots);
1952 }
1953
1954 /*
1955 * We used to have
1956 *
1957 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1958 *
1959 * here, but it's wrong. If we fail to start the transaction in
1960 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
1961 * have actually been removed from the reloc_root_tree rb tree. This is
1962 * fine because we're bailing here, and we hold a reference on the root
1963 * for the list that holds it, so these roots will be cleaned up when we
1964 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
1965 * will be cleaned up on unmount.
1966 *
1967 * The remaining nodes will be cleaned up by free_reloc_control.
1968 */
1969 }
1970
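/*
 * Note on merge_reloc_roots() above: it re-splices rc->reloc_roots and
 * retries via the "again" label because btrfs_record_root_in_transaction()
 * can add new reloc roots to the list while a merge pass is running. The
 * loop only terminates once a pass finds the spliced list empty, meaning
 * no root was added concurrently.
 */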
1971 static void free_block_list(struct rb_root *blocks)
1972 {
1973 struct tree_block *block;
1974 struct rb_node *rb_node;
1975 while ((rb_node = rb_first(blocks))) {
1976 block = rb_entry(rb_node, struct tree_block, rb_node);
1977 rb_erase(rb_node, blocks);
1978 kfree(block);
1979 }
1980 }
1981
1982 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1983 struct btrfs_root *reloc_root)
1984 {
1985 struct btrfs_fs_info *fs_info = reloc_root->fs_info;
1986 struct btrfs_root *root;
1987 int ret;
1988
1989 if (reloc_root->last_trans == trans->transid)
1990 return 0;
1991
1992 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
1993 BUG_ON(IS_ERR(root));
1994 BUG_ON(root->reloc_root != reloc_root);
1995 ret = btrfs_record_root_in_trans(trans, root);
1996 btrfs_put_root(root);
1997
1998 return ret;
1999 }
2000
2001 static noinline_for_stack
2002 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2003 struct reloc_control *rc,
2004 struct btrfs_backref_node *node,
2005 struct btrfs_backref_edge *edges[])
2006 {
2007 struct btrfs_backref_node *next;
2008 struct btrfs_root *root;
2009 int index = 0;
2010
2011 next = node;
2012 while (1) {
2013 cond_resched();
2014 next = walk_up_backref(next, edges, &index);
2015 root = next->root;
2016 BUG_ON(!root);
2017 BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
2018
2019 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2020 record_reloc_root_in_trans(trans, root);
2021 break;
2022 }
2023
2024 btrfs_record_root_in_trans(trans, root);
2025 root = root->reloc_root;
2026
2027 if (next->new_bytenr != root->node->start) {
2028 BUG_ON(next->new_bytenr);
2029 BUG_ON(!list_empty(&next->list));
2030 next->new_bytenr = root->node->start;
2031 btrfs_put_root(next->root);
2032 next->root = btrfs_grab_root(root);
2033 ASSERT(next->root);
2034 list_add_tail(&next->list,
2035 &rc->backref_cache.changed);
2036 mark_block_processed(rc, next);
2037 break;
2038 }
2039
2040 WARN_ON(1);
2041 root = NULL;
2042 next = walk_down_backref(edges, &index);
2043 if (!next || next->level <= node->level)
2044 break;
2045 }
2046 if (!root)
2047 return NULL;
2048
2049 next = node;
2050 /* Set up the backref node path for btrfs_reloc_cow_block() */
2051 while (1) {
2052 rc->backref_cache.path[next->level] = next;
2053 if (--index < 0)
2054 break;
2055 next = edges[index]->node[UPPER];
2056 }
2057 return root;
2058 }
2059
2060 /*
2061 * Select a tree root for relocation.
2062 *
2063 * Return NULL if the block is not shareable. We should use do_relocation() in
2064 * this case.
2065 *
2066 * Return a tree root pointer if the block is shareable.
2067 * Return -ENOENT if the block is the root of a reloc tree.
2068 */
2069 static noinline_for_stack
2070 struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2071 {
2072 struct btrfs_backref_node *next;
2073 struct btrfs_root *root;
2074 struct btrfs_root *fs_root = NULL;
2075 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2076 int index = 0;
2077
2078 next = node;
2079 while (1) {
2080 cond_resched();
2081 next = walk_up_backref(next, edges, &index);
2082 root = next->root;
2083 BUG_ON(!root);
2084
2085 /* No other choice for non-shareable tree */
2086 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2087 return root;
2088
2089 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2090 fs_root = root;
2091
2092 if (next != node)
2093 return NULL;
2094
2095 next = walk_down_backref(edges, &index);
2096 if (!next || next->level <= node->level)
2097 break;
2098 }
2099
2100 if (!fs_root)
2101 return ERR_PTR(-ENOENT);
2102 return fs_root;
2103 }
2104
2105 static noinline_for_stack
2106 u64 calcu_metadata_size(struct reloc_control *rc,
2107 struct btrfs_backref_node *node, int reserve)
2108 {
2109 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2110 struct btrfs_backref_node *next = node;
2111 struct btrfs_backref_edge *edge;
2112 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2113 u64 num_bytes = 0;
2114 int index = 0;
2115
2116 BUG_ON(reserve && node->processed);
2117
2118 while (next) {
2119 cond_resched();
2120 while (1) {
2121 if (next->processed && (reserve || next != node))
2122 break;
2123
2124 num_bytes += fs_info->nodesize;
2125
2126 if (list_empty(&next->upper))
2127 break;
2128
2129 edge = list_entry(next->upper.next,
2130 struct btrfs_backref_edge, list[LOWER]);
2131 edges[index++] = edge;
2132 next = edge->node[UPPER];
2133 }
2134 next = walk_down_backref(edges, &index);
2135 }
2136 return num_bytes;
2137 }
2138
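/*
 * Rough worked example for calcu_metadata_size(), with illustrative numbers
 * only: relocating one leaf whose unprocessed ancestors form a single chain
 * up to level 3 counts four blocks (levels 0-3), i.e. 4 * nodesize = 64KiB
 * for a 16KiB nodesize. reserve_metadata_space() below doubles that,
 * presumably to cover COWing both the reloc tree and the fs tree.
 */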
2139 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2140 struct reloc_control *rc,
2141 struct btrfs_backref_node *node)
2142 {
2143 struct btrfs_root *root = rc->extent_root;
2144 struct btrfs_fs_info *fs_info = root->fs_info;
2145 u64 num_bytes;
2146 int ret;
2147 u64 tmp;
2148
2149 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2150
2151 trans->block_rsv = rc->block_rsv;
2152 rc->reserved_bytes += num_bytes;
2153
2154 /*
2155 * We are under a transaction here so we can only do limited flushing.
2156 * If we get an ENOSPC just kick back -EAGAIN so we know to drop the
2157 * transaction and try to refill when we can flush all the things.
2158 */
2159 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2160 BTRFS_RESERVE_FLUSH_LIMIT);
2161 if (ret) {
2162 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2163 while (tmp <= rc->reserved_bytes)
2164 tmp <<= 1;
2165 /*
2166 * Only one thread can access block_rsv at this point,
2167 * so we don't need to hold a lock to protect it.
2168 * We expand the reservation size here to allow enough
2169 * space for relocation, and we will return early in
2170 * the ENOSPC case.
2171 */
2172 rc->block_rsv->size = tmp + fs_info->nodesize *
2173 RELOCATION_RESERVED_NODES;
2174 return -EAGAIN;
2175 }
2176
2177 return 0;
2178 }
2179
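/*
 * Illustrative numbers for the ENOSPC path above, assuming a 16KiB nodesize:
 * the baseline is 16KiB * RELOCATION_RESERVED_NODES (256) = 4MiB. tmp
 * doubles until it exceeds rc->reserved_bytes, so with reserved_bytes at,
 * say, 6MiB the rsv size becomes 8MiB + 4MiB = 12MiB before -EAGAIN tells
 * the caller to drop the transaction and refill with full flushing.
 */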
2180 /*
2181 * Relocate a tree block, and then update pointers in upper level
2182 * blocks that reference the block to point to the new location.
2183 *
2184 * If called by link_to_upper, the block has already been relocated.
2185 * In that case this function just updates pointers.
2186 */
2187 static int do_relocation(struct btrfs_trans_handle *trans,
2188 struct reloc_control *rc,
2189 struct btrfs_backref_node *node,
2190 struct btrfs_key *key,
2191 struct btrfs_path *path, int lowest)
2192 {
2193 struct btrfs_backref_node *upper;
2194 struct btrfs_backref_edge *edge;
2195 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2196 struct btrfs_root *root;
2197 struct extent_buffer *eb;
2198 u32 blocksize;
2199 u64 bytenr;
2200 int slot;
2201 int ret = 0;
2202
2203 BUG_ON(lowest && node->eb);
2204
2205 path->lowest_level = node->level + 1;
2206 rc->backref_cache.path[node->level] = node;
2207 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2208 struct btrfs_ref ref = { 0 };
2209
2210 cond_resched();
2211
2212 upper = edge->node[UPPER];
2213 root = select_reloc_root(trans, rc, upper, edges);
2214 BUG_ON(!root);
2215
2216 if (upper->eb && !upper->locked) {
2217 if (!lowest) {
2218 ret = btrfs_bin_search(upper->eb, key, &slot);
2219 if (ret < 0)
2220 goto next;
2221 BUG_ON(ret);
2222 bytenr = btrfs_node_blockptr(upper->eb, slot);
2223 if (node->eb->start == bytenr)
2224 goto next;
2225 }
2226 btrfs_backref_drop_node_buffer(upper);
2227 }
2228
2229 if (!upper->eb) {
2230 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2231 if (ret) {
2232 if (ret > 0)
2233 ret = -ENOENT;
2234
2235 btrfs_release_path(path);
2236 break;
2237 }
2238
2239 if (!upper->eb) {
2240 upper->eb = path->nodes[upper->level];
2241 path->nodes[upper->level] = NULL;
2242 } else {
2243 BUG_ON(upper->eb != path->nodes[upper->level]);
2244 }
2245
2246 upper->locked = 1;
2247 path->locks[upper->level] = 0;
2248
2249 slot = path->slots[upper->level];
2250 btrfs_release_path(path);
2251 } else {
2252 ret = btrfs_bin_search(upper->eb, key, &slot);
2253 if (ret < 0)
2254 goto next;
2255 BUG_ON(ret);
2256 }
2257
2258 bytenr = btrfs_node_blockptr(upper->eb, slot);
2259 if (lowest) {
2260 if (bytenr != node->bytenr) {
2261 btrfs_err(root->fs_info,
2262 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2263 bytenr, node->bytenr, slot,
2264 upper->eb->start);
2265 ret = -EIO;
2266 goto next;
2267 }
2268 } else {
2269 if (node->eb->start == bytenr)
2270 goto next;
2271 }
2272
2273 blocksize = root->fs_info->nodesize;
2274 eb = btrfs_read_node_slot(upper->eb, slot);
2275 if (IS_ERR(eb)) {
2276 ret = PTR_ERR(eb);
2277 goto next;
2278 }
2279 btrfs_tree_lock(eb);
2280
2281 if (!node->eb) {
2282 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2283 slot, &eb, BTRFS_NESTING_COW);
2284 btrfs_tree_unlock(eb);
2285 free_extent_buffer(eb);
2286 if (ret < 0)
2287 goto next;
2288 BUG_ON(node->eb != eb);
2289 } else {
2290 btrfs_set_node_blockptr(upper->eb, slot,
2291 node->eb->start);
2292 btrfs_set_node_ptr_generation(upper->eb, slot,
2293 trans->transid);
2294 btrfs_mark_buffer_dirty(upper->eb);
2295
2296 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2297 node->eb->start, blocksize,
2298 upper->eb->start);
2299 ref.real_root = root->root_key.objectid;
2300 btrfs_init_tree_ref(&ref, node->level,
2301 btrfs_header_owner(upper->eb));
2302 ret = btrfs_inc_extent_ref(trans, &ref);
2303 BUG_ON(ret);
2304
2305 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2306 BUG_ON(ret);
2307 }
2308 next:
2309 if (!upper->pending)
2310 btrfs_backref_drop_node_buffer(upper);
2311 else
2312 btrfs_backref_unlock_node_buffer(upper);
2313 if (ret)
2314 break;
2315 }
2316
2317 if (!ret && node->pending) {
2318 btrfs_backref_drop_node_buffer(node);
2319 list_move_tail(&node->list, &rc->backref_cache.changed);
2320 node->pending = 0;
2321 }
2322
2323 path->lowest_level = 0;
2324 BUG_ON(ret == -ENOSPC);
2325 return ret;
2326 }
2327
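/*
 * In short, do_relocation() runs in two modes: with 'lowest' set the target
 * block itself gets COWed into its new location via btrfs_cow_block();
 * otherwise (the link_to_upper() case) the block was already relocated, so
 * only the parent pointer is rewritten, a delayed ref is added for the new
 * location, and btrfs_drop_subtree() drops the old subtree.
 */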
2328 static int link_to_upper(struct btrfs_trans_handle *trans,
2329 struct reloc_control *rc,
2330 struct btrfs_backref_node *node,
2331 struct btrfs_path *path)
2332 {
2333 struct btrfs_key key;
2334
2335 btrfs_node_key_to_cpu(node->eb, &key, 0);
2336 return do_relocation(trans, rc, node, &key, path, 0);
2337 }
2338
2339 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2340 struct reloc_control *rc,
2341 struct btrfs_path *path, int err)
2342 {
2343 LIST_HEAD(list);
2344 struct btrfs_backref_cache *cache = &rc->backref_cache;
2345 struct btrfs_backref_node *node;
2346 int level;
2347 int ret;
2348
2349 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2350 while (!list_empty(&cache->pending[level])) {
2351 node = list_entry(cache->pending[level].next,
2352 struct btrfs_backref_node, list);
2353 list_move_tail(&node->list, &list);
2354 BUG_ON(!node->pending);
2355
2356 if (!err) {
2357 ret = link_to_upper(trans, rc, node, path);
2358 if (ret < 0)
2359 err = ret;
2360 }
2361 }
2362 list_splice_init(&list, &cache->pending[level]);
2363 }
2364 return err;
2365 }
2366
2367 /*
2368 * Mark a block and all blocks that directly/indirectly reference the
2369 * block as processed.
2370 */
2371 static void update_processed_blocks(struct reloc_control *rc,
2372 struct btrfs_backref_node *node)
2373 {
2374 struct btrfs_backref_node *next = node;
2375 struct btrfs_backref_edge *edge;
2376 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2377 int index = 0;
2378
2379 while (next) {
2380 cond_resched();
2381 while (1) {
2382 if (next->processed)
2383 break;
2384
2385 mark_block_processed(rc, next);
2386
2387 if (list_empty(&next->upper))
2388 break;
2389
2390 edge = list_entry(next->upper.next,
2391 struct btrfs_backref_edge, list[LOWER]);
2392 edges[index++] = edge;
2393 next = edge->node[UPPER];
2394 }
2395 next = walk_down_backref(edges, &index);
2396 }
2397 }
2398
2399 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2400 {
2401 u32 blocksize = rc->extent_root->fs_info->nodesize;
2402
2403 if (test_range_bit(&rc->processed_blocks, bytenr,
2404 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2405 return 1;
2406 return 0;
2407 }
2408
2409 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2410 struct tree_block *block)
2411 {
2412 struct extent_buffer *eb;
2413
2414 eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset,
2415 block->level, NULL);
2416 if (IS_ERR(eb)) {
2417 return PTR_ERR(eb);
2418 } else if (!extent_buffer_uptodate(eb)) {
2419 free_extent_buffer(eb);
2420 return -EIO;
2421 }
2422 if (block->level == 0)
2423 btrfs_item_key_to_cpu(eb, &block->key, 0);
2424 else
2425 btrfs_node_key_to_cpu(eb, &block->key, 0);
2426 free_extent_buffer(eb);
2427 block->key_ready = 1;
2428 return 0;
2429 }
2430
2431 /*
2432 * helper function to relocate a tree block
2433 */
2434 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2436 struct btrfs_backref_node *node,
2437 struct btrfs_key *key,
2438 struct btrfs_path *path)
2439 {
2440 struct btrfs_root *root;
2441 int ret = 0;
2442
2443 if (!node)
2444 return 0;
2445
2446 /*
2447 * If we fail here we want to drop our backref_node because we are going
2448 * to start over and regenerate the tree for it.
2449 */
2450 ret = reserve_metadata_space(trans, rc, node);
2451 if (ret)
2452 goto out;
2453
2454 BUG_ON(node->processed);
2455 root = select_one_root(node);
2456 if (root == ERR_PTR(-ENOENT)) {
2457 update_processed_blocks(rc, node);
2458 goto out;
2459 }
2460
2461 if (root) {
2462 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2463 BUG_ON(node->new_bytenr);
2464 BUG_ON(!list_empty(&node->list));
2465 btrfs_record_root_in_trans(trans, root);
2466 root = root->reloc_root;
2467 node->new_bytenr = root->node->start;
2468 btrfs_put_root(node->root);
2469 node->root = btrfs_grab_root(root);
2470 ASSERT(node->root);
2471 list_add_tail(&node->list, &rc->backref_cache.changed);
2472 } else {
2473 path->lowest_level = node->level;
2474 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2475 btrfs_release_path(path);
2476 if (ret > 0)
2477 ret = 0;
2478 }
2479 if (!ret)
2480 update_processed_blocks(rc, node);
2481 } else {
2482 ret = do_relocation(trans, rc, node, key, path, 1);
2483 }
2484 out:
2485 if (ret || node->level == 0 || node->cowonly)
2486 btrfs_backref_cleanup_node(&rc->backref_cache, node);
2487 return ret;
2488 }
2489
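/*
 * Note: relocate_tree_block() dispatches on select_one_root(). -ENOENT
 * means the block is the root of a reloc tree and only needs to be marked
 * processed. A shareable root means the block is that tree's root node, so
 * the reloc root's new bytenr is recorded in the backref node. For a
 * non-shareable tree the path is simply re-searched (COWing the block as a
 * side effect), and NULL sends the shared block through do_relocation().
 */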
2490 /*
2491 * relocate a list of blocks
2492 */
2493 static noinline_for_stack
2494 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2495 struct reloc_control *rc, struct rb_root *blocks)
2496 {
2497 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2498 struct btrfs_backref_node *node;
2499 struct btrfs_path *path;
2500 struct tree_block *block;
2501 struct tree_block *next;
2502 int ret;
2503 int err = 0;
2504
2505 path = btrfs_alloc_path();
2506 if (!path) {
2507 err = -ENOMEM;
2508 goto out_free_blocks;
2509 }
2510
2511 /* Kick in readahead for tree blocks with missing keys */
2512 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2513 if (!block->key_ready)
2514 btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0,
2515 block->level);
2516 }
2517
2518 /* Get first keys */
2519 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2520 if (!block->key_ready) {
2521 err = get_tree_block_key(fs_info, block);
2522 if (err)
2523 goto out_free_path;
2524 }
2525 }
2526
2527 /* Do tree relocation */
2528 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2529 node = build_backref_tree(rc, &block->key,
2530 block->level, block->bytenr);
2531 if (IS_ERR(node)) {
2532 err = PTR_ERR(node);
2533 goto out;
2534 }
2535
2536 ret = relocate_tree_block(trans, rc, node, &block->key,
2537 path);
2538 if (ret < 0) {
2539 err = ret;
2540 break;
2541 }
2542 }
2543 out:
2544 err = finish_pending_nodes(trans, rc, path, err);
2545
2546 out_free_path:
2547 btrfs_free_path(path);
2548 out_free_blocks:
2549 free_block_list(blocks);
2550 return err;
2551 }
2552
2553 static noinline_for_stack int prealloc_file_extent_cluster(
2554 struct btrfs_inode *inode,
2555 struct file_extent_cluster *cluster)
2556 {
2557 u64 alloc_hint = 0;
2558 u64 start;
2559 u64 end;
2560 u64 offset = inode->index_cnt;
2561 u64 num_bytes;
2562 int nr;
2563 int ret = 0;
2564 u64 prealloc_start = cluster->start - offset;
2565 u64 prealloc_end = cluster->end - offset;
2566 u64 cur_offset = prealloc_start;
2567
2568 BUG_ON(cluster->start != cluster->boundary[0]);
2569 ret = btrfs_alloc_data_chunk_ondemand(inode,
2570 prealloc_end + 1 - prealloc_start);
2571 if (ret)
2572 return ret;
2573
2574 inode_lock(&inode->vfs_inode);
2575 for (nr = 0; nr < cluster->nr; nr++) {
2576 start = cluster->boundary[nr] - offset;
2577 if (nr + 1 < cluster->nr)
2578 end = cluster->boundary[nr + 1] - 1 - offset;
2579 else
2580 end = cluster->end - offset;
2581
2582 lock_extent(&inode->io_tree, start, end);
2583 num_bytes = end + 1 - start;
2584 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2585 num_bytes, num_bytes,
2586 end + 1, &alloc_hint);
2587 cur_offset = end + 1;
2588 unlock_extent(&inode->io_tree, start, end);
2589 if (ret)
2590 break;
2591 }
2592 inode_unlock(&inode->vfs_inode);
2593
2594 if (cur_offset < prealloc_end)
2595 btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
2596 prealloc_end + 1 - cur_offset);
2597 return ret;
2598 }
2599
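/*
 * Example of the bytenr -> file offset mapping used above: index_cnt is set
 * to the block group's start (see create_reloc_inode() below), so a cluster
 * covering bytenr range [start, end] is preallocated at file offsets
 * [start - index_cnt, end - index_cnt] in the data reloc inode, with each
 * cluster->boundary[] entry translated the same way.
 */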
2600 static noinline_for_stack
2601 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2602 u64 block_start)
2603 {
2604 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2605 struct extent_map *em;
2606 int ret = 0;
2607
2608 em = alloc_extent_map();
2609 if (!em)
2610 return -ENOMEM;
2611
2612 em->start = start;
2613 em->len = end + 1 - start;
2614 em->block_len = em->len;
2615 em->block_start = block_start;
2616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2617
2618 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2619 while (1) {
2620 write_lock(&em_tree->lock);
2621 ret = add_extent_mapping(em_tree, em, 0);
2622 write_unlock(&em_tree->lock);
2623 if (ret != -EEXIST) {
2624 free_extent_map(em);
2625 break;
2626 }
2627 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
2628 }
2629 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2630 return ret;
2631 }
2632
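/*
 * Note: EXTENT_FLAG_PINNED marks the mapping above so it is not merged or
 * dropped while pages in the range are read and dirtied. If an overlapping
 * mapping already exists, add_extent_mapping() returns -EEXIST and the loop
 * drops the cached range and retries until the insert succeeds.
 */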
2633 /*
2634 * Allow error injection to test balance cancellation
2635 */
2636 int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
2637 {
2638 return atomic_read(&fs_info->balance_cancel_req) ||
2639 fatal_signal_pending(current);
2640 }
2641 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2642
2643 static int relocate_file_extent_cluster(struct inode *inode,
2644 struct file_extent_cluster *cluster)
2645 {
2646 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2647 u64 page_start;
2648 u64 page_end;
2649 u64 offset = BTRFS_I(inode)->index_cnt;
2650 unsigned long index;
2651 unsigned long last_index;
2652 struct page *page;
2653 struct file_ra_state *ra;
2654 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2655 int nr = 0;
2656 int ret = 0;
2657
2658 if (!cluster->nr)
2659 return 0;
2660
2661 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2662 if (!ra)
2663 return -ENOMEM;
2664
2665 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
2666 if (ret)
2667 goto out;
2668
2669 file_ra_state_init(ra, inode->i_mapping);
2670
2671 ret = setup_extent_mapping(inode, cluster->start - offset,
2672 cluster->end - offset, cluster->start);
2673 if (ret)
2674 goto out;
2675
2676 index = (cluster->start - offset) >> PAGE_SHIFT;
2677 last_index = (cluster->end - offset) >> PAGE_SHIFT;
2678 while (index <= last_index) {
2679 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2680 PAGE_SIZE);
2681 if (ret)
2682 goto out;
2683
2684 page = find_lock_page(inode->i_mapping, index);
2685 if (!page) {
2686 page_cache_sync_readahead(inode->i_mapping,
2687 ra, NULL, index,
2688 last_index + 1 - index);
2689 page = find_or_create_page(inode->i_mapping, index,
2690 mask);
2691 if (!page) {
2692 btrfs_delalloc_release_metadata(BTRFS_I(inode),
2693 PAGE_SIZE, true);
2694 btrfs_delalloc_release_extents(BTRFS_I(inode),
2695 PAGE_SIZE);
2696 ret = -ENOMEM;
2697 goto out;
2698 }
2699 }
2700
2701 if (PageReadahead(page)) {
2702 page_cache_async_readahead(inode->i_mapping,
2703 ra, NULL, page, index,
2704 last_index + 1 - index);
2705 }
2706
2707 if (!PageUptodate(page)) {
2708 btrfs_readpage(NULL, page);
2709 lock_page(page);
2710 if (!PageUptodate(page)) {
2711 unlock_page(page);
2712 put_page(page);
2713 btrfs_delalloc_release_metadata(BTRFS_I(inode),
2714 PAGE_SIZE, true);
2715 btrfs_delalloc_release_extents(BTRFS_I(inode),
2716 PAGE_SIZE);
2717 ret = -EIO;
2718 goto out;
2719 }
2720 }
2721
2722 page_start = page_offset(page);
2723 page_end = page_start + PAGE_SIZE - 1;
2724
2725 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
2726
2727 set_page_extent_mapped(page);
2728
2729 if (nr < cluster->nr &&
2730 page_start + offset == cluster->boundary[nr]) {
2731 set_extent_bits(&BTRFS_I(inode)->io_tree,
2732 page_start, page_end,
2733 EXTENT_BOUNDARY);
2734 nr++;
2735 }
2736
2737 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
2738 page_end, 0, NULL);
2739 if (ret) {
2740 unlock_page(page);
2741 put_page(page);
2742 btrfs_delalloc_release_metadata(BTRFS_I(inode),
2743 PAGE_SIZE, true);
2744 btrfs_delalloc_release_extents(BTRFS_I(inode),
2745 PAGE_SIZE);
2746
2747 clear_extent_bits(&BTRFS_I(inode)->io_tree,
2748 page_start, page_end,
2749 EXTENT_LOCKED | EXTENT_BOUNDARY);
2750 goto out;
2751
2752 }
2753 set_page_dirty(page);
2754
2755 unlock_extent(&BTRFS_I(inode)->io_tree,
2756 page_start, page_end);
2757 unlock_page(page);
2758 put_page(page);
2759
2760 index++;
2761 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2762 balance_dirty_pages_ratelimited(inode->i_mapping);
2763 btrfs_throttle(fs_info);
2764 if (btrfs_should_cancel_balance(fs_info)) {
2765 ret = -ECANCELED;
2766 goto out;
2767 }
2768 }
2769 WARN_ON(nr != cluster->nr);
2770 out:
2771 kfree(ra);
2772 return ret;
2773 }
2774
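/*
 * Note on the per-page accounting in relocate_file_extent_cluster(): every
 * iteration reserves delalloc metadata for one page, and each error path
 * must release both the metadata and the extent reservation. On success
 * only btrfs_delalloc_release_extents() is called; the metadata reservation
 * effectively travels with the dirtied page until writeback completes.
 */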
2775 static noinline_for_stack
2776 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2777 struct file_extent_cluster *cluster)
2778 {
2779 int ret;
2780
2781 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2782 ret = relocate_file_extent_cluster(inode, cluster);
2783 if (ret)
2784 return ret;
2785 cluster->nr = 0;
2786 }
2787
2788 if (!cluster->nr)
2789 cluster->start = extent_key->objectid;
2790 else
2791 BUG_ON(cluster->nr >= MAX_EXTENTS);
2792 cluster->end = extent_key->objectid + extent_key->offset - 1;
2793 cluster->boundary[cluster->nr] = extent_key->objectid;
2794 cluster->nr++;
2795
2796 if (cluster->nr >= MAX_EXTENTS) {
2797 ret = relocate_file_extent_cluster(inode, cluster);
2798 if (ret)
2799 return ret;
2800 cluster->nr = 0;
2801 }
2802 return 0;
2803 }
2804
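/*
 * Clustering example for relocate_data_extent(): data extents whose bytenrs
 * are contiguous (extent_key->objectid == cluster->end + 1) are batched into
 * a single cluster of up to MAX_EXTENTS (128) boundaries. A gap between
 * extents, or a full cluster, triggers relocate_file_extent_cluster() and
 * starts a new cluster.
 */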
2805 /*
2806 * Helper to add a tree block to the list.
2807 * The major work is getting the generation and level of the block.
2808 */
2809 static int add_tree_block(struct reloc_control *rc,
2810 struct btrfs_key *extent_key,
2811 struct btrfs_path *path,
2812 struct rb_root *blocks)
2813 {
2814 struct extent_buffer *eb;
2815 struct btrfs_extent_item *ei;
2816 struct btrfs_tree_block_info *bi;
2817 struct tree_block *block;
2818 struct rb_node *rb_node;
2819 u32 item_size;
2820 int level = -1;
2821 u64 generation;
2822
2823 eb = path->nodes[0];
2824 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2825
2826 if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
2827 item_size >= sizeof(*ei) + sizeof(*bi)) {
2828 ei = btrfs_item_ptr(eb, path->slots[0],
2829 struct btrfs_extent_item);
2830 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
2831 bi = (struct btrfs_tree_block_info *)(ei + 1);
2832 level = btrfs_tree_block_level(eb, bi);
2833 } else {
2834 level = (int)extent_key->offset;
2835 }
2836 generation = btrfs_extent_generation(eb, ei);
2837 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
2838 btrfs_print_v0_err(eb->fs_info);
2839 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
2840 return -EINVAL;
2841 } else {
2842 BUG();
2843 }
2844
2845 btrfs_release_path(path);
2846
2847 BUG_ON(level == -1);
2848
2849 block = kmalloc(sizeof(*block), GFP_NOFS);
2850 if (!block)
2851 return -ENOMEM;
2852
2853 block->bytenr = extent_key->objectid;
2854 block->key.objectid = rc->extent_root->fs_info->nodesize;
2855 block->key.offset = generation;
2856 block->level = level;
2857 block->key_ready = 0;
2858
2859 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
2860 if (rb_node)
2861 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
2862 -EEXIST);
2863
2864 return 0;
2865 }
2866
2867 /*
2868 * Helper to add tree blocks for backrefs of type BTRFS_SHARED_DATA_REF_KEY.
2869 */
2870 static int __add_tree_block(struct reloc_control *rc,
2871 u64 bytenr, u32 blocksize,
2872 struct rb_root *blocks)
2873 {
2874 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2875 struct btrfs_path *path;
2876 struct btrfs_key key;
2877 int ret;
2878 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2879
2880 if (tree_block_processed(bytenr, rc))
2881 return 0;
2882
2883 if (rb_simple_search(blocks, bytenr))
2884 return 0;
2885
2886 path = btrfs_alloc_path();
2887 if (!path)
2888 return -ENOMEM;
2889 again:
2890 key.objectid = bytenr;
2891 if (skinny) {
2892 key.type = BTRFS_METADATA_ITEM_KEY;
2893 key.offset = (u64)-1;
2894 } else {
2895 key.type = BTRFS_EXTENT_ITEM_KEY;
2896 key.offset = blocksize;
2897 }
2898
2899 path->search_commit_root = 1;
2900 path->skip_locking = 1;
2901 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2902 if (ret < 0)
2903 goto out;
2904
2905 if (ret > 0 && skinny) {
2906 if (path->slots[0]) {
2907 path->slots[0]--;
2908 btrfs_item_key_to_cpu(path->nodes[0], &key,
2909 path->slots[0]);
2910 if (key.objectid == bytenr &&
2911 (key.type == BTRFS_METADATA_ITEM_KEY ||
2912 (key.type == BTRFS_EXTENT_ITEM_KEY &&
2913 key.offset == blocksize)))
2914 ret = 0;
2915 }
2916
2917 if (ret) {
2918 skinny = false;
2919 btrfs_release_path(path);
2920 goto again;
2921 }
2922 }
2923 if (ret) {
2924 ASSERT(ret == 1);
2925 btrfs_print_leaf(path->nodes[0]);
2926 btrfs_err(fs_info,
2927 "tree block extent item (%llu) is not found in extent tree",
2928 bytenr);
2929 WARN_ON(1);
2930 ret = -EINVAL;
2931 goto out;
2932 }
2933
2934 ret = add_tree_block(rc, &key, path, blocks);
2935 out:
2936 btrfs_free_path(path);
2937 return ret;
2938 }
2939
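/*
 * Note on the search in __add_tree_block(): with SKINNY_METADATA the extent
 * item is keyed (bytenr, BTRFS_METADATA_ITEM_KEY, level), so the lookup
 * starts at offset (u64)-1 and steps back one slot. If that misses (e.g. an
 * old non-skinny item), the search is retried with BTRFS_EXTENT_ITEM_KEY
 * and offset == blocksize.
 */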
2940 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
2941 struct btrfs_block_group *block_group,
2942 struct inode *inode,
2943 u64 ino)
2944 {
2945 struct btrfs_root *root = fs_info->tree_root;
2946 struct btrfs_trans_handle *trans;
2947 int ret = 0;
2948
2949 if (inode)
2950 goto truncate;
2951
2952 inode = btrfs_iget(fs_info->sb, ino, root);
2953 if (IS_ERR(inode))
2954 return -ENOENT;
2955
2956 truncate:
2957 ret = btrfs_check_trunc_cache_free_space(fs_info,
2958 &fs_info->global_block_rsv);
2959 if (ret)
2960 goto out;
2961
2962 trans = btrfs_join_transaction(root);
2963 if (IS_ERR(trans)) {
2964 ret = PTR_ERR(trans);
2965 goto out;
2966 }
2967
2968 ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
2969
2970 btrfs_end_transaction(trans);
2971 btrfs_btree_balance_dirty(fs_info);
2972 out:
2973 iput(inode);
2974 return ret;
2975 }
2976
2977 /*
2978 * Locate the free space cache EXTENT_DATA in the root tree leaf and delete the
2979 * cache inode, so the free space cache data extent does not block data relocation.
2980 */
2981 static int delete_v1_space_cache(struct extent_buffer *leaf,
2982 struct btrfs_block_group *block_group,
2983 u64 data_bytenr)
2984 {
2985 u64 space_cache_ino;
2986 struct btrfs_file_extent_item *ei;
2987 struct btrfs_key key;
2988 bool found = false;
2989 int i;
2990 int ret;
2991
2992 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
2993 return 0;
2994
2995 for (i = 0; i < btrfs_header_nritems(leaf); i++) {
2996 u8 type;
2997
2998 btrfs_item_key_to_cpu(leaf, &key, i);
2999 if (key.type != BTRFS_EXTENT_DATA_KEY)
3000 continue;
3001 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3002 type = btrfs_file_extent_type(leaf, ei);
3003
3004 if ((type == BTRFS_FILE_EXTENT_REG ||
3005 type == BTRFS_FILE_EXTENT_PREALLOC) &&
3006 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3007 found = true;
3008 space_cache_ino = key.objectid;
3009 break;
3010 }
3011 }
3012 if (!found)
3013 return -ENOENT;
3014 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3015 space_cache_ino);
3016 return ret;
3017 }
3018
3019 /*
3020 * helper to find all tree blocks that reference a given data extent
3021 */
3022 static noinline_for_stack
3023 int add_data_references(struct reloc_control *rc,
3024 struct btrfs_key *extent_key,
3025 struct btrfs_path *path,
3026 struct rb_root *blocks)
3027 {
3028 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3029 struct ulist *leaves = NULL;
3030 struct ulist_iterator leaf_uiter;
3031 struct ulist_node *ref_node = NULL;
3032 const u32 blocksize = fs_info->nodesize;
3033 int ret = 0;
3034
3035 btrfs_release_path(path);
3036 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3037 0, &leaves, NULL, true);
3038 if (ret < 0)
3039 return ret;
3040
3041 ULIST_ITER_INIT(&leaf_uiter);
3042 while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3043 struct extent_buffer *eb;
3044
3045 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
3046 if (IS_ERR(eb)) {
3047 ret = PTR_ERR(eb);
3048 break;
3049 }
3050 ret = delete_v1_space_cache(eb, rc->block_group,
3051 extent_key->objectid);
3052 free_extent_buffer(eb);
3053 if (ret < 0)
3054 break;
3055 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3056 if (ret < 0)
3057 break;
3058 }
3059 if (ret < 0)
3060 free_block_list(blocks);
3061 ulist_free(leaves);
3062 return ret;
3063 }
3064
3065 /*
3066 * Helper to find the next unprocessed extent.
3067 */
3068 static noinline_for_stack
3069 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3070 struct btrfs_key *extent_key)
3071 {
3072 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3073 struct btrfs_key key;
3074 struct extent_buffer *leaf;
3075 u64 start, end, last;
3076 int ret;
3077
3078 last = rc->block_group->start + rc->block_group->length;
3079 while (1) {
3080 cond_resched();
3081 if (rc->search_start >= last) {
3082 ret = 1;
3083 break;
3084 }
3085
3086 key.objectid = rc->search_start;
3087 key.type = BTRFS_EXTENT_ITEM_KEY;
3088 key.offset = 0;
3089
3090 path->search_commit_root = 1;
3091 path->skip_locking = 1;
3092 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3093 0, 0);
3094 if (ret < 0)
3095 break;
3096 next:
3097 leaf = path->nodes[0];
3098 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3099 ret = btrfs_next_leaf(rc->extent_root, path);
3100 if (ret != 0)
3101 break;
3102 leaf = path->nodes[0];
3103 }
3104
3105 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3106 if (key.objectid >= last) {
3107 ret = 1;
3108 break;
3109 }
3110
3111 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3112 key.type != BTRFS_METADATA_ITEM_KEY) {
3113 path->slots[0]++;
3114 goto next;
3115 }
3116
3117 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3118 key.objectid + key.offset <= rc->search_start) {
3119 path->slots[0]++;
3120 goto next;
3121 }
3122
3123 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3124 key.objectid + fs_info->nodesize <=
3125 rc->search_start) {
3126 path->slots[0]++;
3127 goto next;
3128 }
3129
3130 ret = find_first_extent_bit(&rc->processed_blocks,
3131 key.objectid, &start, &end,
3132 EXTENT_DIRTY, NULL);
3133
3134 if (ret == 0 && start <= key.objectid) {
3135 btrfs_release_path(path);
3136 rc->search_start = end + 1;
3137 } else {
3138 if (key.type == BTRFS_EXTENT_ITEM_KEY)
3139 rc->search_start = key.objectid + key.offset;
3140 else
3141 rc->search_start = key.objectid +
3142 fs_info->nodesize;
3143 memcpy(extent_key, &key, sizeof(key));
3144 return 0;
3145 }
3146 }
3147 btrfs_release_path(path);
3148 return ret;
3149 }
3150
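/*
 * Note: find_next_extent() skips any range already marked EXTENT_DIRTY in
 * rc->processed_blocks, so when relocate_block_group() rewinds
 * rc->search_start after an -EAGAIN, extents relocated earlier are not
 * visited again.
 */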
3151 static void set_reloc_control(struct reloc_control *rc)
3152 {
3153 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3154
3155 mutex_lock(&fs_info->reloc_mutex);
3156 fs_info->reloc_ctl = rc;
3157 mutex_unlock(&fs_info->reloc_mutex);
3158 }
3159
3160 static void unset_reloc_control(struct reloc_control *rc)
3161 {
3162 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3163
3164 mutex_lock(&fs_info->reloc_mutex);
3165 fs_info->reloc_ctl = NULL;
3166 mutex_unlock(&fs_info->reloc_mutex);
3167 }
3168
3169 static int check_extent_flags(u64 flags)
3170 {
3171 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3172 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3173 return 1;
3174 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3175 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3176 return 1;
3177 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3178 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3179 return 1;
3180 return 0;
3181 }
3182
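/*
 * check_extent_flags() returns 1 for impossible combinations: DATA and
 * TREE_BLOCK set together, neither of them set, or DATA combined with
 * FULL_BACKREF (which only makes sense for tree blocks). The caller treats
 * a non-zero return as corruption.
 */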
3183 static noinline_for_stack
3184 int prepare_to_relocate(struct reloc_control *rc)
3185 {
3186 struct btrfs_trans_handle *trans;
3187 int ret;
3188
3189 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3190 BTRFS_BLOCK_RSV_TEMP);
3191 if (!rc->block_rsv)
3192 return -ENOMEM;
3193
3194 memset(&rc->cluster, 0, sizeof(rc->cluster));
3195 rc->search_start = rc->block_group->start;
3196 rc->extents_found = 0;
3197 rc->nodes_relocated = 0;
3198 rc->merging_rsv_size = 0;
3199 rc->reserved_bytes = 0;
3200 rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3201 RELOCATION_RESERVED_NODES;
3202 ret = btrfs_block_rsv_refill(rc->extent_root,
3203 rc->block_rsv, rc->block_rsv->size,
3204 BTRFS_RESERVE_FLUSH_ALL);
3205 if (ret)
3206 return ret;
3207
3208 rc->create_reloc_tree = 1;
3209 set_reloc_control(rc);
3210
3211 trans = btrfs_join_transaction(rc->extent_root);
3212 if (IS_ERR(trans)) {
3213 unset_reloc_control(rc);
3214 /*
3215 * The extent tree is not a ref_cow tree and has no reloc_root to
3216 * clean up. And callers are responsible for freeing the above
3217 * block rsv.
3218 */
3219 return PTR_ERR(trans);
3220 }
3221 btrfs_commit_transaction(trans);
3222 return 0;
3223 }
3224
3225 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3226 {
3227 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3228 struct rb_root blocks = RB_ROOT;
3229 struct btrfs_key key;
3230 struct btrfs_trans_handle *trans = NULL;
3231 struct btrfs_path *path;
3232 struct btrfs_extent_item *ei;
3233 u64 flags;
3234 u32 item_size;
3235 int ret;
3236 int err = 0;
3237 int progress = 0;
3238
3239 path = btrfs_alloc_path();
3240 if (!path)
3241 return -ENOMEM;
3242 path->reada = READA_FORWARD;
3243
3244 ret = prepare_to_relocate(rc);
3245 if (ret) {
3246 err = ret;
3247 goto out_free;
3248 }
3249
3250 while (1) {
3251 rc->reserved_bytes = 0;
3252 ret = btrfs_block_rsv_refill(rc->extent_root,
3253 rc->block_rsv, rc->block_rsv->size,
3254 BTRFS_RESERVE_FLUSH_ALL);
3255 if (ret) {
3256 err = ret;
3257 break;
3258 }
3259 progress++;
3260 trans = btrfs_start_transaction(rc->extent_root, 0);
3261 if (IS_ERR(trans)) {
3262 err = PTR_ERR(trans);
3263 trans = NULL;
3264 break;
3265 }
3266 restart:
3267 if (update_backref_cache(trans, &rc->backref_cache)) {
3268 btrfs_end_transaction(trans);
3269 trans = NULL;
3270 continue;
3271 }
3272
3273 ret = find_next_extent(rc, path, &key);
3274 if (ret < 0)
3275 err = ret;
3276 if (ret != 0)
3277 break;
3278
3279 rc->extents_found++;
3280
3281 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3282 struct btrfs_extent_item);
3283 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3284 if (item_size >= sizeof(*ei)) {
3285 flags = btrfs_extent_flags(path->nodes[0], ei);
3286 ret = check_extent_flags(flags);
3287 BUG_ON(ret);
3288 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3289 err = -EINVAL;
3290 btrfs_print_v0_err(trans->fs_info);
3291 btrfs_abort_transaction(trans, err);
3292 break;
3293 } else {
3294 BUG();
3295 }
3296
3297 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3298 ret = add_tree_block(rc, &key, path, &blocks);
3299 } else if (rc->stage == UPDATE_DATA_PTRS &&
3300 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3301 ret = add_data_references(rc, &key, path, &blocks);
3302 } else {
3303 btrfs_release_path(path);
3304 ret = 0;
3305 }
3306 if (ret < 0) {
3307 err = ret;
3308 break;
3309 }
3310
3311 if (!RB_EMPTY_ROOT(&blocks)) {
3312 ret = relocate_tree_blocks(trans, rc, &blocks);
3313 if (ret < 0) {
3314 if (ret != -EAGAIN) {
3315 err = ret;
3316 break;
3317 }
3318 rc->extents_found--;
3319 rc->search_start = key.objectid;
3320 }
3321 }
3322
3323 btrfs_end_transaction_throttle(trans);
3324 btrfs_btree_balance_dirty(fs_info);
3325 trans = NULL;
3326
3327 if (rc->stage == MOVE_DATA_EXTENTS &&
3328 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3329 rc->found_file_extent = 1;
3330 ret = relocate_data_extent(rc->data_inode,
3331 &key, &rc->cluster);
3332 if (ret < 0) {
3333 err = ret;
3334 break;
3335 }
3336 }
3337 if (btrfs_should_cancel_balance(fs_info)) {
3338 err = -ECANCELED;
3339 break;
3340 }
3341 }
3342 if (trans && progress && err == -ENOSPC) {
3343 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3344 if (ret == 1) {
3345 err = 0;
3346 progress = 0;
3347 goto restart;
3348 }
3349 }
3350
3351 btrfs_release_path(path);
3352 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3353
3354 if (trans) {
3355 btrfs_end_transaction_throttle(trans);
3356 btrfs_btree_balance_dirty(fs_info);
3357 }
3358
3359 if (!err) {
3360 ret = relocate_file_extent_cluster(rc->data_inode,
3361 &rc->cluster);
3362 if (ret < 0)
3363 err = ret;
3364 }
3365
3366 rc->create_reloc_tree = 0;
3367 set_reloc_control(rc);
3368
3369 btrfs_backref_release_cache(&rc->backref_cache);
3370 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3371
3372 /*
3373 * Even in the case when the relocation is cancelled, we should still go
3374 * through prepare_to_merge() and merge_reloc_roots().
3375 *
3376 * On error (including a cancelled balance), prepare_to_merge() will
3377 * mark all reloc trees orphan, then queue them for cleanup in
3378 * merge_reloc_roots().
3379 */
3380 err = prepare_to_merge(rc, err);
3381
3382 merge_reloc_roots(rc);
3383
3384 rc->merge_reloc_tree = 0;
3385 unset_reloc_control(rc);
3386 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3387
3388 /* get rid of pinned extents */
3389 trans = btrfs_join_transaction(rc->extent_root);
3390 if (IS_ERR(trans)) {
3391 err = PTR_ERR(trans);
3392 goto out_free;
3393 }
3394 btrfs_commit_transaction(trans);
3395 out_free:
3396 ret = clean_dirty_subvols(rc);
3397 if (ret < 0 && !err)
3398 err = ret;
3399 btrfs_free_block_rsv(fs_info, rc->block_rsv);
3400 btrfs_free_path(path);
3401 return err;
3402 }
3403
3404 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3405 struct btrfs_root *root, u64 objectid)
3406 {
3407 struct btrfs_path *path;
3408 struct btrfs_inode_item *item;
3409 struct extent_buffer *leaf;
3410 int ret;
3411
3412 path = btrfs_alloc_path();
3413 if (!path)
3414 return -ENOMEM;
3415
3416 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3417 if (ret)
3418 goto out;
3419
3420 leaf = path->nodes[0];
3421 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3422 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3423 btrfs_set_inode_generation(leaf, item, 1);
3424 btrfs_set_inode_size(leaf, item, 0);
3425 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3426 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3427 BTRFS_INODE_PREALLOC);
3428 btrfs_mark_buffer_dirty(leaf);
3429 out:
3430 btrfs_free_path(path);
3431 return ret;
3432 }
3433
3434 /*
3435 * Helper to create an inode for data relocation.
3436 * The inode is in the data relocation tree and its link count is 0.
3437 */
3438 static noinline_for_stack
3439 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3440 struct btrfs_block_group *group)
3441 {
3442 struct inode *inode = NULL;
3443 struct btrfs_trans_handle *trans;
3444 struct btrfs_root *root;
3445 u64 objectid;
3446 int err = 0;
3447
3448 root = btrfs_grab_root(fs_info->data_reloc_root);
3449 trans = btrfs_start_transaction(root, 6);
3450 if (IS_ERR(trans)) {
3451 btrfs_put_root(root);
3452 return ERR_CAST(trans);
3453 }
3454
3455 err = btrfs_find_free_objectid(root, &objectid);
3456 if (err)
3457 goto out;
3458
3459 err = __insert_orphan_inode(trans, root, objectid);
3460 BUG_ON(err);
3461
3462 inode = btrfs_iget(fs_info->sb, objectid, root);
3463 BUG_ON(IS_ERR(inode));
3464 BTRFS_I(inode)->index_cnt = group->start;
3465
3466 err = btrfs_orphan_add(trans, BTRFS_I(inode));
3467 out:
3468 btrfs_put_root(root);
3469 btrfs_end_transaction(trans);
3470 btrfs_btree_balance_dirty(fs_info);
3471 if (err) {
3472 if (inode)
3473 iput(inode);
3474 inode = ERR_PTR(err);
3475 }
3476 return inode;
3477 }
3478
3479 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3480 {
3481 struct reloc_control *rc;
3482
3483 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3484 if (!rc)
3485 return NULL;
3486
3487 INIT_LIST_HEAD(&rc->reloc_roots);
3488 INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3489 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3490 mapping_tree_init(&rc->reloc_root_tree);
3491 extent_io_tree_init(fs_info, &rc->processed_blocks,
3492 IO_TREE_RELOC_BLOCKS, NULL);
3493 return rc;
3494 }
3495
3496 static void free_reloc_control(struct reloc_control *rc)
3497 {
3498 struct mapping_node *node, *tmp;
3499
3500 free_reloc_roots(&rc->reloc_roots);
3501 rbtree_postorder_for_each_entry_safe(node, tmp,
3502 &rc->reloc_root_tree.rb_root, rb_node)
3503 kfree(node);
3504
3505 kfree(rc);
3506 }
3507
3508 /*
3509 * Print the block group being relocated
3510 */
3511 static void describe_relocation(struct btrfs_fs_info *fs_info,
3512 struct btrfs_block_group *block_group)
3513 {
3514 char buf[128] = {'\0'};
3515
3516 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
3517
3518 btrfs_info(fs_info,
3519 "relocating block group %llu flags %s",
3520 block_group->start, buf);
3521 }
3522
3523 static const char *stage_to_string(int stage)
3524 {
3525 if (stage == MOVE_DATA_EXTENTS)
3526 return "move data extents";
3527 if (stage == UPDATE_DATA_PTRS)
3528 return "update data pointers";
3529 return "unknown";
3530 }
3531
3532 /*
3533 * function to relocate all extents in a block group.
3534 */
3535 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
3536 {
3537 struct btrfs_block_group *bg;
3538 struct btrfs_root *extent_root = fs_info->extent_root;
3539 struct reloc_control *rc;
3540 struct inode *inode;
3541 struct btrfs_path *path;
3542 int ret;
3543 int rw = 0;
3544 int err = 0;
3545
3546 bg = btrfs_lookup_block_group(fs_info, group_start);
3547 if (!bg)
3548 return -ENOENT;
3549
3550 if (btrfs_pinned_by_swapfile(fs_info, bg)) {
3551 btrfs_put_block_group(bg);
3552 return -ETXTBSY;
3553 }
3554
3555 rc = alloc_reloc_control(fs_info);
3556 if (!rc) {
3557 btrfs_put_block_group(bg);
3558 return -ENOMEM;
3559 }
3560
3561 rc->extent_root = extent_root;
3562 rc->block_group = bg;
3563
3564 ret = btrfs_inc_block_group_ro(rc->block_group, true);
3565 if (ret) {
3566 err = ret;
3567 goto out;
3568 }
3569 rw = 1;
3570
3571 path = btrfs_alloc_path();
3572 if (!path) {
3573 err = -ENOMEM;
3574 goto out;
3575 }
3576
3577 inode = lookup_free_space_inode(rc->block_group, path);
3578 btrfs_free_path(path);
3579
3580 if (!IS_ERR(inode))
3581 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
3582 else
3583 ret = PTR_ERR(inode);
3584
3585 if (ret && ret != -ENOENT) {
3586 err = ret;
3587 goto out;
3588 }
3589
3590 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3591 if (IS_ERR(rc->data_inode)) {
3592 err = PTR_ERR(rc->data_inode);
3593 rc->data_inode = NULL;
3594 goto out;
3595 }
3596
3597 describe_relocation(fs_info, rc->block_group);
3598
3599 btrfs_wait_block_group_reservations(rc->block_group);
3600 btrfs_wait_nocow_writers(rc->block_group);
3601 btrfs_wait_ordered_roots(fs_info, U64_MAX,
3602 rc->block_group->start,
3603 rc->block_group->length);
3604
3605 while (1) {
3606 int finishes_stage;
3607
3608 mutex_lock(&fs_info->cleaner_mutex);
3609 ret = relocate_block_group(rc);
3610 mutex_unlock(&fs_info->cleaner_mutex);
3611 if (ret < 0)
3612 err = ret;
3613
3614 finishes_stage = rc->stage;
3615 /*
3616 * We may have gotten ENOSPC after we already dirtied some
3617 * extents. If writeout happens while we're relocating a
3618 * different block group we could end up hitting the
3619 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
3620 * btrfs_reloc_cow_block. Make sure we write everything out
3621 * properly so we don't trip over this problem, and then break
3622 * out of the loop if we hit an error.
3623 */
3624 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3625 ret = btrfs_wait_ordered_range(rc->data_inode, 0,
3626 (u64)-1);
3627 if (ret)
3628 err = ret;
3629 invalidate_mapping_pages(rc->data_inode->i_mapping,
3630 0, -1);
3631 rc->stage = UPDATE_DATA_PTRS;
3632 }
3633
3634 if (err < 0)
3635 goto out;
3636
3637 if (rc->extents_found == 0)
3638 break;
3639
3640 btrfs_info(fs_info, "found %llu extents, stage: %s",
3641 rc->extents_found, stage_to_string(finishes_stage));
3642 }
3643
3644 WARN_ON(rc->block_group->pinned > 0);
3645 WARN_ON(rc->block_group->reserved > 0);
3646 WARN_ON(rc->block_group->used > 0);
3647 out:
3648 if (err && rw)
3649 btrfs_dec_block_group_ro(rc->block_group);
3650 iput(rc->data_inode);
3651 btrfs_put_block_group(rc->block_group);
3652 free_reloc_control(rc);
3653 return err;
3654 }
3655
3656 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3657 {
3658 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct btrfs_trans_handle *trans;
3660 int ret, err;
3661
3662 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3663 if (IS_ERR(trans))
3664 return PTR_ERR(trans);
3665
3666 memset(&root->root_item.drop_progress, 0,
3667 sizeof(root->root_item.drop_progress));
3668 btrfs_set_root_drop_level(&root->root_item, 0);
3669 btrfs_set_root_refs(&root->root_item, 0);
3670 ret = btrfs_update_root(trans, fs_info->tree_root,
3671 &root->root_key, &root->root_item);
3672
3673 err = btrfs_end_transaction(trans);
3674 if (err)
3675 return err;
3676 return ret;
3677 }
3678
3679 /*
3680 * Recover relocation interrupted by a system crash.
3681 *
3682 * This function resumes merging reloc trees with corresponding fs trees.
3683 * This is important for keeping the sharing of tree blocks.
3684 */
3685 int btrfs_recover_relocation(struct btrfs_root *root)
3686 {
3687 struct btrfs_fs_info *fs_info = root->fs_info;
3688 LIST_HEAD(reloc_roots);
3689 struct btrfs_key key;
3690 struct btrfs_root *fs_root;
3691 struct btrfs_root *reloc_root;
3692 struct btrfs_path *path;
3693 struct extent_buffer *leaf;
3694 struct reloc_control *rc = NULL;
3695 struct btrfs_trans_handle *trans;
3696 int ret;
3697 int err = 0;
3698
3699 path = btrfs_alloc_path();
3700 if (!path)
3701 return -ENOMEM;
3702 path->reada = READA_BACK;
3703
3704 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3705 key.type = BTRFS_ROOT_ITEM_KEY;
3706 key.offset = (u64)-1;
3707
3708 while (1) {
3709 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
3710 path, 0, 0);
3711 if (ret < 0) {
3712 err = ret;
3713 goto out;
3714 }
3715 if (ret > 0) {
3716 if (path->slots[0] == 0)
3717 break;
3718 path->slots[0]--;
3719 }
3720 leaf = path->nodes[0];
3721 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3722 btrfs_release_path(path);
3723
3724 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3725 key.type != BTRFS_ROOT_ITEM_KEY)
3726 break;
3727
3728 reloc_root = btrfs_read_tree_root(root, &key);
3729 if (IS_ERR(reloc_root)) {
3730 err = PTR_ERR(reloc_root);
3731 goto out;
3732 }
3733
3734 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
3735 list_add(&reloc_root->root_list, &reloc_roots);
3736
3737 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3738 fs_root = btrfs_get_fs_root(fs_info,
3739 reloc_root->root_key.offset, false);
3740 if (IS_ERR(fs_root)) {
3741 ret = PTR_ERR(fs_root);
3742 if (ret != -ENOENT) {
3743 err = ret;
3744 goto out;
3745 }
3746 ret = mark_garbage_root(reloc_root);
3747 if (ret < 0) {
3748 err = ret;
3749 goto out;
3750 }
3751 } else {
3752 btrfs_put_root(fs_root);
3753 }
3754 }
3755
3756 if (key.offset == 0)
3757 break;
3758
3759 key.offset--;
3760 }
3761 btrfs_release_path(path);
3762
3763 if (list_empty(&reloc_roots))
3764 goto out;
3765
3766 rc = alloc_reloc_control(fs_info);
3767 if (!rc) {
3768 err = -ENOMEM;
3769 goto out;
3770 }
3771
3772 rc->extent_root = fs_info->extent_root;
3773
3774 set_reloc_control(rc);
3775
3776 trans = btrfs_join_transaction(rc->extent_root);
3777 if (IS_ERR(trans)) {
3778 err = PTR_ERR(trans);
3779 goto out_unset;
3780 }
3781
3782 rc->merge_reloc_tree = 1;
3783
3784 while (!list_empty(&reloc_roots)) {
3785 reloc_root = list_entry(reloc_roots.next,
3786 struct btrfs_root, root_list);
3787 list_del(&reloc_root->root_list);
3788
3789 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3790 list_add_tail(&reloc_root->root_list,
3791 &rc->reloc_roots);
3792 continue;
3793 }
3794
3795 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
3796 false);
3797 if (IS_ERR(fs_root)) {
3798 err = PTR_ERR(fs_root);
3799 list_add_tail(&reloc_root->root_list, &reloc_roots);
3800 btrfs_end_transaction(trans);
3801 goto out_unset;
3802 }
3803
3804 err = __add_reloc_root(reloc_root);
3805 BUG_ON(err < 0); /* -ENOMEM or logic error */
3806 fs_root->reloc_root = btrfs_grab_root(reloc_root);
3807 btrfs_put_root(fs_root);
3808 }
3809
3810 err = btrfs_commit_transaction(trans);
3811 if (err)
3812 goto out_unset;
3813
3814 merge_reloc_roots(rc);
3815
3816 unset_reloc_control(rc);
3817
3818 trans = btrfs_join_transaction(rc->extent_root);
3819 if (IS_ERR(trans)) {
3820 err = PTR_ERR(trans);
3821 goto out_clean;
3822 }
3823 err = btrfs_commit_transaction(trans);
3824 out_clean:
3825 ret = clean_dirty_subvols(rc);
3826 if (ret < 0 && !err)
3827 err = ret;
3828 out_unset:
3829 unset_reloc_control(rc);
3830 free_reloc_control(rc);
3831 out:
3832 free_reloc_roots(&reloc_roots);
3833
3834 btrfs_free_path(path);
3835
3836 if (err == 0) {
3837 /* cleanup orphan inode in data relocation tree */
3838 fs_root = btrfs_grab_root(fs_info->data_reloc_root);
3839 ASSERT(fs_root);
3840 err = btrfs_orphan_cleanup(fs_root);
3841 btrfs_put_root(fs_root);
3842 }
3843 return err;
3844 }
3845
3846 /*
3847 * Helper to add ordered checksums for data relocation.
3848 *
3849 * Cloning checksums properly handles nodatasum extents. It also saves
3850 * the CPU time that would otherwise be spent recalculating them.
3851 */
3852 int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
3853 {
3854 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3855 struct btrfs_ordered_sum *sums;
3856 struct btrfs_ordered_extent *ordered;
3857 int ret;
3858 u64 disk_bytenr;
3859 u64 new_bytenr;
3860 LIST_HEAD(list);
3861
3862 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3863 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
3864
3865 disk_bytenr = file_pos + inode->index_cnt;
3866 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
3867 disk_bytenr + len - 1, &list, 0);
3868 if (ret)
3869 goto out;
3870
3871 while (!list_empty(&list)) {
3872 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3873 list_del_init(&sums->list);
3874
3875 /*
3876 * We need to offset the new_bytenr based on where the csum is.
3877 * We need to do this because we will read in entire prealloc
3878 * extents but we may have written to, say, the middle of the
3879 * prealloc extent, so we need to make sure the csum goes with
3880 * the right disk offset.
3881 *
3882 * We can do this because the data reloc inode refers strictly
3883 * to the on disk bytes, so we don't have to worry about
3884 * disk_len vs real len like with real inodes since it's all
3885 * disk length.
3886 */
3887 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
3888 sums->bytenr = new_bytenr;
3889
3890 btrfs_add_ordered_sum(ordered, sums);
3891 }
3892 out:
3893 btrfs_put_ordered_extent(ordered);
3894 return ret;
3895 }
3896
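/*
 * Worked example for the csum rebasing above (illustrative numbers only):
 * with file_pos == 0 and index_cnt == 1GiB, disk_bytenr is 1GiB; a csum
 * found at sums->bytenr == 1GiB + 8KiB sits 8KiB into the old extent. If
 * the ordered extent was allocated at disk_bytenr 5GiB, the csum is rebased
 * to new_bytenr == 5GiB + 8KiB, keeping it aligned with the written range.
 */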
3897 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root, struct extent_buffer *buf,
3899 struct extent_buffer *cow)
3900 {
3901 struct btrfs_fs_info *fs_info = root->fs_info;
3902 struct reloc_control *rc;
3903 struct btrfs_backref_node *node;
3904 int first_cow = 0;
3905 int level;
3906 int ret = 0;
3907
3908 rc = fs_info->reloc_ctl;
3909 if (!rc)
3910 return 0;
3911
3912 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
3913 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
3914
3915 level = btrfs_header_level(buf);
3916 if (btrfs_header_generation(buf) <=
3917 btrfs_root_last_snapshot(&root->root_item))
3918 first_cow = 1;
3919
3920 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
3921 rc->create_reloc_tree) {
3922 WARN_ON(!first_cow && level == 0);
3923
3924 node = rc->backref_cache.path[level];
3925 BUG_ON(node->bytenr != buf->start &&
3926 node->new_bytenr != buf->start);
3927
3928 btrfs_backref_drop_node_buffer(node);
3929 atomic_inc(&cow->refs);
3930 node->eb = cow;
3931 node->new_bytenr = cow->start;
3932
3933 if (!node->pending) {
3934 list_move_tail(&node->list,
3935 &rc->backref_cache.pending[level]);
3936 node->pending = 1;
3937 }
3938
3939 if (first_cow)
3940 mark_block_processed(rc, node);
3941
3942 if (first_cow && level > 0)
3943 rc->nodes_relocated += buf->len;
3944 }
3945
3946 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
3947 ret = replace_file_extents(trans, rc, root, cow);
3948 return ret;
3949 }
3950
3951 /*
3952 * Called before creating a snapshot. It calculates the metadata
3953 * reservation required for relocating tree blocks in the snapshot.
3954 */
3955 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
3956 u64 *bytes_to_reserve)
3957 {
3958 struct btrfs_root *root = pending->root;
3959 struct reloc_control *rc = root->fs_info->reloc_ctl;
3960
3961 if (!rc || !have_reloc_root(root))
3962 return;
3963
3964 if (!rc->merge_reloc_tree)
3965 return;
3966
3967 root = root->reloc_root;
3968 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
3969 /*
3970 * Relocation is in the stage of merging trees. The space
3971 * used by merging a reloc tree is twice the size of the
3972 * relocated tree nodes in the worst case: half for COWing
3973 * the reloc tree, half for COWing the fs tree. The space
3974 * used by COWing the reloc tree will be freed after the
3975 * tree is dropped. If we create a snapshot, COWing the fs
3976 * tree may use more space than it frees, so we need to
3977 * reserve extra space.
3978 */
3979 *bytes_to_reserve += rc->nodes_relocated;
3980 }
3981
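/*
 * Example: if rc->nodes_relocated is 1GiB, merging may COW up to ~2GiB in
 * the worst case (half reloc tree, half fs tree). The reloc-tree half comes
 * back when that tree is dropped, but a snapshot can make the fs-tree COW
 * use more than it frees, hence the extra rc->nodes_relocated added to
 * *bytes_to_reserve above.
 */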
3982 /*
3983 * Called after a snapshot is created. Migrate the block reservation
3984 * and create a reloc root for the newly created snapshot.
3985 *
3986 * This is similar to btrfs_init_reloc_root(), we come out of here with two
3987 * references held on the reloc_root, one for root->reloc_root and one for
3988 * rc->reloc_roots.
3989 */
3990 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3991 struct btrfs_pending_snapshot *pending)
3992 {
3993 struct btrfs_root *root = pending->root;
3994 struct btrfs_root *reloc_root;
3995 struct btrfs_root *new_root;
3996 struct reloc_control *rc = root->fs_info->reloc_ctl;
3997 int ret;
3998
3999 if (!rc || !have_reloc_root(root))
4000 return 0;
4001
4002 rc = root->fs_info->reloc_ctl;
4003 rc->merging_rsv_size += rc->nodes_relocated;
4004
4005 if (rc->merge_reloc_tree) {
4006 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4007 rc->block_rsv,
4008 rc->nodes_relocated, true);
4009 if (ret)
4010 return ret;
4011 }
4012
4013 new_root = pending->snap;
4014 reloc_root = create_reloc_root(trans, root->reloc_root,
4015 new_root->root_key.objectid);
4016 if (IS_ERR(reloc_root))
4017 return PTR_ERR(reloc_root);
4018
4019 ret = __add_reloc_root(reloc_root);
4020 BUG_ON(ret < 0);
4021 new_root->reloc_root = btrfs_grab_root(reloc_root);
4022
4023 if (rc->create_reloc_tree)
4024 ret = clone_backref_node(trans, rc, root, reloc_root);
4025 return ret;
4026 }