2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02110-1301, USA.
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
25 #include "transaction.h"
26 #include "print-tree.h"
29 static int split_node(struct btrfs_trans_handle
*trans
, struct btrfs_root
30 *root
, struct btrfs_path
*path
, int level
);
31 static int split_leaf(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
32 const struct btrfs_key
*ins_key
, struct btrfs_path
*path
,
33 int data_size
, int extend
);
34 static int push_node_left(struct btrfs_trans_handle
*trans
,
35 struct btrfs_fs_info
*fs_info
,
36 struct extent_buffer
*dst
,
37 struct extent_buffer
*src
, int empty
);
38 static int balance_node_right(struct btrfs_trans_handle
*trans
,
39 struct btrfs_fs_info
*fs_info
,
40 struct extent_buffer
*dst_buf
,
41 struct extent_buffer
*src_buf
);
42 static void del_ptr(struct btrfs_root
*root
, struct btrfs_path
*path
,
44 static int tree_mod_log_free_eb(struct btrfs_fs_info
*fs_info
,
45 struct extent_buffer
*eb
);
47 struct btrfs_path
*btrfs_alloc_path(void)
49 return kmem_cache_zalloc(btrfs_path_cachep
, GFP_NOFS
);
53 * set all locked nodes in the path to blocking locks. This should
54 * be done before scheduling
56 noinline
void btrfs_set_path_blocking(struct btrfs_path
*p
)
59 for (i
= 0; i
< BTRFS_MAX_LEVEL
; i
++) {
60 if (!p
->nodes
[i
] || !p
->locks
[i
])
62 btrfs_set_lock_blocking_rw(p
->nodes
[i
], p
->locks
[i
]);
63 if (p
->locks
[i
] == BTRFS_READ_LOCK
)
64 p
->locks
[i
] = BTRFS_READ_LOCK_BLOCKING
;
65 else if (p
->locks
[i
] == BTRFS_WRITE_LOCK
)
66 p
->locks
[i
] = BTRFS_WRITE_LOCK_BLOCKING
;
71 * reset all the locked nodes in the patch to spinning locks.
73 * held is used to keep lockdep happy, when lockdep is enabled
74 * we set held to a blocking lock before we go around and
75 * retake all the spinlocks in the path. You can safely use NULL
78 noinline
void btrfs_clear_path_blocking(struct btrfs_path
*p
,
79 struct extent_buffer
*held
, int held_rw
)
84 btrfs_set_lock_blocking_rw(held
, held_rw
);
85 if (held_rw
== BTRFS_WRITE_LOCK
)
86 held_rw
= BTRFS_WRITE_LOCK_BLOCKING
;
87 else if (held_rw
== BTRFS_READ_LOCK
)
88 held_rw
= BTRFS_READ_LOCK_BLOCKING
;
90 btrfs_set_path_blocking(p
);
92 for (i
= BTRFS_MAX_LEVEL
- 1; i
>= 0; i
--) {
93 if (p
->nodes
[i
] && p
->locks
[i
]) {
94 btrfs_clear_lock_blocking_rw(p
->nodes
[i
], p
->locks
[i
]);
95 if (p
->locks
[i
] == BTRFS_WRITE_LOCK_BLOCKING
)
96 p
->locks
[i
] = BTRFS_WRITE_LOCK
;
97 else if (p
->locks
[i
] == BTRFS_READ_LOCK_BLOCKING
)
98 p
->locks
[i
] = BTRFS_READ_LOCK
;
103 btrfs_clear_lock_blocking_rw(held
, held_rw
);
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path
*p
)
111 btrfs_release_path(p
);
112 kmem_cache_free(btrfs_path_cachep
, p
);
116 * path release drops references on the extent buffers in the path
117 * and it drops any locks held by this path
119 * It is safe to call this on paths that no locks or extent buffers held.
121 noinline
void btrfs_release_path(struct btrfs_path
*p
)
125 for (i
= 0; i
< BTRFS_MAX_LEVEL
; i
++) {
130 btrfs_tree_unlock_rw(p
->nodes
[i
], p
->locks
[i
]);
133 free_extent_buffer(p
->nodes
[i
]);
139 * safely gets a reference on the root node of a tree. A lock
140 * is not taken, so a concurrent writer may put a different node
141 * at the root of the tree. See btrfs_lock_root_node for the
144 * The extent buffer returned by this has a reference taken, so
145 * it won't disappear. It may stop being the root of the tree
146 * at any time because there are no locks held.
148 struct extent_buffer
*btrfs_root_node(struct btrfs_root
*root
)
150 struct extent_buffer
*eb
;
154 eb
= rcu_dereference(root
->node
);
157 * RCU really hurts here, we could free up the root node because
158 * it was COWed but we may not get the new root node yet so do
159 * the inc_not_zero dance and if it doesn't work then
160 * synchronize_rcu and try again.
162 if (atomic_inc_not_zero(&eb
->refs
)) {
172 /* loop around taking references on and locking the root node of the
173 * tree until you end up with a lock on the root. A locked buffer
174 * is returned, with a reference held.
176 struct extent_buffer
*btrfs_lock_root_node(struct btrfs_root
*root
)
178 struct extent_buffer
*eb
;
181 eb
= btrfs_root_node(root
);
183 if (eb
== root
->node
)
185 btrfs_tree_unlock(eb
);
186 free_extent_buffer(eb
);
191 /* loop around taking references on and locking the root node of the
192 * tree until you end up with a lock on the root. A locked buffer
193 * is returned, with a reference held.
195 struct extent_buffer
*btrfs_read_lock_root_node(struct btrfs_root
*root
)
197 struct extent_buffer
*eb
;
200 eb
= btrfs_root_node(root
);
201 btrfs_tree_read_lock(eb
);
202 if (eb
== root
->node
)
204 btrfs_tree_read_unlock(eb
);
205 free_extent_buffer(eb
);
210 /* cowonly root (everything not a reference counted cow subvolume), just get
211 * put onto a simple dirty list. transaction.c walks this to make sure they
212 * get properly updated on disk.
214 static void add_root_to_dirty_list(struct btrfs_root
*root
)
216 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
218 if (test_bit(BTRFS_ROOT_DIRTY
, &root
->state
) ||
219 !test_bit(BTRFS_ROOT_TRACK_DIRTY
, &root
->state
))
222 spin_lock(&fs_info
->trans_lock
);
223 if (!test_and_set_bit(BTRFS_ROOT_DIRTY
, &root
->state
)) {
224 /* Want the extent tree to be the last on the list */
225 if (root
->objectid
== BTRFS_EXTENT_TREE_OBJECTID
)
226 list_move_tail(&root
->dirty_list
,
227 &fs_info
->dirty_cowonly_roots
);
229 list_move(&root
->dirty_list
,
230 &fs_info
->dirty_cowonly_roots
);
232 spin_unlock(&fs_info
->trans_lock
);
236 * used by snapshot creation to make a copy of a root for a tree with
237 * a given objectid. The buffer with the new root node is returned in
238 * cow_ret, and this func returns zero on success or a negative error code.
240 int btrfs_copy_root(struct btrfs_trans_handle
*trans
,
241 struct btrfs_root
*root
,
242 struct extent_buffer
*buf
,
243 struct extent_buffer
**cow_ret
, u64 new_root_objectid
)
245 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
246 struct extent_buffer
*cow
;
249 struct btrfs_disk_key disk_key
;
251 WARN_ON(test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
252 trans
->transid
!= fs_info
->running_transaction
->transid
);
253 WARN_ON(test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
254 trans
->transid
!= root
->last_trans
);
256 level
= btrfs_header_level(buf
);
258 btrfs_item_key(buf
, &disk_key
, 0);
260 btrfs_node_key(buf
, &disk_key
, 0);
262 cow
= btrfs_alloc_tree_block(trans
, root
, 0, new_root_objectid
,
263 &disk_key
, level
, buf
->start
, 0);
267 copy_extent_buffer_full(cow
, buf
);
268 btrfs_set_header_bytenr(cow
, cow
->start
);
269 btrfs_set_header_generation(cow
, trans
->transid
);
270 btrfs_set_header_backref_rev(cow
, BTRFS_MIXED_BACKREF_REV
);
271 btrfs_clear_header_flag(cow
, BTRFS_HEADER_FLAG_WRITTEN
|
272 BTRFS_HEADER_FLAG_RELOC
);
273 if (new_root_objectid
== BTRFS_TREE_RELOC_OBJECTID
)
274 btrfs_set_header_flag(cow
, BTRFS_HEADER_FLAG_RELOC
);
276 btrfs_set_header_owner(cow
, new_root_objectid
);
278 write_extent_buffer_fsid(cow
, fs_info
->fsid
);
280 WARN_ON(btrfs_header_generation(buf
) > trans
->transid
);
281 if (new_root_objectid
== BTRFS_TREE_RELOC_OBJECTID
)
282 ret
= btrfs_inc_ref(trans
, root
, cow
, 1);
284 ret
= btrfs_inc_ref(trans
, root
, cow
, 0);
289 btrfs_mark_buffer_dirty(cow
);
298 MOD_LOG_KEY_REMOVE_WHILE_FREEING
,
299 MOD_LOG_KEY_REMOVE_WHILE_MOVING
,
301 MOD_LOG_ROOT_REPLACE
,
304 struct tree_mod_move
{
309 struct tree_mod_root
{
314 struct tree_mod_elem
{
320 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
323 /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
326 /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
327 struct btrfs_disk_key key
;
330 /* this is used for op == MOD_LOG_MOVE_KEYS */
331 struct tree_mod_move move
;
333 /* this is used for op == MOD_LOG_ROOT_REPLACE */
334 struct tree_mod_root old_root
;
338 * Pull a new tree mod seq number for our operation.
340 static inline u64
btrfs_inc_tree_mod_seq(struct btrfs_fs_info
*fs_info
)
342 return atomic64_inc_return(&fs_info
->tree_mod_seq
);
346 * This adds a new blocker to the tree mod log's blocker list if the @elem
347 * passed does not already have a sequence number set. So when a caller expects
348 * to record tree modifications, it should ensure to set elem->seq to zero
349 * before calling btrfs_get_tree_mod_seq.
350 * Returns a fresh, unused tree log modification sequence number, even if no new
353 u64
btrfs_get_tree_mod_seq(struct btrfs_fs_info
*fs_info
,
354 struct seq_list
*elem
)
356 write_lock(&fs_info
->tree_mod_log_lock
);
357 spin_lock(&fs_info
->tree_mod_seq_lock
);
359 elem
->seq
= btrfs_inc_tree_mod_seq(fs_info
);
360 list_add_tail(&elem
->list
, &fs_info
->tree_mod_seq_list
);
362 spin_unlock(&fs_info
->tree_mod_seq_lock
);
363 write_unlock(&fs_info
->tree_mod_log_lock
);
368 void btrfs_put_tree_mod_seq(struct btrfs_fs_info
*fs_info
,
369 struct seq_list
*elem
)
371 struct rb_root
*tm_root
;
372 struct rb_node
*node
;
373 struct rb_node
*next
;
374 struct seq_list
*cur_elem
;
375 struct tree_mod_elem
*tm
;
376 u64 min_seq
= (u64
)-1;
377 u64 seq_putting
= elem
->seq
;
382 spin_lock(&fs_info
->tree_mod_seq_lock
);
383 list_del(&elem
->list
);
386 list_for_each_entry(cur_elem
, &fs_info
->tree_mod_seq_list
, list
) {
387 if (cur_elem
->seq
< min_seq
) {
388 if (seq_putting
> cur_elem
->seq
) {
390 * blocker with lower sequence number exists, we
391 * cannot remove anything from the log
393 spin_unlock(&fs_info
->tree_mod_seq_lock
);
396 min_seq
= cur_elem
->seq
;
399 spin_unlock(&fs_info
->tree_mod_seq_lock
);
402 * anything that's lower than the lowest existing (read: blocked)
403 * sequence number can be removed from the tree.
405 write_lock(&fs_info
->tree_mod_log_lock
);
406 tm_root
= &fs_info
->tree_mod_log
;
407 for (node
= rb_first(tm_root
); node
; node
= next
) {
408 next
= rb_next(node
);
409 tm
= rb_entry(node
, struct tree_mod_elem
, node
);
410 if (tm
->seq
>= min_seq
)
412 rb_erase(node
, tm_root
);
415 write_unlock(&fs_info
->tree_mod_log_lock
);
419 * key order of the log:
420 * node/leaf start address -> sequence
422 * The 'start address' is the logical address of the *new* root node
423 * for root replace operations, or the logical address of the affected
424 * block for all other operations.
426 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
429 __tree_mod_log_insert(struct btrfs_fs_info
*fs_info
, struct tree_mod_elem
*tm
)
431 struct rb_root
*tm_root
;
432 struct rb_node
**new;
433 struct rb_node
*parent
= NULL
;
434 struct tree_mod_elem
*cur
;
436 tm
->seq
= btrfs_inc_tree_mod_seq(fs_info
);
438 tm_root
= &fs_info
->tree_mod_log
;
439 new = &tm_root
->rb_node
;
441 cur
= rb_entry(*new, struct tree_mod_elem
, node
);
443 if (cur
->logical
< tm
->logical
)
444 new = &((*new)->rb_left
);
445 else if (cur
->logical
> tm
->logical
)
446 new = &((*new)->rb_right
);
447 else if (cur
->seq
< tm
->seq
)
448 new = &((*new)->rb_left
);
449 else if (cur
->seq
> tm
->seq
)
450 new = &((*new)->rb_right
);
455 rb_link_node(&tm
->node
, parent
, new);
456 rb_insert_color(&tm
->node
, tm_root
);
461 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
462 * returns zero with the tree_mod_log_lock acquired. The caller must hold
463 * this until all tree mod log insertions are recorded in the rb tree and then
464 * write unlock fs_info::tree_mod_log_lock.
466 static inline int tree_mod_dont_log(struct btrfs_fs_info
*fs_info
,
467 struct extent_buffer
*eb
) {
469 if (list_empty(&(fs_info
)->tree_mod_seq_list
))
471 if (eb
&& btrfs_header_level(eb
) == 0)
474 write_lock(&fs_info
->tree_mod_log_lock
);
475 if (list_empty(&(fs_info
)->tree_mod_seq_list
)) {
476 write_unlock(&fs_info
->tree_mod_log_lock
);
483 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
484 static inline int tree_mod_need_log(const struct btrfs_fs_info
*fs_info
,
485 struct extent_buffer
*eb
)
488 if (list_empty(&(fs_info
)->tree_mod_seq_list
))
490 if (eb
&& btrfs_header_level(eb
) == 0)
496 static struct tree_mod_elem
*
497 alloc_tree_mod_elem(struct extent_buffer
*eb
, int slot
,
498 enum mod_log_op op
, gfp_t flags
)
500 struct tree_mod_elem
*tm
;
502 tm
= kzalloc(sizeof(*tm
), flags
);
506 tm
->logical
= eb
->start
;
507 if (op
!= MOD_LOG_KEY_ADD
) {
508 btrfs_node_key(eb
, &tm
->key
, slot
);
509 tm
->blockptr
= btrfs_node_blockptr(eb
, slot
);
513 tm
->generation
= btrfs_node_ptr_generation(eb
, slot
);
514 RB_CLEAR_NODE(&tm
->node
);
520 tree_mod_log_insert_key(struct btrfs_fs_info
*fs_info
,
521 struct extent_buffer
*eb
, int slot
,
522 enum mod_log_op op
, gfp_t flags
)
524 struct tree_mod_elem
*tm
;
527 if (!tree_mod_need_log(fs_info
, eb
))
530 tm
= alloc_tree_mod_elem(eb
, slot
, op
, flags
);
534 if (tree_mod_dont_log(fs_info
, eb
)) {
539 ret
= __tree_mod_log_insert(fs_info
, tm
);
540 write_unlock(&eb
->fs_info
->tree_mod_log_lock
);
548 tree_mod_log_insert_move(struct btrfs_fs_info
*fs_info
,
549 struct extent_buffer
*eb
, int dst_slot
, int src_slot
,
552 struct tree_mod_elem
*tm
= NULL
;
553 struct tree_mod_elem
**tm_list
= NULL
;
558 if (!tree_mod_need_log(fs_info
, eb
))
561 tm_list
= kcalloc(nr_items
, sizeof(struct tree_mod_elem
*), GFP_NOFS
);
565 tm
= kzalloc(sizeof(*tm
), GFP_NOFS
);
571 tm
->logical
= eb
->start
;
573 tm
->move
.dst_slot
= dst_slot
;
574 tm
->move
.nr_items
= nr_items
;
575 tm
->op
= MOD_LOG_MOVE_KEYS
;
577 for (i
= 0; i
+ dst_slot
< src_slot
&& i
< nr_items
; i
++) {
578 tm_list
[i
] = alloc_tree_mod_elem(eb
, i
+ dst_slot
,
579 MOD_LOG_KEY_REMOVE_WHILE_MOVING
, GFP_NOFS
);
586 if (tree_mod_dont_log(fs_info
, eb
))
591 * When we override something during the move, we log these removals.
592 * This can only happen when we move towards the beginning of the
593 * buffer, i.e. dst_slot < src_slot.
595 for (i
= 0; i
+ dst_slot
< src_slot
&& i
< nr_items
; i
++) {
596 ret
= __tree_mod_log_insert(fs_info
, tm_list
[i
]);
601 ret
= __tree_mod_log_insert(fs_info
, tm
);
604 write_unlock(&eb
->fs_info
->tree_mod_log_lock
);
609 for (i
= 0; i
< nr_items
; i
++) {
610 if (tm_list
[i
] && !RB_EMPTY_NODE(&tm_list
[i
]->node
))
611 rb_erase(&tm_list
[i
]->node
, &fs_info
->tree_mod_log
);
615 write_unlock(&eb
->fs_info
->tree_mod_log_lock
);
623 __tree_mod_log_free_eb(struct btrfs_fs_info
*fs_info
,
624 struct tree_mod_elem
**tm_list
,
630 for (i
= nritems
- 1; i
>= 0; i
--) {
631 ret
= __tree_mod_log_insert(fs_info
, tm_list
[i
]);
633 for (j
= nritems
- 1; j
> i
; j
--)
634 rb_erase(&tm_list
[j
]->node
,
635 &fs_info
->tree_mod_log
);
644 tree_mod_log_insert_root(struct btrfs_fs_info
*fs_info
,
645 struct extent_buffer
*old_root
,
646 struct extent_buffer
*new_root
,
649 struct tree_mod_elem
*tm
= NULL
;
650 struct tree_mod_elem
**tm_list
= NULL
;
655 if (!tree_mod_need_log(fs_info
, NULL
))
658 if (log_removal
&& btrfs_header_level(old_root
) > 0) {
659 nritems
= btrfs_header_nritems(old_root
);
660 tm_list
= kcalloc(nritems
, sizeof(struct tree_mod_elem
*),
666 for (i
= 0; i
< nritems
; i
++) {
667 tm_list
[i
] = alloc_tree_mod_elem(old_root
, i
,
668 MOD_LOG_KEY_REMOVE_WHILE_FREEING
, GFP_NOFS
);
676 tm
= kzalloc(sizeof(*tm
), GFP_NOFS
);
682 tm
->logical
= new_root
->start
;
683 tm
->old_root
.logical
= old_root
->start
;
684 tm
->old_root
.level
= btrfs_header_level(old_root
);
685 tm
->generation
= btrfs_header_generation(old_root
);
686 tm
->op
= MOD_LOG_ROOT_REPLACE
;
688 if (tree_mod_dont_log(fs_info
, NULL
))
692 ret
= __tree_mod_log_free_eb(fs_info
, tm_list
, nritems
);
694 ret
= __tree_mod_log_insert(fs_info
, tm
);
696 write_unlock(&fs_info
->tree_mod_log_lock
);
705 for (i
= 0; i
< nritems
; i
++)
714 static struct tree_mod_elem
*
715 __tree_mod_log_search(struct btrfs_fs_info
*fs_info
, u64 start
, u64 min_seq
,
718 struct rb_root
*tm_root
;
719 struct rb_node
*node
;
720 struct tree_mod_elem
*cur
= NULL
;
721 struct tree_mod_elem
*found
= NULL
;
723 read_lock(&fs_info
->tree_mod_log_lock
);
724 tm_root
= &fs_info
->tree_mod_log
;
725 node
= tm_root
->rb_node
;
727 cur
= rb_entry(node
, struct tree_mod_elem
, node
);
728 if (cur
->logical
< start
) {
729 node
= node
->rb_left
;
730 } else if (cur
->logical
> start
) {
731 node
= node
->rb_right
;
732 } else if (cur
->seq
< min_seq
) {
733 node
= node
->rb_left
;
734 } else if (!smallest
) {
735 /* we want the node with the highest seq */
737 BUG_ON(found
->seq
> cur
->seq
);
739 node
= node
->rb_left
;
740 } else if (cur
->seq
> min_seq
) {
741 /* we want the node with the smallest seq */
743 BUG_ON(found
->seq
< cur
->seq
);
745 node
= node
->rb_right
;
751 read_unlock(&fs_info
->tree_mod_log_lock
);
757 * this returns the element from the log with the smallest time sequence
758 * value that's in the log (the oldest log item). any element with a time
759 * sequence lower than min_seq will be ignored.
761 static struct tree_mod_elem
*
762 tree_mod_log_search_oldest(struct btrfs_fs_info
*fs_info
, u64 start
,
765 return __tree_mod_log_search(fs_info
, start
, min_seq
, 1);
769 * this returns the element from the log with the largest time sequence
770 * value that's in the log (the most recent log item). any element with
771 * a time sequence lower than min_seq will be ignored.
773 static struct tree_mod_elem
*
774 tree_mod_log_search(struct btrfs_fs_info
*fs_info
, u64 start
, u64 min_seq
)
776 return __tree_mod_log_search(fs_info
, start
, min_seq
, 0);
780 tree_mod_log_eb_copy(struct btrfs_fs_info
*fs_info
, struct extent_buffer
*dst
,
781 struct extent_buffer
*src
, unsigned long dst_offset
,
782 unsigned long src_offset
, int nr_items
)
785 struct tree_mod_elem
**tm_list
= NULL
;
786 struct tree_mod_elem
**tm_list_add
, **tm_list_rem
;
790 if (!tree_mod_need_log(fs_info
, NULL
))
793 if (btrfs_header_level(dst
) == 0 && btrfs_header_level(src
) == 0)
796 tm_list
= kcalloc(nr_items
* 2, sizeof(struct tree_mod_elem
*),
801 tm_list_add
= tm_list
;
802 tm_list_rem
= tm_list
+ nr_items
;
803 for (i
= 0; i
< nr_items
; i
++) {
804 tm_list_rem
[i
] = alloc_tree_mod_elem(src
, i
+ src_offset
,
805 MOD_LOG_KEY_REMOVE
, GFP_NOFS
);
806 if (!tm_list_rem
[i
]) {
811 tm_list_add
[i
] = alloc_tree_mod_elem(dst
, i
+ dst_offset
,
812 MOD_LOG_KEY_ADD
, GFP_NOFS
);
813 if (!tm_list_add
[i
]) {
819 if (tree_mod_dont_log(fs_info
, NULL
))
823 for (i
= 0; i
< nr_items
; i
++) {
824 ret
= __tree_mod_log_insert(fs_info
, tm_list_rem
[i
]);
827 ret
= __tree_mod_log_insert(fs_info
, tm_list_add
[i
]);
832 write_unlock(&fs_info
->tree_mod_log_lock
);
838 for (i
= 0; i
< nr_items
* 2; i
++) {
839 if (tm_list
[i
] && !RB_EMPTY_NODE(&tm_list
[i
]->node
))
840 rb_erase(&tm_list
[i
]->node
, &fs_info
->tree_mod_log
);
844 write_unlock(&fs_info
->tree_mod_log_lock
);
851 tree_mod_log_eb_move(struct btrfs_fs_info
*fs_info
, struct extent_buffer
*dst
,
852 int dst_offset
, int src_offset
, int nr_items
)
855 ret
= tree_mod_log_insert_move(fs_info
, dst
, dst_offset
, src_offset
,
861 tree_mod_log_set_node_key(struct btrfs_fs_info
*fs_info
,
862 struct extent_buffer
*eb
, int slot
, int atomic
)
866 ret
= tree_mod_log_insert_key(fs_info
, eb
, slot
,
868 atomic
? GFP_ATOMIC
: GFP_NOFS
);
873 tree_mod_log_free_eb(struct btrfs_fs_info
*fs_info
, struct extent_buffer
*eb
)
875 struct tree_mod_elem
**tm_list
= NULL
;
880 if (btrfs_header_level(eb
) == 0)
883 if (!tree_mod_need_log(fs_info
, NULL
))
886 nritems
= btrfs_header_nritems(eb
);
887 tm_list
= kcalloc(nritems
, sizeof(struct tree_mod_elem
*), GFP_NOFS
);
891 for (i
= 0; i
< nritems
; i
++) {
892 tm_list
[i
] = alloc_tree_mod_elem(eb
, i
,
893 MOD_LOG_KEY_REMOVE_WHILE_FREEING
, GFP_NOFS
);
900 if (tree_mod_dont_log(fs_info
, eb
))
903 ret
= __tree_mod_log_free_eb(fs_info
, tm_list
, nritems
);
904 write_unlock(&eb
->fs_info
->tree_mod_log_lock
);
912 for (i
= 0; i
< nritems
; i
++)
920 tree_mod_log_set_root_pointer(struct btrfs_root
*root
,
921 struct extent_buffer
*new_root_node
,
925 ret
= tree_mod_log_insert_root(root
->fs_info
, root
->node
,
926 new_root_node
, log_removal
);
931 * check if the tree block can be shared by multiple trees
933 int btrfs_block_can_be_shared(struct btrfs_root
*root
,
934 struct extent_buffer
*buf
)
937 * Tree blocks not in reference counted trees and tree roots
938 * are never shared. If a block was allocated after the last
939 * snapshot and the block was not allocated by tree relocation,
940 * we know the block is not shared.
942 if (test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
943 buf
!= root
->node
&& buf
!= root
->commit_root
&&
944 (btrfs_header_generation(buf
) <=
945 btrfs_root_last_snapshot(&root
->root_item
) ||
946 btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_RELOC
)))
948 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
949 if (test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
950 btrfs_header_backref_rev(buf
) < BTRFS_MIXED_BACKREF_REV
)
956 static noinline
int update_ref_for_cow(struct btrfs_trans_handle
*trans
,
957 struct btrfs_root
*root
,
958 struct extent_buffer
*buf
,
959 struct extent_buffer
*cow
,
962 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
970 * Backrefs update rules:
972 * Always use full backrefs for extent pointers in tree block
973 * allocated by tree relocation.
975 * If a shared tree block is no longer referenced by its owner
976 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
977 * use full backrefs for extent pointers in tree block.
979 * If a tree block is been relocating
980 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
981 * use full backrefs for extent pointers in tree block.
982 * The reason for this is some operations (such as drop tree)
983 * are only allowed for blocks use full backrefs.
986 if (btrfs_block_can_be_shared(root
, buf
)) {
987 ret
= btrfs_lookup_extent_info(trans
, fs_info
, buf
->start
,
988 btrfs_header_level(buf
), 1,
994 btrfs_handle_fs_error(fs_info
, ret
, NULL
);
999 if (root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
||
1000 btrfs_header_backref_rev(buf
) < BTRFS_MIXED_BACKREF_REV
)
1001 flags
= BTRFS_BLOCK_FLAG_FULL_BACKREF
;
1006 owner
= btrfs_header_owner(buf
);
1007 BUG_ON(owner
== BTRFS_TREE_RELOC_OBJECTID
&&
1008 !(flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
));
1011 if ((owner
== root
->root_key
.objectid
||
1012 root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
) &&
1013 !(flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
)) {
1014 ret
= btrfs_inc_ref(trans
, root
, buf
, 1);
1018 if (root
->root_key
.objectid
==
1019 BTRFS_TREE_RELOC_OBJECTID
) {
1020 ret
= btrfs_dec_ref(trans
, root
, buf
, 0);
1023 ret
= btrfs_inc_ref(trans
, root
, cow
, 1);
1027 new_flags
|= BTRFS_BLOCK_FLAG_FULL_BACKREF
;
1030 if (root
->root_key
.objectid
==
1031 BTRFS_TREE_RELOC_OBJECTID
)
1032 ret
= btrfs_inc_ref(trans
, root
, cow
, 1);
1034 ret
= btrfs_inc_ref(trans
, root
, cow
, 0);
1038 if (new_flags
!= 0) {
1039 int level
= btrfs_header_level(buf
);
1041 ret
= btrfs_set_disk_extent_flags(trans
, fs_info
,
1044 new_flags
, level
, 0);
1049 if (flags
& BTRFS_BLOCK_FLAG_FULL_BACKREF
) {
1050 if (root
->root_key
.objectid
==
1051 BTRFS_TREE_RELOC_OBJECTID
)
1052 ret
= btrfs_inc_ref(trans
, root
, cow
, 1);
1054 ret
= btrfs_inc_ref(trans
, root
, cow
, 0);
1057 ret
= btrfs_dec_ref(trans
, root
, buf
, 1);
1061 clean_tree_block(fs_info
, buf
);
1067 static struct extent_buffer
*alloc_tree_block_no_bg_flush(
1068 struct btrfs_trans_handle
*trans
,
1069 struct btrfs_root
*root
,
1071 const struct btrfs_disk_key
*disk_key
,
1076 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1077 struct extent_buffer
*ret
;
1080 * If we are COWing a node/leaf from the extent, chunk, device or free
1081 * space trees, make sure that we do not finish block group creation of
1082 * pending block groups. We do this to avoid a deadlock.
1083 * COWing can result in allocation of a new chunk, and flushing pending
1084 * block groups (btrfs_create_pending_block_groups()) can be triggered
1085 * when finishing allocation of a new chunk. Creation of a pending block
1086 * group modifies the extent, chunk, device and free space trees,
1087 * therefore we could deadlock with ourselves since we are holding a
1088 * lock on an extent buffer that btrfs_create_pending_block_groups() may
1090 * For similar reasons, we also need to delay flushing pending block
1091 * groups when splitting a leaf or node, from one of those trees, since
1092 * we are holding a write lock on it and its parent or when inserting a
1093 * new root node for one of those trees.
1095 if (root
== fs_info
->extent_root
||
1096 root
== fs_info
->chunk_root
||
1097 root
== fs_info
->dev_root
||
1098 root
== fs_info
->free_space_root
)
1099 trans
->can_flush_pending_bgs
= false;
1101 ret
= btrfs_alloc_tree_block(trans
, root
, parent_start
,
1102 root
->root_key
.objectid
, disk_key
, level
,
1104 trans
->can_flush_pending_bgs
= true;
1110 * does the dirty work in cow of a single block. The parent block (if
1111 * supplied) is updated to point to the new cow copy. The new buffer is marked
1112 * dirty and returned locked. If you modify the block it needs to be marked
1115 * search_start -- an allocation hint for the new block
1117 * empty_size -- a hint that you plan on doing more cow. This is the size in
1118 * bytes the allocator should try to find free next to the block it returns.
1119 * This is just a hint and may be ignored by the allocator.
1121 static noinline
int __btrfs_cow_block(struct btrfs_trans_handle
*trans
,
1122 struct btrfs_root
*root
,
1123 struct extent_buffer
*buf
,
1124 struct extent_buffer
*parent
, int parent_slot
,
1125 struct extent_buffer
**cow_ret
,
1126 u64 search_start
, u64 empty_size
)
1128 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1129 struct btrfs_disk_key disk_key
;
1130 struct extent_buffer
*cow
;
1133 int unlock_orig
= 0;
1134 u64 parent_start
= 0;
1136 if (*cow_ret
== buf
)
1139 btrfs_assert_tree_locked(buf
);
1141 WARN_ON(test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
1142 trans
->transid
!= fs_info
->running_transaction
->transid
);
1143 WARN_ON(test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
) &&
1144 trans
->transid
!= root
->last_trans
);
1146 level
= btrfs_header_level(buf
);
1149 btrfs_item_key(buf
, &disk_key
, 0);
1151 btrfs_node_key(buf
, &disk_key
, 0);
1153 if ((root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
) && parent
)
1154 parent_start
= parent
->start
;
1156 cow
= alloc_tree_block_no_bg_flush(trans
, root
, parent_start
, &disk_key
,
1157 level
, search_start
, empty_size
);
1159 return PTR_ERR(cow
);
1161 /* cow is set to blocking by btrfs_init_new_buffer */
1163 copy_extent_buffer_full(cow
, buf
);
1164 btrfs_set_header_bytenr(cow
, cow
->start
);
1165 btrfs_set_header_generation(cow
, trans
->transid
);
1166 btrfs_set_header_backref_rev(cow
, BTRFS_MIXED_BACKREF_REV
);
1167 btrfs_clear_header_flag(cow
, BTRFS_HEADER_FLAG_WRITTEN
|
1168 BTRFS_HEADER_FLAG_RELOC
);
1169 if (root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
)
1170 btrfs_set_header_flag(cow
, BTRFS_HEADER_FLAG_RELOC
);
1172 btrfs_set_header_owner(cow
, root
->root_key
.objectid
);
1174 write_extent_buffer_fsid(cow
, fs_info
->fsid
);
1176 ret
= update_ref_for_cow(trans
, root
, buf
, cow
, &last_ref
);
1178 btrfs_abort_transaction(trans
, ret
);
1182 if (test_bit(BTRFS_ROOT_REF_COWS
, &root
->state
)) {
1183 ret
= btrfs_reloc_cow_block(trans
, root
, buf
, cow
);
1185 btrfs_abort_transaction(trans
, ret
);
1190 if (buf
== root
->node
) {
1191 WARN_ON(parent
&& parent
!= buf
);
1192 if (root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
||
1193 btrfs_header_backref_rev(buf
) < BTRFS_MIXED_BACKREF_REV
)
1194 parent_start
= buf
->start
;
1196 extent_buffer_get(cow
);
1197 tree_mod_log_set_root_pointer(root
, cow
, 1);
1198 rcu_assign_pointer(root
->node
, cow
);
1200 btrfs_free_tree_block(trans
, root
, buf
, parent_start
,
1202 free_extent_buffer(buf
);
1203 add_root_to_dirty_list(root
);
1205 WARN_ON(trans
->transid
!= btrfs_header_generation(parent
));
1206 tree_mod_log_insert_key(fs_info
, parent
, parent_slot
,
1207 MOD_LOG_KEY_REPLACE
, GFP_NOFS
);
1208 btrfs_set_node_blockptr(parent
, parent_slot
,
1210 btrfs_set_node_ptr_generation(parent
, parent_slot
,
1212 btrfs_mark_buffer_dirty(parent
);
1214 ret
= tree_mod_log_free_eb(fs_info
, buf
);
1216 btrfs_abort_transaction(trans
, ret
);
1220 btrfs_free_tree_block(trans
, root
, buf
, parent_start
,
1224 btrfs_tree_unlock(buf
);
1225 free_extent_buffer_stale(buf
);
1226 btrfs_mark_buffer_dirty(cow
);
1232 * returns the logical address of the oldest predecessor of the given root.
1233 * entries older than time_seq are ignored.
1235 static struct tree_mod_elem
*
1236 __tree_mod_log_oldest_root(struct btrfs_fs_info
*fs_info
,
1237 struct extent_buffer
*eb_root
, u64 time_seq
)
1239 struct tree_mod_elem
*tm
;
1240 struct tree_mod_elem
*found
= NULL
;
1241 u64 root_logical
= eb_root
->start
;
1248 * the very last operation that's logged for a root is the
1249 * replacement operation (if it is replaced at all). this has
1250 * the logical address of the *new* root, making it the very
1251 * first operation that's logged for this root.
1254 tm
= tree_mod_log_search_oldest(fs_info
, root_logical
,
1259 * if there are no tree operation for the oldest root, we simply
1260 * return it. this should only happen if that (old) root is at
1267 * if there's an operation that's not a root replacement, we
1268 * found the oldest version of our root. normally, we'll find a
1269 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1271 if (tm
->op
!= MOD_LOG_ROOT_REPLACE
)
1275 root_logical
= tm
->old_root
.logical
;
1279 /* if there's no old root to return, return what we found instead */
1287 * tm is a pointer to the first operation to rewind within eb. then, all
1288 * previous operations will be rewound (until we reach something older than
1292 __tree_mod_log_rewind(struct btrfs_fs_info
*fs_info
, struct extent_buffer
*eb
,
1293 u64 time_seq
, struct tree_mod_elem
*first_tm
)
1296 struct rb_node
*next
;
1297 struct tree_mod_elem
*tm
= first_tm
;
1298 unsigned long o_dst
;
1299 unsigned long o_src
;
1300 unsigned long p_size
= sizeof(struct btrfs_key_ptr
);
1302 n
= btrfs_header_nritems(eb
);
1303 read_lock(&fs_info
->tree_mod_log_lock
);
1304 while (tm
&& tm
->seq
>= time_seq
) {
1306 * all the operations are recorded with the operator used for
1307 * the modification. as we're going backwards, we do the
1308 * opposite of each operation here.
1311 case MOD_LOG_KEY_REMOVE_WHILE_FREEING
:
1312 BUG_ON(tm
->slot
< n
);
1314 case MOD_LOG_KEY_REMOVE_WHILE_MOVING
:
1315 case MOD_LOG_KEY_REMOVE
:
1316 btrfs_set_node_key(eb
, &tm
->key
, tm
->slot
);
1317 btrfs_set_node_blockptr(eb
, tm
->slot
, tm
->blockptr
);
1318 btrfs_set_node_ptr_generation(eb
, tm
->slot
,
1322 case MOD_LOG_KEY_REPLACE
:
1323 BUG_ON(tm
->slot
>= n
);
1324 btrfs_set_node_key(eb
, &tm
->key
, tm
->slot
);
1325 btrfs_set_node_blockptr(eb
, tm
->slot
, tm
->blockptr
);
1326 btrfs_set_node_ptr_generation(eb
, tm
->slot
,
1329 case MOD_LOG_KEY_ADD
:
1330 /* if a move operation is needed it's in the log */
1333 case MOD_LOG_MOVE_KEYS
:
1334 o_dst
= btrfs_node_key_ptr_offset(tm
->slot
);
1335 o_src
= btrfs_node_key_ptr_offset(tm
->move
.dst_slot
);
1336 memmove_extent_buffer(eb
, o_dst
, o_src
,
1337 tm
->move
.nr_items
* p_size
);
1339 case MOD_LOG_ROOT_REPLACE
:
1341 * this operation is special. for roots, this must be
1342 * handled explicitly before rewinding.
1343 * for non-roots, this operation may exist if the node
1344 * was a root: root A -> child B; then A gets empty and
1345 * B is promoted to the new root. in the mod log, we'll
1346 * have a root-replace operation for B, a tree block
1347 * that is no root. we simply ignore that operation.
1351 next
= rb_next(&tm
->node
);
1354 tm
= rb_entry(next
, struct tree_mod_elem
, node
);
1355 if (tm
->logical
!= first_tm
->logical
)
1358 read_unlock(&fs_info
->tree_mod_log_lock
);
1359 btrfs_set_header_nritems(eb
, n
);
1363 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1364 * is returned. If rewind operations happen, a fresh buffer is returned. The
1365 * returned buffer is always read-locked. If the returned buffer is not the
1366 * input buffer, the lock on the input buffer is released and the input buffer
1367 * is freed (its refcount is decremented).
1369 static struct extent_buffer
*
1370 tree_mod_log_rewind(struct btrfs_fs_info
*fs_info
, struct btrfs_path
*path
,
1371 struct extent_buffer
*eb
, u64 time_seq
)
1373 struct extent_buffer
*eb_rewin
;
1374 struct tree_mod_elem
*tm
;
1379 if (btrfs_header_level(eb
) == 0)
1382 tm
= tree_mod_log_search(fs_info
, eb
->start
, time_seq
);
1386 btrfs_set_path_blocking(path
);
1387 btrfs_set_lock_blocking_rw(eb
, BTRFS_READ_LOCK
);
1389 if (tm
->op
== MOD_LOG_KEY_REMOVE_WHILE_FREEING
) {
1390 BUG_ON(tm
->slot
!= 0);
1391 eb_rewin
= alloc_dummy_extent_buffer(fs_info
, eb
->start
);
1393 btrfs_tree_read_unlock_blocking(eb
);
1394 free_extent_buffer(eb
);
1397 btrfs_set_header_bytenr(eb_rewin
, eb
->start
);
1398 btrfs_set_header_backref_rev(eb_rewin
,
1399 btrfs_header_backref_rev(eb
));
1400 btrfs_set_header_owner(eb_rewin
, btrfs_header_owner(eb
));
1401 btrfs_set_header_level(eb_rewin
, btrfs_header_level(eb
));
1403 eb_rewin
= btrfs_clone_extent_buffer(eb
);
1405 btrfs_tree_read_unlock_blocking(eb
);
1406 free_extent_buffer(eb
);
1411 btrfs_clear_path_blocking(path
, NULL
, BTRFS_READ_LOCK
);
1412 btrfs_tree_read_unlock_blocking(eb
);
1413 free_extent_buffer(eb
);
1415 extent_buffer_get(eb_rewin
);
1416 btrfs_tree_read_lock(eb_rewin
);
1417 __tree_mod_log_rewind(fs_info
, eb_rewin
, time_seq
, tm
);
1418 WARN_ON(btrfs_header_nritems(eb_rewin
) >
1419 BTRFS_NODEPTRS_PER_BLOCK(fs_info
));
1425 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1426 * value. If there are no changes, the current root->root_node is returned. If
1427 * anything changed in between, there's a fresh buffer allocated on which the
1428 * rewind operations are done. In any case, the returned buffer is read locked.
1429 * Returns NULL on error (with no locks held).
1431 static inline struct extent_buffer
*
1432 get_old_root(struct btrfs_root
*root
, u64 time_seq
)
1434 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1435 struct tree_mod_elem
*tm
;
1436 struct extent_buffer
*eb
= NULL
;
1437 struct extent_buffer
*eb_root
;
1438 u64 eb_root_owner
= 0;
1439 struct extent_buffer
*old
;
1440 struct tree_mod_root
*old_root
= NULL
;
1441 u64 old_generation
= 0;
1444 eb_root
= btrfs_read_lock_root_node(root
);
1445 tm
= __tree_mod_log_oldest_root(fs_info
, eb_root
, time_seq
);
1449 if (tm
->op
== MOD_LOG_ROOT_REPLACE
) {
1450 old_root
= &tm
->old_root
;
1451 old_generation
= tm
->generation
;
1452 logical
= old_root
->logical
;
1454 logical
= eb_root
->start
;
1457 tm
= tree_mod_log_search(fs_info
, logical
, time_seq
);
1458 if (old_root
&& tm
&& tm
->op
!= MOD_LOG_KEY_REMOVE_WHILE_FREEING
) {
1459 btrfs_tree_read_unlock(eb_root
);
1460 free_extent_buffer(eb_root
);
1461 old
= read_tree_block(fs_info
, logical
, 0);
1462 if (WARN_ON(IS_ERR(old
) || !extent_buffer_uptodate(old
))) {
1464 free_extent_buffer(old
);
1466 "failed to read tree block %llu from get_old_root",
1469 eb
= btrfs_clone_extent_buffer(old
);
1470 free_extent_buffer(old
);
1472 } else if (old_root
) {
1473 eb_root_owner
= btrfs_header_owner(eb_root
);
1474 btrfs_tree_read_unlock(eb_root
);
1475 free_extent_buffer(eb_root
);
1476 eb
= alloc_dummy_extent_buffer(fs_info
, logical
);
1478 btrfs_set_lock_blocking_rw(eb_root
, BTRFS_READ_LOCK
);
1479 eb
= btrfs_clone_extent_buffer(eb_root
);
1480 btrfs_tree_read_unlock_blocking(eb_root
);
1481 free_extent_buffer(eb_root
);
1486 extent_buffer_get(eb
);
1487 btrfs_tree_read_lock(eb
);
1489 btrfs_set_header_bytenr(eb
, eb
->start
);
1490 btrfs_set_header_backref_rev(eb
, BTRFS_MIXED_BACKREF_REV
);
1491 btrfs_set_header_owner(eb
, eb_root_owner
);
1492 btrfs_set_header_level(eb
, old_root
->level
);
1493 btrfs_set_header_generation(eb
, old_generation
);
1496 __tree_mod_log_rewind(fs_info
, eb
, time_seq
, tm
);
1498 WARN_ON(btrfs_header_level(eb
) != 0);
1499 WARN_ON(btrfs_header_nritems(eb
) > BTRFS_NODEPTRS_PER_BLOCK(fs_info
));
1504 int btrfs_old_root_level(struct btrfs_root
*root
, u64 time_seq
)
1506 struct tree_mod_elem
*tm
;
1508 struct extent_buffer
*eb_root
= btrfs_root_node(root
);
1510 tm
= __tree_mod_log_oldest_root(root
->fs_info
, eb_root
, time_seq
);
1511 if (tm
&& tm
->op
== MOD_LOG_ROOT_REPLACE
) {
1512 level
= tm
->old_root
.level
;
1514 level
= btrfs_header_level(eb_root
);
1516 free_extent_buffer(eb_root
);
1521 static inline int should_cow_block(struct btrfs_trans_handle
*trans
,
1522 struct btrfs_root
*root
,
1523 struct extent_buffer
*buf
)
1525 if (btrfs_is_testing(root
->fs_info
))
1528 /* ensure we can see the force_cow */
1532 * We do not need to cow a block if
1533 * 1) this block is not created or changed in this transaction;
1534 * 2) this block does not belong to TREE_RELOC tree;
1535 * 3) the root is not forced COW.
1537 * What is forced COW:
1538 * when we create snapshot during committing the transaction,
1539 * after we've finished coping src root, we must COW the shared
1540 * block to ensure the metadata consistency.
1542 if (btrfs_header_generation(buf
) == trans
->transid
&&
1543 !btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_WRITTEN
) &&
1544 !(root
->root_key
.objectid
!= BTRFS_TREE_RELOC_OBJECTID
&&
1545 btrfs_header_flag(buf
, BTRFS_HEADER_FLAG_RELOC
)) &&
1546 !test_bit(BTRFS_ROOT_FORCE_COW
, &root
->state
))
1552 * cows a single block, see __btrfs_cow_block for the real work.
1553 * This version of it has extra checks so that a block isn't COWed more than
1554 * once per transaction, as long as it hasn't been written yet
1556 noinline
int btrfs_cow_block(struct btrfs_trans_handle
*trans
,
1557 struct btrfs_root
*root
, struct extent_buffer
*buf
,
1558 struct extent_buffer
*parent
, int parent_slot
,
1559 struct extent_buffer
**cow_ret
)
1561 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1565 if (trans
->transaction
!= fs_info
->running_transaction
)
1566 WARN(1, KERN_CRIT
"trans %llu running %llu\n",
1568 fs_info
->running_transaction
->transid
);
1570 if (trans
->transid
!= fs_info
->generation
)
1571 WARN(1, KERN_CRIT
"trans %llu running %llu\n",
1572 trans
->transid
, fs_info
->generation
);
1574 if (!should_cow_block(trans
, root
, buf
)) {
1575 trans
->dirty
= true;
1580 search_start
= buf
->start
& ~((u64
)SZ_1G
- 1);
1583 btrfs_set_lock_blocking(parent
);
1584 btrfs_set_lock_blocking(buf
);
1586 ret
= __btrfs_cow_block(trans
, root
, buf
, parent
,
1587 parent_slot
, cow_ret
, search_start
, 0);
1589 trace_btrfs_cow_block(root
, buf
, *cow_ret
);
1595 * helper function for defrag to decide if two blocks pointed to by a
1596 * node are actually close by
1598 static int close_blocks(u64 blocknr
, u64 other
, u32 blocksize
)
1600 if (blocknr
< other
&& other
- (blocknr
+ blocksize
) < 32768)
1602 if (blocknr
> other
&& blocknr
- (other
+ blocksize
) < 32768)
1608 * compare two keys in a memcmp fashion
1610 static int comp_keys(const struct btrfs_disk_key
*disk
,
1611 const struct btrfs_key
*k2
)
1613 struct btrfs_key k1
;
1615 btrfs_disk_key_to_cpu(&k1
, disk
);
1617 return btrfs_comp_cpu_keys(&k1
, k2
);
1621 * same as comp_keys only with two btrfs_key's
1623 int btrfs_comp_cpu_keys(const struct btrfs_key
*k1
, const struct btrfs_key
*k2
)
1625 if (k1
->objectid
> k2
->objectid
)
1627 if (k1
->objectid
< k2
->objectid
)
1629 if (k1
->type
> k2
->type
)
1631 if (k1
->type
< k2
->type
)
1633 if (k1
->offset
> k2
->offset
)
1635 if (k1
->offset
< k2
->offset
)
1641 * this is used by the defrag code to go through all the
1642 * leaves pointed to by a node and reallocate them so that
1643 * disk order is close to key order
1645 int btrfs_realloc_node(struct btrfs_trans_handle
*trans
,
1646 struct btrfs_root
*root
, struct extent_buffer
*parent
,
1647 int start_slot
, u64
*last_ret
,
1648 struct btrfs_key
*progress
)
1650 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1651 struct extent_buffer
*cur
;
1654 u64 search_start
= *last_ret
;
1664 int progress_passed
= 0;
1665 struct btrfs_disk_key disk_key
;
1667 parent_level
= btrfs_header_level(parent
);
1669 WARN_ON(trans
->transaction
!= fs_info
->running_transaction
);
1670 WARN_ON(trans
->transid
!= fs_info
->generation
);
1672 parent_nritems
= btrfs_header_nritems(parent
);
1673 blocksize
= fs_info
->nodesize
;
1674 end_slot
= parent_nritems
- 1;
1676 if (parent_nritems
<= 1)
1679 btrfs_set_lock_blocking(parent
);
1681 for (i
= start_slot
; i
<= end_slot
; i
++) {
1684 btrfs_node_key(parent
, &disk_key
, i
);
1685 if (!progress_passed
&& comp_keys(&disk_key
, progress
) < 0)
1688 progress_passed
= 1;
1689 blocknr
= btrfs_node_blockptr(parent
, i
);
1690 gen
= btrfs_node_ptr_generation(parent
, i
);
1691 if (last_block
== 0)
1692 last_block
= blocknr
;
1695 other
= btrfs_node_blockptr(parent
, i
- 1);
1696 close
= close_blocks(blocknr
, other
, blocksize
);
1698 if (!close
&& i
< end_slot
) {
1699 other
= btrfs_node_blockptr(parent
, i
+ 1);
1700 close
= close_blocks(blocknr
, other
, blocksize
);
1703 last_block
= blocknr
;
1707 cur
= find_extent_buffer(fs_info
, blocknr
);
1709 uptodate
= btrfs_buffer_uptodate(cur
, gen
, 0);
1712 if (!cur
|| !uptodate
) {
1714 cur
= read_tree_block(fs_info
, blocknr
, gen
);
1716 return PTR_ERR(cur
);
1717 } else if (!extent_buffer_uptodate(cur
)) {
1718 free_extent_buffer(cur
);
1721 } else if (!uptodate
) {
1722 err
= btrfs_read_buffer(cur
, gen
);
1724 free_extent_buffer(cur
);
1729 if (search_start
== 0)
1730 search_start
= last_block
;
1732 btrfs_tree_lock(cur
);
1733 btrfs_set_lock_blocking(cur
);
1734 err
= __btrfs_cow_block(trans
, root
, cur
, parent
, i
,
1737 (end_slot
- i
) * blocksize
));
1739 btrfs_tree_unlock(cur
);
1740 free_extent_buffer(cur
);
1743 search_start
= cur
->start
;
1744 last_block
= cur
->start
;
1745 *last_ret
= search_start
;
1746 btrfs_tree_unlock(cur
);
1747 free_extent_buffer(cur
);
1753 * search for key in the extent_buffer. The items start at offset p,
1754 * and they are item_size apart. There are 'max' items in p.
1756 * the slot in the array is returned via slot, and it points to
1757 * the place where you would insert key if it is not found in
1760 * slot may point to max if the key is bigger than all of the keys
1762 static noinline
int generic_bin_search(struct extent_buffer
*eb
,
1763 unsigned long p
, int item_size
,
1764 const struct btrfs_key
*key
,
1771 struct btrfs_disk_key
*tmp
= NULL
;
1772 struct btrfs_disk_key unaligned
;
1773 unsigned long offset
;
1775 unsigned long map_start
= 0;
1776 unsigned long map_len
= 0;
1780 btrfs_err(eb
->fs_info
,
1781 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1782 __func__
, low
, high
, eb
->start
,
1783 btrfs_header_owner(eb
), btrfs_header_level(eb
));
1787 while (low
< high
) {
1788 mid
= (low
+ high
) / 2;
1789 offset
= p
+ mid
* item_size
;
1791 if (!kaddr
|| offset
< map_start
||
1792 (offset
+ sizeof(struct btrfs_disk_key
)) >
1793 map_start
+ map_len
) {
1795 err
= map_private_extent_buffer(eb
, offset
,
1796 sizeof(struct btrfs_disk_key
),
1797 &kaddr
, &map_start
, &map_len
);
1800 tmp
= (struct btrfs_disk_key
*)(kaddr
+ offset
-
1802 } else if (err
== 1) {
1803 read_extent_buffer(eb
, &unaligned
,
1804 offset
, sizeof(unaligned
));
1811 tmp
= (struct btrfs_disk_key
*)(kaddr
+ offset
-
1814 ret
= comp_keys(tmp
, key
);
1830 * simple bin_search frontend that does the right thing for
1833 static int bin_search(struct extent_buffer
*eb
, const struct btrfs_key
*key
,
1834 int level
, int *slot
)
1837 return generic_bin_search(eb
,
1838 offsetof(struct btrfs_leaf
, items
),
1839 sizeof(struct btrfs_item
),
1840 key
, btrfs_header_nritems(eb
),
1843 return generic_bin_search(eb
,
1844 offsetof(struct btrfs_node
, ptrs
),
1845 sizeof(struct btrfs_key_ptr
),
1846 key
, btrfs_header_nritems(eb
),
/*
 * Public wrapper around bin_search(); see generic_bin_search() for the
 * return/slot semantics.
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1856 static void root_add_used(struct btrfs_root
*root
, u32 size
)
1858 spin_lock(&root
->accounting_lock
);
1859 btrfs_set_root_used(&root
->root_item
,
1860 btrfs_root_used(&root
->root_item
) + size
);
1861 spin_unlock(&root
->accounting_lock
);
1864 static void root_sub_used(struct btrfs_root
*root
, u32 size
)
1866 spin_lock(&root
->accounting_lock
);
1867 btrfs_set_root_used(&root
->root_item
,
1868 btrfs_root_used(&root
->root_item
) - size
);
1869 spin_unlock(&root
->accounting_lock
);
1872 /* given a node and slot number, this reads the blocks it points to. The
1873 * extent buffer is returned with a reference taken (but unlocked).
1875 static noinline
struct extent_buffer
*
1876 read_node_slot(struct btrfs_fs_info
*fs_info
, struct extent_buffer
*parent
,
1879 int level
= btrfs_header_level(parent
);
1880 struct extent_buffer
*eb
;
1882 if (slot
< 0 || slot
>= btrfs_header_nritems(parent
))
1883 return ERR_PTR(-ENOENT
);
1887 eb
= read_tree_block(fs_info
, btrfs_node_blockptr(parent
, slot
),
1888 btrfs_node_ptr_generation(parent
, slot
));
1889 if (!IS_ERR(eb
) && !extent_buffer_uptodate(eb
)) {
1890 free_extent_buffer(eb
);
1898 * node level balancing, used to make sure nodes are in proper order for
1899 * item deletion. We balance from the top down, so we have to make sure
1900 * that a deletion won't leave an node completely empty later on.
1902 static noinline
int balance_level(struct btrfs_trans_handle
*trans
,
1903 struct btrfs_root
*root
,
1904 struct btrfs_path
*path
, int level
)
1906 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
1907 struct extent_buffer
*right
= NULL
;
1908 struct extent_buffer
*mid
;
1909 struct extent_buffer
*left
= NULL
;
1910 struct extent_buffer
*parent
= NULL
;
1914 int orig_slot
= path
->slots
[level
];
1920 mid
= path
->nodes
[level
];
1922 WARN_ON(path
->locks
[level
] != BTRFS_WRITE_LOCK
&&
1923 path
->locks
[level
] != BTRFS_WRITE_LOCK_BLOCKING
);
1924 WARN_ON(btrfs_header_generation(mid
) != trans
->transid
);
1926 orig_ptr
= btrfs_node_blockptr(mid
, orig_slot
);
1928 if (level
< BTRFS_MAX_LEVEL
- 1) {
1929 parent
= path
->nodes
[level
+ 1];
1930 pslot
= path
->slots
[level
+ 1];
1934 * deal with the case where there is only one pointer in the root
1935 * by promoting the node below to a root
1938 struct extent_buffer
*child
;
1940 if (btrfs_header_nritems(mid
) != 1)
1943 /* promote the child to a root */
1944 child
= read_node_slot(fs_info
, mid
, 0);
1945 if (IS_ERR(child
)) {
1946 ret
= PTR_ERR(child
);
1947 btrfs_handle_fs_error(fs_info
, ret
, NULL
);
1951 btrfs_tree_lock(child
);
1952 btrfs_set_lock_blocking(child
);
1953 ret
= btrfs_cow_block(trans
, root
, child
, mid
, 0, &child
);
1955 btrfs_tree_unlock(child
);
1956 free_extent_buffer(child
);
1960 tree_mod_log_set_root_pointer(root
, child
, 1);
1961 rcu_assign_pointer(root
->node
, child
);
1963 add_root_to_dirty_list(root
);
1964 btrfs_tree_unlock(child
);
1966 path
->locks
[level
] = 0;
1967 path
->nodes
[level
] = NULL
;
1968 clean_tree_block(fs_info
, mid
);
1969 btrfs_tree_unlock(mid
);
1970 /* once for the path */
1971 free_extent_buffer(mid
);
1973 root_sub_used(root
, mid
->len
);
1974 btrfs_free_tree_block(trans
, root
, mid
, 0, 1);
1975 /* once for the root ptr */
1976 free_extent_buffer_stale(mid
);
1979 if (btrfs_header_nritems(mid
) >
1980 BTRFS_NODEPTRS_PER_BLOCK(fs_info
) / 4)
1983 left
= read_node_slot(fs_info
, parent
, pslot
- 1);
1988 btrfs_tree_lock(left
);
1989 btrfs_set_lock_blocking(left
);
1990 wret
= btrfs_cow_block(trans
, root
, left
,
1991 parent
, pslot
- 1, &left
);
1998 right
= read_node_slot(fs_info
, parent
, pslot
+ 1);
2003 btrfs_tree_lock(right
);
2004 btrfs_set_lock_blocking(right
);
2005 wret
= btrfs_cow_block(trans
, root
, right
,
2006 parent
, pslot
+ 1, &right
);
2013 /* first, try to make some room in the middle buffer */
2015 orig_slot
+= btrfs_header_nritems(left
);
2016 wret
= push_node_left(trans
, fs_info
, left
, mid
, 1);
2022 * then try to empty the right most buffer into the middle
2025 wret
= push_node_left(trans
, fs_info
, mid
, right
, 1);
2026 if (wret
< 0 && wret
!= -ENOSPC
)
2028 if (btrfs_header_nritems(right
) == 0) {
2029 clean_tree_block(fs_info
, right
);
2030 btrfs_tree_unlock(right
);
2031 del_ptr(root
, path
, level
+ 1, pslot
+ 1);
2032 root_sub_used(root
, right
->len
);
2033 btrfs_free_tree_block(trans
, root
, right
, 0, 1);
2034 free_extent_buffer_stale(right
);
2037 struct btrfs_disk_key right_key
;
2038 btrfs_node_key(right
, &right_key
, 0);
2039 tree_mod_log_set_node_key(fs_info
, parent
,
2041 btrfs_set_node_key(parent
, &right_key
, pslot
+ 1);
2042 btrfs_mark_buffer_dirty(parent
);
2045 if (btrfs_header_nritems(mid
) == 1) {
2047 * we're not allowed to leave a node with one item in the
2048 * tree during a delete. A deletion from lower in the tree
2049 * could try to delete the only pointer in this node.
2050 * So, pull some keys from the left.
2051 * There has to be a left pointer at this point because
2052 * otherwise we would have pulled some pointers from the
2057 btrfs_handle_fs_error(fs_info
, ret
, NULL
);
2060 wret
= balance_node_right(trans
, fs_info
, mid
, left
);
2066 wret
= push_node_left(trans
, fs_info
, left
, mid
, 1);
2072 if (btrfs_header_nritems(mid
) == 0) {
2073 clean_tree_block(fs_info
, mid
);
2074 btrfs_tree_unlock(mid
);
2075 del_ptr(root
, path
, level
+ 1, pslot
);
2076 root_sub_used(root
, mid
->len
);
2077 btrfs_free_tree_block(trans
, root
, mid
, 0, 1);
2078 free_extent_buffer_stale(mid
);
2081 /* update the parent key to reflect our changes */
2082 struct btrfs_disk_key mid_key
;
2083 btrfs_node_key(mid
, &mid_key
, 0);
2084 tree_mod_log_set_node_key(fs_info
, parent
, pslot
, 0);
2085 btrfs_set_node_key(parent
, &mid_key
, pslot
);
2086 btrfs_mark_buffer_dirty(parent
);
2089 /* update the path */
2091 if (btrfs_header_nritems(left
) > orig_slot
) {
2092 extent_buffer_get(left
);
2093 /* left was locked after cow */
2094 path
->nodes
[level
] = left
;
2095 path
->slots
[level
+ 1] -= 1;
2096 path
->slots
[level
] = orig_slot
;
2098 btrfs_tree_unlock(mid
);
2099 free_extent_buffer(mid
);
2102 orig_slot
-= btrfs_header_nritems(left
);
2103 path
->slots
[level
] = orig_slot
;
2106 /* double check we haven't messed things up */
2108 btrfs_node_blockptr(path
->nodes
[level
], path
->slots
[level
]))
2112 btrfs_tree_unlock(right
);
2113 free_extent_buffer(right
);
2116 if (path
->nodes
[level
] != left
)
2117 btrfs_tree_unlock(left
);
2118 free_extent_buffer(left
);
2123 /* Node balancing for insertion. Here we only split or push nodes around
2124 * when they are completely full. This is also done top down, so we
2125 * have to be pessimistic.
2127 static noinline
int push_nodes_for_insert(struct btrfs_trans_handle
*trans
,
2128 struct btrfs_root
*root
,
2129 struct btrfs_path
*path
, int level
)
2131 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2132 struct extent_buffer
*right
= NULL
;
2133 struct extent_buffer
*mid
;
2134 struct extent_buffer
*left
= NULL
;
2135 struct extent_buffer
*parent
= NULL
;
2139 int orig_slot
= path
->slots
[level
];
2144 mid
= path
->nodes
[level
];
2145 WARN_ON(btrfs_header_generation(mid
) != trans
->transid
);
2147 if (level
< BTRFS_MAX_LEVEL
- 1) {
2148 parent
= path
->nodes
[level
+ 1];
2149 pslot
= path
->slots
[level
+ 1];
2155 left
= read_node_slot(fs_info
, parent
, pslot
- 1);
2159 /* first, try to make some room in the middle buffer */
2163 btrfs_tree_lock(left
);
2164 btrfs_set_lock_blocking(left
);
2166 left_nr
= btrfs_header_nritems(left
);
2167 if (left_nr
>= BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - 1) {
2170 ret
= btrfs_cow_block(trans
, root
, left
, parent
,
2175 wret
= push_node_left(trans
, fs_info
,
2182 struct btrfs_disk_key disk_key
;
2183 orig_slot
+= left_nr
;
2184 btrfs_node_key(mid
, &disk_key
, 0);
2185 tree_mod_log_set_node_key(fs_info
, parent
, pslot
, 0);
2186 btrfs_set_node_key(parent
, &disk_key
, pslot
);
2187 btrfs_mark_buffer_dirty(parent
);
2188 if (btrfs_header_nritems(left
) > orig_slot
) {
2189 path
->nodes
[level
] = left
;
2190 path
->slots
[level
+ 1] -= 1;
2191 path
->slots
[level
] = orig_slot
;
2192 btrfs_tree_unlock(mid
);
2193 free_extent_buffer(mid
);
2196 btrfs_header_nritems(left
);
2197 path
->slots
[level
] = orig_slot
;
2198 btrfs_tree_unlock(left
);
2199 free_extent_buffer(left
);
2203 btrfs_tree_unlock(left
);
2204 free_extent_buffer(left
);
2206 right
= read_node_slot(fs_info
, parent
, pslot
+ 1);
2211 * then try to empty the right most buffer into the middle
2216 btrfs_tree_lock(right
);
2217 btrfs_set_lock_blocking(right
);
2219 right_nr
= btrfs_header_nritems(right
);
2220 if (right_nr
>= BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - 1) {
2223 ret
= btrfs_cow_block(trans
, root
, right
,
2229 wret
= balance_node_right(trans
, fs_info
,
2236 struct btrfs_disk_key disk_key
;
2238 btrfs_node_key(right
, &disk_key
, 0);
2239 tree_mod_log_set_node_key(fs_info
, parent
,
2241 btrfs_set_node_key(parent
, &disk_key
, pslot
+ 1);
2242 btrfs_mark_buffer_dirty(parent
);
2244 if (btrfs_header_nritems(mid
) <= orig_slot
) {
2245 path
->nodes
[level
] = right
;
2246 path
->slots
[level
+ 1] += 1;
2247 path
->slots
[level
] = orig_slot
-
2248 btrfs_header_nritems(mid
);
2249 btrfs_tree_unlock(mid
);
2250 free_extent_buffer(mid
);
2252 btrfs_tree_unlock(right
);
2253 free_extent_buffer(right
);
2257 btrfs_tree_unlock(right
);
2258 free_extent_buffer(right
);
2264 * readahead one full node of leaves, finding things that are close
2265 * to the block in 'slot', and triggering ra on them.
2267 static void reada_for_search(struct btrfs_fs_info
*fs_info
,
2268 struct btrfs_path
*path
,
2269 int level
, int slot
, u64 objectid
)
2271 struct extent_buffer
*node
;
2272 struct btrfs_disk_key disk_key
;
2277 struct extent_buffer
*eb
;
2285 if (!path
->nodes
[level
])
2288 node
= path
->nodes
[level
];
2290 search
= btrfs_node_blockptr(node
, slot
);
2291 blocksize
= fs_info
->nodesize
;
2292 eb
= find_extent_buffer(fs_info
, search
);
2294 free_extent_buffer(eb
);
2300 nritems
= btrfs_header_nritems(node
);
2304 if (path
->reada
== READA_BACK
) {
2308 } else if (path
->reada
== READA_FORWARD
) {
2313 if (path
->reada
== READA_BACK
&& objectid
) {
2314 btrfs_node_key(node
, &disk_key
, nr
);
2315 if (btrfs_disk_key_objectid(&disk_key
) != objectid
)
2318 search
= btrfs_node_blockptr(node
, nr
);
2319 if ((search
<= target
&& target
- search
<= 65536) ||
2320 (search
> target
&& search
- target
<= 65536)) {
2321 readahead_tree_block(fs_info
, search
);
2325 if ((nread
> 65536 || nscan
> 32))
2330 static noinline
void reada_for_balance(struct btrfs_fs_info
*fs_info
,
2331 struct btrfs_path
*path
, int level
)
2335 struct extent_buffer
*parent
;
2336 struct extent_buffer
*eb
;
2341 parent
= path
->nodes
[level
+ 1];
2345 nritems
= btrfs_header_nritems(parent
);
2346 slot
= path
->slots
[level
+ 1];
2349 block1
= btrfs_node_blockptr(parent
, slot
- 1);
2350 gen
= btrfs_node_ptr_generation(parent
, slot
- 1);
2351 eb
= find_extent_buffer(fs_info
, block1
);
2353 * if we get -eagain from btrfs_buffer_uptodate, we
2354 * don't want to return eagain here. That will loop
2357 if (eb
&& btrfs_buffer_uptodate(eb
, gen
, 1) != 0)
2359 free_extent_buffer(eb
);
2361 if (slot
+ 1 < nritems
) {
2362 block2
= btrfs_node_blockptr(parent
, slot
+ 1);
2363 gen
= btrfs_node_ptr_generation(parent
, slot
+ 1);
2364 eb
= find_extent_buffer(fs_info
, block2
);
2365 if (eb
&& btrfs_buffer_uptodate(eb
, gen
, 1) != 0)
2367 free_extent_buffer(eb
);
2371 readahead_tree_block(fs_info
, block1
);
2373 readahead_tree_block(fs_info
, block2
);
2378 * when we walk down the tree, it is usually safe to unlock the higher layers
2379 * in the tree. The exceptions are when our path goes through slot 0, because
2380 * operations on the tree might require changing key pointers higher up in the
2383 * callers might also have set path->keep_locks, which tells this code to keep
2384 * the lock if the path points to the last slot in the block. This is part of
2385 * walking through the tree, and selecting the next slot in the higher block.
2387 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2388 * if lowest_unlock is 1, level 0 won't be unlocked
2390 static noinline
void unlock_up(struct btrfs_path
*path
, int level
,
2391 int lowest_unlock
, int min_write_lock_level
,
2392 int *write_lock_level
)
2395 int skip_level
= level
;
2397 struct extent_buffer
*t
;
2399 for (i
= level
; i
< BTRFS_MAX_LEVEL
; i
++) {
2400 if (!path
->nodes
[i
])
2402 if (!path
->locks
[i
])
2404 if (!no_skips
&& path
->slots
[i
] == 0) {
2408 if (!no_skips
&& path
->keep_locks
) {
2411 nritems
= btrfs_header_nritems(t
);
2412 if (nritems
< 1 || path
->slots
[i
] >= nritems
- 1) {
2417 if (skip_level
< i
&& i
>= lowest_unlock
)
2421 if (i
>= lowest_unlock
&& i
> skip_level
&& path
->locks
[i
]) {
2422 btrfs_tree_unlock_rw(t
, path
->locks
[i
]);
2424 if (write_lock_level
&&
2425 i
> min_write_lock_level
&&
2426 i
<= *write_lock_level
) {
2427 *write_lock_level
= i
- 1;
2434 * This releases any locks held in the path starting at level and
2435 * going all the way up to the root.
2437 * btrfs_search_slot will keep the lock held on higher nodes in a few
2438 * corner cases, such as COW of the block at slot zero in the node. This
2439 * ignores those rules, and it should only be called when there are no
2440 * more updates to be done higher up in the tree.
/*
 * btrfs_unlock_up_safe: unconditionally drop every lock held on the path
 * from 'level' up to the root (see the header comment above, orig lines
 * 2434-2440). A keep_locks path is left untouched.
 *
 * NOTE(review): the 'int i;' declaration, the early 'return', the
 * per-level 'continue'/'break' statements and the lock-clearing
 * assignment are not visible in this fragment - lost in extraction.
 */
2442 noinline
void btrfs_unlock_up_safe(struct btrfs_path
*path
, int level
)
/* caller asked us to keep locks: do nothing */
2446 if (path
->keep_locks
)
2449 for (i
= level
; i
< BTRFS_MAX_LEVEL
; i
++) {
2450 if (!path
->nodes
[i
])
2452 if (!path
->locks
[i
])
/* drop whichever read/write lock is recorded for this level */
2454 btrfs_tree_unlock_rw(path
->nodes
[i
], path
->locks
[i
]);
2460 * helper function for btrfs_search_slot. The goal is to find a block
2461 * in cache without setting the path to blocking. If we find the block
2462 * we return zero and the path is unchanged.
2464 * If we can't find the block, we set the path blocking and do some
2465 * reada. -EAGAIN is returned and the search must be repeated.
2468 read_block_for_search(struct btrfs_root
*root
, struct btrfs_path
*p
,
2469 struct extent_buffer
**eb_ret
, int level
, int slot
,
2470 const struct btrfs_key
*key
)
2472 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2475 struct extent_buffer
*b
= *eb_ret
;
2476 struct extent_buffer
*tmp
;
2479 blocknr
= btrfs_node_blockptr(b
, slot
);
2480 gen
= btrfs_node_ptr_generation(b
, slot
);
2482 tmp
= find_extent_buffer(fs_info
, blocknr
);
2484 /* first we do an atomic uptodate check */
2485 if (btrfs_buffer_uptodate(tmp
, gen
, 1) > 0) {
2490 /* the pages were up to date, but we failed
2491 * the generation number check. Do a full
2492 * read for the generation number that is correct.
2493 * We must do this without dropping locks so
2494 * we can trust our generation number
2496 btrfs_set_path_blocking(p
);
2498 /* now we're allowed to do a blocking uptodate check */
2499 ret
= btrfs_read_buffer(tmp
, gen
);
2504 free_extent_buffer(tmp
);
2505 btrfs_release_path(p
);
2510 * reduce lock contention at high levels
2511 * of the btree by dropping locks before
2512 * we read. Don't release the lock on the current
2513 * level because we need to walk this node to figure
2514 * out which blocks to read.
2516 btrfs_unlock_up_safe(p
, level
+ 1);
2517 btrfs_set_path_blocking(p
);
2519 free_extent_buffer(tmp
);
2520 if (p
->reada
!= READA_NONE
)
2521 reada_for_search(fs_info
, p
, level
, slot
, key
->objectid
);
2524 tmp
= read_tree_block(fs_info
, blocknr
, gen
);
2527 * If the read above didn't mark this buffer up to date,
2528 * it will never end up being up to date. Set ret to EIO now
2529 * and give up so that our caller doesn't loop forever
2532 if (!btrfs_buffer_uptodate(tmp
, 0, 0))
2534 free_extent_buffer(tmp
);
2539 btrfs_release_path(p
);
2544 * helper function for btrfs_search_slot. This does all of the checks
2545 * for node-level blocks and does any balancing required based on
2548 * If no extra work was required, zero is returned. If we had to
2549 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2553 setup_nodes_for_search(struct btrfs_trans_handle
*trans
,
2554 struct btrfs_root
*root
, struct btrfs_path
*p
,
2555 struct extent_buffer
*b
, int level
, int ins_len
,
2556 int *write_lock_level
)
2558 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2561 if ((p
->search_for_split
|| ins_len
> 0) && btrfs_header_nritems(b
) >=
2562 BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - 3) {
2565 if (*write_lock_level
< level
+ 1) {
2566 *write_lock_level
= level
+ 1;
2567 btrfs_release_path(p
);
2571 btrfs_set_path_blocking(p
);
2572 reada_for_balance(fs_info
, p
, level
);
2573 sret
= split_node(trans
, root
, p
, level
);
2574 btrfs_clear_path_blocking(p
, NULL
, 0);
2581 b
= p
->nodes
[level
];
2582 } else if (ins_len
< 0 && btrfs_header_nritems(b
) <
2583 BTRFS_NODEPTRS_PER_BLOCK(fs_info
) / 2) {
2586 if (*write_lock_level
< level
+ 1) {
2587 *write_lock_level
= level
+ 1;
2588 btrfs_release_path(p
);
2592 btrfs_set_path_blocking(p
);
2593 reada_for_balance(fs_info
, p
, level
);
2594 sret
= balance_level(trans
, root
, p
, level
);
2595 btrfs_clear_path_blocking(p
, NULL
, 0);
2601 b
= p
->nodes
[level
];
2603 btrfs_release_path(p
);
2606 BUG_ON(btrfs_header_nritems(b
) == 1);
2616 static void key_search_validate(struct extent_buffer
*b
,
2617 const struct btrfs_key
*key
,
2620 #ifdef CONFIG_BTRFS_ASSERT
2621 struct btrfs_disk_key disk_key
;
2623 btrfs_cpu_key_to_disk(&disk_key
, key
);
2626 ASSERT(!memcmp_extent_buffer(b
, &disk_key
,
2627 offsetof(struct btrfs_leaf
, items
[0].key
),
2630 ASSERT(!memcmp_extent_buffer(b
, &disk_key
,
2631 offsetof(struct btrfs_node
, ptrs
[0].key
),
/*
 * key_search: wrapper around bin_search that caches the previous level's
 * comparison result in *prev_cmp. When *prev_cmp is already 0 a prior
 * level matched the key exactly, so the binary search is skipped and
 * key_search_validate (a CONFIG_BTRFS_ASSERT-only check) confirms slot 0
 * holds the key.
 *
 * NOTE(review): the two 'return' statements and the closing braces are
 * missing from this fragment - lost in extraction.
 */
2636 static int key_search(struct extent_buffer
*b
, const struct btrfs_key
*key
,
2637 int level
, int *prev_cmp
, int *slot
)
2639 if (*prev_cmp
!= 0) {
2640 *prev_cmp
= bin_search(b
, key
, level
, slot
);
2644 key_search_validate(b
, key
, level
);
2650 int btrfs_find_item(struct btrfs_root
*fs_root
, struct btrfs_path
*path
,
2651 u64 iobjectid
, u64 ioff
, u8 key_type
,
2652 struct btrfs_key
*found_key
)
2655 struct btrfs_key key
;
2656 struct extent_buffer
*eb
;
2661 key
.type
= key_type
;
2662 key
.objectid
= iobjectid
;
2665 ret
= btrfs_search_slot(NULL
, fs_root
, &key
, path
, 0, 0);
2669 eb
= path
->nodes
[0];
2670 if (ret
&& path
->slots
[0] >= btrfs_header_nritems(eb
)) {
2671 ret
= btrfs_next_leaf(fs_root
, path
);
2674 eb
= path
->nodes
[0];
2677 btrfs_item_key_to_cpu(eb
, found_key
, path
->slots
[0]);
2678 if (found_key
->type
!= key
.type
||
2679 found_key
->objectid
!= key
.objectid
)
2686 * look for key in the tree. path is filled in with nodes along the way
2687 * if key is found, we return zero and you can find the item in the leaf
2688 * level of the path (level 0)
2690 * If the key isn't found, the path points to the slot where it should
2691 * be inserted, and 1 is returned. If there are other errors during the
2692 * search a negative error number is returned.
2694 * if ins_len > 0, nodes and leaves will be split as we walk down the
2695 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
2698 int btrfs_search_slot(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
2699 const struct btrfs_key
*key
, struct btrfs_path
*p
,
2700 int ins_len
, int cow
)
2702 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2703 struct extent_buffer
*b
;
2708 int lowest_unlock
= 1;
2710 /* everything at write_lock_level or lower must be write locked */
2711 int write_lock_level
= 0;
2712 u8 lowest_level
= 0;
2713 int min_write_lock_level
;
2716 lowest_level
= p
->lowest_level
;
2717 WARN_ON(lowest_level
&& ins_len
> 0);
2718 WARN_ON(p
->nodes
[0] != NULL
);
2719 BUG_ON(!cow
&& ins_len
);
2724 /* when we are removing items, we might have to go up to level
2725 * two as we update tree pointers Make sure we keep write
2726 * for those levels as well
2728 write_lock_level
= 2;
2729 } else if (ins_len
> 0) {
2731 * for inserting items, make sure we have a write lock on
2732 * level 1 so we can update keys
2734 write_lock_level
= 1;
2738 write_lock_level
= -1;
2740 if (cow
&& (p
->keep_locks
|| p
->lowest_level
))
2741 write_lock_level
= BTRFS_MAX_LEVEL
;
2743 min_write_lock_level
= write_lock_level
;
2748 * we try very hard to do read locks on the root
2750 root_lock
= BTRFS_READ_LOCK
;
2752 if (p
->search_commit_root
) {
2754 * the commit roots are read only
2755 * so we always do read locks
2757 if (p
->need_commit_sem
)
2758 down_read(&fs_info
->commit_root_sem
);
2759 b
= root
->commit_root
;
2760 extent_buffer_get(b
);
2761 level
= btrfs_header_level(b
);
2762 if (p
->need_commit_sem
)
2763 up_read(&fs_info
->commit_root_sem
);
2764 if (!p
->skip_locking
)
2765 btrfs_tree_read_lock(b
);
2767 if (p
->skip_locking
) {
2768 b
= btrfs_root_node(root
);
2769 level
= btrfs_header_level(b
);
2771 /* we don't know the level of the root node
2772 * until we actually have it read locked
2774 b
= btrfs_read_lock_root_node(root
);
2775 level
= btrfs_header_level(b
);
2776 if (level
<= write_lock_level
) {
2777 /* whoops, must trade for write lock */
2778 btrfs_tree_read_unlock(b
);
2779 free_extent_buffer(b
);
2780 b
= btrfs_lock_root_node(root
);
2781 root_lock
= BTRFS_WRITE_LOCK
;
2783 /* the level might have changed, check again */
2784 level
= btrfs_header_level(b
);
2788 p
->nodes
[level
] = b
;
2789 if (!p
->skip_locking
)
2790 p
->locks
[level
] = root_lock
;
2793 level
= btrfs_header_level(b
);
2796 * setup the path here so we can release it under lock
2797 * contention with the cow code
2800 bool last_level
= (level
== (BTRFS_MAX_LEVEL
- 1));
2803 * if we don't really need to cow this block
2804 * then we don't want to set the path blocking,
2805 * so we test it here
2807 if (!should_cow_block(trans
, root
, b
)) {
2808 trans
->dirty
= true;
2813 * must have write locks on this node and the
2816 if (level
> write_lock_level
||
2817 (level
+ 1 > write_lock_level
&&
2818 level
+ 1 < BTRFS_MAX_LEVEL
&&
2819 p
->nodes
[level
+ 1])) {
2820 write_lock_level
= level
+ 1;
2821 btrfs_release_path(p
);
2825 btrfs_set_path_blocking(p
);
2827 err
= btrfs_cow_block(trans
, root
, b
, NULL
, 0,
2830 err
= btrfs_cow_block(trans
, root
, b
,
2831 p
->nodes
[level
+ 1],
2832 p
->slots
[level
+ 1], &b
);
2839 p
->nodes
[level
] = b
;
2840 btrfs_clear_path_blocking(p
, NULL
, 0);
2843 * we have a lock on b and as long as we aren't changing
2844 * the tree, there is no way to for the items in b to change.
2845 * It is safe to drop the lock on our parent before we
2846 * go through the expensive btree search on b.
2848 * If we're inserting or deleting (ins_len != 0), then we might
2849 * be changing slot zero, which may require changing the parent.
2850 * So, we can't drop the lock until after we know which slot
2851 * we're operating on.
2853 if (!ins_len
&& !p
->keep_locks
) {
2856 if (u
< BTRFS_MAX_LEVEL
&& p
->locks
[u
]) {
2857 btrfs_tree_unlock_rw(p
->nodes
[u
], p
->locks
[u
]);
2862 ret
= key_search(b
, key
, level
, &prev_cmp
, &slot
);
2868 if (ret
&& slot
> 0) {
2872 p
->slots
[level
] = slot
;
2873 err
= setup_nodes_for_search(trans
, root
, p
, b
, level
,
2874 ins_len
, &write_lock_level
);
2881 b
= p
->nodes
[level
];
2882 slot
= p
->slots
[level
];
2885 * slot 0 is special, if we change the key
2886 * we have to update the parent pointer
2887 * which means we must have a write lock
2890 if (slot
== 0 && ins_len
&&
2891 write_lock_level
< level
+ 1) {
2892 write_lock_level
= level
+ 1;
2893 btrfs_release_path(p
);
2897 unlock_up(p
, level
, lowest_unlock
,
2898 min_write_lock_level
, &write_lock_level
);
2900 if (level
== lowest_level
) {
2906 err
= read_block_for_search(root
, p
, &b
, level
,
2915 if (!p
->skip_locking
) {
2916 level
= btrfs_header_level(b
);
2917 if (level
<= write_lock_level
) {
2918 err
= btrfs_try_tree_write_lock(b
);
2920 btrfs_set_path_blocking(p
);
2922 btrfs_clear_path_blocking(p
, b
,
2925 p
->locks
[level
] = BTRFS_WRITE_LOCK
;
2927 err
= btrfs_tree_read_lock_atomic(b
);
2929 btrfs_set_path_blocking(p
);
2930 btrfs_tree_read_lock(b
);
2931 btrfs_clear_path_blocking(p
, b
,
2934 p
->locks
[level
] = BTRFS_READ_LOCK
;
2936 p
->nodes
[level
] = b
;
2939 p
->slots
[level
] = slot
;
2941 btrfs_leaf_free_space(fs_info
, b
) < ins_len
) {
2942 if (write_lock_level
< 1) {
2943 write_lock_level
= 1;
2944 btrfs_release_path(p
);
2948 btrfs_set_path_blocking(p
);
2949 err
= split_leaf(trans
, root
, key
,
2950 p
, ins_len
, ret
== 0);
2951 btrfs_clear_path_blocking(p
, NULL
, 0);
2959 if (!p
->search_for_split
)
2960 unlock_up(p
, level
, lowest_unlock
,
2961 min_write_lock_level
, &write_lock_level
);
2968 * we don't really know what they plan on doing with the path
2969 * from here on, so for now just mark it as blocking
2971 if (!p
->leave_spinning
)
2972 btrfs_set_path_blocking(p
);
2973 if (ret
< 0 && !p
->skip_release_on_error
)
2974 btrfs_release_path(p
);
2979 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2980 * current state of the tree together with the operations recorded in the tree
2981 * modification log to search for the key in a previous version of this tree, as
2982 * denoted by the time_seq parameter.
2984 * Naturally, there is no support for insert, delete or cow operations.
2986 * The resulting path and return value will be set up as if we called
2987 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2989 int btrfs_search_old_slot(struct btrfs_root
*root
, const struct btrfs_key
*key
,
2990 struct btrfs_path
*p
, u64 time_seq
)
2992 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2993 struct extent_buffer
*b
;
2998 int lowest_unlock
= 1;
2999 u8 lowest_level
= 0;
3002 lowest_level
= p
->lowest_level
;
3003 WARN_ON(p
->nodes
[0] != NULL
);
3005 if (p
->search_commit_root
) {
3007 return btrfs_search_slot(NULL
, root
, key
, p
, 0, 0);
3011 b
= get_old_root(root
, time_seq
);
3016 level
= btrfs_header_level(b
);
3017 p
->locks
[level
] = BTRFS_READ_LOCK
;
3020 level
= btrfs_header_level(b
);
3021 p
->nodes
[level
] = b
;
3022 btrfs_clear_path_blocking(p
, NULL
, 0);
3025 * we have a lock on b and as long as we aren't changing
3026 * the tree, there is no way to for the items in b to change.
3027 * It is safe to drop the lock on our parent before we
3028 * go through the expensive btree search on b.
3030 btrfs_unlock_up_safe(p
, level
+ 1);
3033 * Since we can unwind ebs we want to do a real search every
3037 ret
= key_search(b
, key
, level
, &prev_cmp
, &slot
);
3041 if (ret
&& slot
> 0) {
3045 p
->slots
[level
] = slot
;
3046 unlock_up(p
, level
, lowest_unlock
, 0, NULL
);
3048 if (level
== lowest_level
) {
3054 err
= read_block_for_search(root
, p
, &b
, level
,
3063 level
= btrfs_header_level(b
);
3064 err
= btrfs_tree_read_lock_atomic(b
);
3066 btrfs_set_path_blocking(p
);
3067 btrfs_tree_read_lock(b
);
3068 btrfs_clear_path_blocking(p
, b
,
3071 b
= tree_mod_log_rewind(fs_info
, p
, b
, time_seq
);
3076 p
->locks
[level
] = BTRFS_READ_LOCK
;
3077 p
->nodes
[level
] = b
;
3079 p
->slots
[level
] = slot
;
3080 unlock_up(p
, level
, lowest_unlock
, 0, NULL
);
3086 if (!p
->leave_spinning
)
3087 btrfs_set_path_blocking(p
);
3089 btrfs_release_path(p
);
3095 * helper to use instead of search slot if no exact match is needed but
3096 * instead the next or previous item should be returned.
3097 * When find_higher is true, the next higher item is returned, the next lower
3099 * When return_any and find_higher are both true, and no higher item is found,
3100 * return the next lower instead.
3101 * When return_any is true and find_higher is false, and no lower item is found,
3102 * return the next higher instead.
3103 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3106 int btrfs_search_slot_for_read(struct btrfs_root
*root
,
3107 const struct btrfs_key
*key
,
3108 struct btrfs_path
*p
, int find_higher
,
3112 struct extent_buffer
*leaf
;
3115 ret
= btrfs_search_slot(NULL
, root
, key
, p
, 0, 0);
3119 * a return value of 1 means the path is at the position where the
3120 * item should be inserted. Normally this is the next bigger item,
3121 * but in case the previous item is the last in a leaf, path points
3122 * to the first free slot in the previous leaf, i.e. at an invalid
3128 if (p
->slots
[0] >= btrfs_header_nritems(leaf
)) {
3129 ret
= btrfs_next_leaf(root
, p
);
3135 * no higher item found, return the next
3140 btrfs_release_path(p
);
3144 if (p
->slots
[0] == 0) {
3145 ret
= btrfs_prev_leaf(root
, p
);
3150 if (p
->slots
[0] == btrfs_header_nritems(leaf
))
3157 * no lower item found, return the next
3162 btrfs_release_path(p
);
3172 * adjust the pointers going up the tree, starting at level
3173 * making sure the right key of each node is points to 'key'.
3174 * This is used after shifting pointers to the left, so it stops
3175 * fixing up pointers when a given leaf/node is not in slot 0 of the
/*
 * fixup_low_keys: propagate 'key' into the slot-0 pointer of each parent
 * node from 'level' upward; stops once a level is reached whose slot is
 * not 0 (see the header comment above, orig lines 3172-3177).
 *
 * NOTE(review): the 'int i;' declaration, the 't = path->nodes[i]'
 * assignment ('t' is used but never assigned in what is visible here),
 * the 'break' statements and the non-zero-tslot early exit are missing
 * from this fragment - lost in extraction.
 */
3179 static void fixup_low_keys(struct btrfs_fs_info
*fs_info
,
3180 struct btrfs_path
*path
,
3181 struct btrfs_disk_key
*key
, int level
)
3184 struct extent_buffer
*t
;
3186 for (i
= level
; i
< BTRFS_MAX_LEVEL
; i
++) {
3187 int tslot
= path
->slots
[i
];
3188 if (!path
->nodes
[i
])
/* record the key change in the tree mod log, then apply it */
3191 tree_mod_log_set_node_key(fs_info
, t
, tslot
, 1);
3192 btrfs_set_node_key(t
, key
, tslot
);
3193 btrfs_mark_buffer_dirty(path
->nodes
[i
]);
3202 * This function isn't completely safe. It's the caller's responsibility
3203 * that the new key won't break the order
/*
 * btrfs_set_item_key_safe: overwrite the key of the item at path slot 0
 * with *new_key, BUG_ON-asserting that the new key still sorts strictly
 * between its left and right neighbors, then fix up parent pointers via
 * fixup_low_keys. As the header comment above says, the caller is
 * responsible for not breaking key order.
 *
 * NOTE(review): the 'int slot;' declaration is not visible, and upstream
 * guards the 'slot - 1' neighbor read with 'if (slot > 0)' - that guard
 * line appears to have been lost in extraction; confirm against the
 * original file before relying on this text.
 */
3205 void btrfs_set_item_key_safe(struct btrfs_fs_info
*fs_info
,
3206 struct btrfs_path
*path
,
3207 const struct btrfs_key
*new_key
)
3209 struct btrfs_disk_key disk_key
;
3210 struct extent_buffer
*eb
;
3213 eb
= path
->nodes
[0];
3214 slot
= path
->slots
[0];
/* new key must sort strictly after the previous item's key */
3216 btrfs_item_key(eb
, &disk_key
, slot
- 1);
3217 BUG_ON(comp_keys(&disk_key
, new_key
) >= 0);
/* ... and strictly before the next item's key, if one exists */
3219 if (slot
< btrfs_header_nritems(eb
) - 1) {
3220 btrfs_item_key(eb
, &disk_key
, slot
+ 1);
3221 BUG_ON(comp_keys(&disk_key
, new_key
) <= 0);
3224 btrfs_cpu_key_to_disk(&disk_key
, new_key
);
3225 btrfs_set_item_key(eb
, &disk_key
, slot
);
3226 btrfs_mark_buffer_dirty(eb
);
/* slot 0 key changed: parents' pointers must be updated too */
3228 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
3232 * try to push data from one node into the next node left in the
3235 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3236 * error, and > 0 if there was no room in the left hand block.
3238 static int push_node_left(struct btrfs_trans_handle
*trans
,
3239 struct btrfs_fs_info
*fs_info
,
3240 struct extent_buffer
*dst
,
3241 struct extent_buffer
*src
, int empty
)
3248 src_nritems
= btrfs_header_nritems(src
);
3249 dst_nritems
= btrfs_header_nritems(dst
);
3250 push_items
= BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - dst_nritems
;
3251 WARN_ON(btrfs_header_generation(src
) != trans
->transid
);
3252 WARN_ON(btrfs_header_generation(dst
) != trans
->transid
);
3254 if (!empty
&& src_nritems
<= 8)
3257 if (push_items
<= 0)
3261 push_items
= min(src_nritems
, push_items
);
3262 if (push_items
< src_nritems
) {
3263 /* leave at least 8 pointers in the node if
3264 * we aren't going to empty it
3266 if (src_nritems
- push_items
< 8) {
3267 if (push_items
<= 8)
3273 push_items
= min(src_nritems
- 8, push_items
);
3275 ret
= tree_mod_log_eb_copy(fs_info
, dst
, src
, dst_nritems
, 0,
3278 btrfs_abort_transaction(trans
, ret
);
3281 copy_extent_buffer(dst
, src
,
3282 btrfs_node_key_ptr_offset(dst_nritems
),
3283 btrfs_node_key_ptr_offset(0),
3284 push_items
* sizeof(struct btrfs_key_ptr
));
3286 if (push_items
< src_nritems
) {
3288 * don't call tree_mod_log_eb_move here, key removal was already
3289 * fully logged by tree_mod_log_eb_copy above.
3291 memmove_extent_buffer(src
, btrfs_node_key_ptr_offset(0),
3292 btrfs_node_key_ptr_offset(push_items
),
3293 (src_nritems
- push_items
) *
3294 sizeof(struct btrfs_key_ptr
));
3296 btrfs_set_header_nritems(src
, src_nritems
- push_items
);
3297 btrfs_set_header_nritems(dst
, dst_nritems
+ push_items
);
3298 btrfs_mark_buffer_dirty(src
);
3299 btrfs_mark_buffer_dirty(dst
);
3305 * try to push data from one node into the next node right in the
3308 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3309 * error, and > 0 if there was no room in the right hand block.
3311 * this will only push up to 1/2 the contents of the left node over
3313 static int balance_node_right(struct btrfs_trans_handle
*trans
,
3314 struct btrfs_fs_info
*fs_info
,
3315 struct extent_buffer
*dst
,
3316 struct extent_buffer
*src
)
3324 WARN_ON(btrfs_header_generation(src
) != trans
->transid
);
3325 WARN_ON(btrfs_header_generation(dst
) != trans
->transid
);
3327 src_nritems
= btrfs_header_nritems(src
);
3328 dst_nritems
= btrfs_header_nritems(dst
);
3329 push_items
= BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - dst_nritems
;
3330 if (push_items
<= 0)
3333 if (src_nritems
< 4)
3336 max_push
= src_nritems
/ 2 + 1;
3337 /* don't try to empty the node */
3338 if (max_push
>= src_nritems
)
3341 if (max_push
< push_items
)
3342 push_items
= max_push
;
3344 tree_mod_log_eb_move(fs_info
, dst
, push_items
, 0, dst_nritems
);
3345 memmove_extent_buffer(dst
, btrfs_node_key_ptr_offset(push_items
),
3346 btrfs_node_key_ptr_offset(0),
3348 sizeof(struct btrfs_key_ptr
));
3350 ret
= tree_mod_log_eb_copy(fs_info
, dst
, src
, 0,
3351 src_nritems
- push_items
, push_items
);
3353 btrfs_abort_transaction(trans
, ret
);
3356 copy_extent_buffer(dst
, src
,
3357 btrfs_node_key_ptr_offset(0),
3358 btrfs_node_key_ptr_offset(src_nritems
- push_items
),
3359 push_items
* sizeof(struct btrfs_key_ptr
));
3361 btrfs_set_header_nritems(src
, src_nritems
- push_items
);
3362 btrfs_set_header_nritems(dst
, dst_nritems
+ push_items
);
3364 btrfs_mark_buffer_dirty(src
);
3365 btrfs_mark_buffer_dirty(dst
);
3371 * helper function to insert a new root level in the tree.
3372 * A new node is allocated, and a single item is inserted to
3373 * point to the existing root
3375 * returns zero on success or < 0 on failure.
3377 static noinline
int insert_new_root(struct btrfs_trans_handle
*trans
,
3378 struct btrfs_root
*root
,
3379 struct btrfs_path
*path
, int level
)
3381 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
3383 struct extent_buffer
*lower
;
3384 struct extent_buffer
*c
;
3385 struct extent_buffer
*old
;
3386 struct btrfs_disk_key lower_key
;
3388 BUG_ON(path
->nodes
[level
]);
3389 BUG_ON(path
->nodes
[level
-1] != root
->node
);
3391 lower
= path
->nodes
[level
-1];
3393 btrfs_item_key(lower
, &lower_key
, 0);
3395 btrfs_node_key(lower
, &lower_key
, 0);
3397 c
= alloc_tree_block_no_bg_flush(trans
, root
, 0, &lower_key
, level
,
3398 root
->node
->start
, 0);
3402 root_add_used(root
, fs_info
->nodesize
);
3404 memzero_extent_buffer(c
, 0, sizeof(struct btrfs_header
));
3405 btrfs_set_header_nritems(c
, 1);
3406 btrfs_set_header_level(c
, level
);
3407 btrfs_set_header_bytenr(c
, c
->start
);
3408 btrfs_set_header_generation(c
, trans
->transid
);
3409 btrfs_set_header_backref_rev(c
, BTRFS_MIXED_BACKREF_REV
);
3410 btrfs_set_header_owner(c
, root
->root_key
.objectid
);
3412 write_extent_buffer_fsid(c
, fs_info
->fsid
);
3413 write_extent_buffer_chunk_tree_uuid(c
, fs_info
->chunk_tree_uuid
);
3415 btrfs_set_node_key(c
, &lower_key
, 0);
3416 btrfs_set_node_blockptr(c
, 0, lower
->start
);
3417 lower_gen
= btrfs_header_generation(lower
);
3418 WARN_ON(lower_gen
!= trans
->transid
);
3420 btrfs_set_node_ptr_generation(c
, 0, lower_gen
);
3422 btrfs_mark_buffer_dirty(c
);
3425 tree_mod_log_set_root_pointer(root
, c
, 0);
3426 rcu_assign_pointer(root
->node
, c
);
3428 /* the super has an extra ref to root->node */
3429 free_extent_buffer(old
);
3431 add_root_to_dirty_list(root
);
3432 extent_buffer_get(c
);
3433 path
->nodes
[level
] = c
;
3434 path
->locks
[level
] = BTRFS_WRITE_LOCK_BLOCKING
;
3435 path
->slots
[level
] = 0;
3440 * worker function to insert a single pointer in a node.
3441 * the node should have enough room for the pointer already
3443 * slot and level indicate where you want the key to go, and
3444 * blocknr is the block the key points to.
3446 static void insert_ptr(struct btrfs_trans_handle
*trans
,
3447 struct btrfs_fs_info
*fs_info
, struct btrfs_path
*path
,
3448 struct btrfs_disk_key
*key
, u64 bytenr
,
3449 int slot
, int level
)
3451 struct extent_buffer
*lower
;
3455 BUG_ON(!path
->nodes
[level
]);
3456 btrfs_assert_tree_locked(path
->nodes
[level
]);
3457 lower
= path
->nodes
[level
];
3458 nritems
= btrfs_header_nritems(lower
);
3459 BUG_ON(slot
> nritems
);
3460 BUG_ON(nritems
== BTRFS_NODEPTRS_PER_BLOCK(fs_info
));
3461 if (slot
!= nritems
) {
3463 tree_mod_log_eb_move(fs_info
, lower
, slot
+ 1,
3464 slot
, nritems
- slot
);
3465 memmove_extent_buffer(lower
,
3466 btrfs_node_key_ptr_offset(slot
+ 1),
3467 btrfs_node_key_ptr_offset(slot
),
3468 (nritems
- slot
) * sizeof(struct btrfs_key_ptr
));
3471 ret
= tree_mod_log_insert_key(fs_info
, lower
, slot
,
3472 MOD_LOG_KEY_ADD
, GFP_NOFS
);
3475 btrfs_set_node_key(lower
, key
, slot
);
3476 btrfs_set_node_blockptr(lower
, slot
, bytenr
);
3477 WARN_ON(trans
->transid
== 0);
3478 btrfs_set_node_ptr_generation(lower
, slot
, trans
->transid
);
3479 btrfs_set_header_nritems(lower
, nritems
+ 1);
3480 btrfs_mark_buffer_dirty(lower
);
3484 * split the node at the specified level in path in two.
3485 * The path is corrected to point to the appropriate node after the split
3487 * Before splitting this tries to make some room in the node by pushing
3488 * left and right, if either one works, it returns right away.
3490 * returns 0 on success and < 0 on failure
3492 static noinline
int split_node(struct btrfs_trans_handle
*trans
,
3493 struct btrfs_root
*root
,
3494 struct btrfs_path
*path
, int level
)
3496 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
3497 struct extent_buffer
*c
;
3498 struct extent_buffer
*split
;
3499 struct btrfs_disk_key disk_key
;
3504 c
= path
->nodes
[level
];
3505 WARN_ON(btrfs_header_generation(c
) != trans
->transid
);
3506 if (c
== root
->node
) {
3508 * trying to split the root, lets make a new one
3510 * tree mod log: We don't log_removal old root in
3511 * insert_new_root, because that root buffer will be kept as a
3512 * normal node. We are going to log removal of half of the
3513 * elements below with tree_mod_log_eb_copy. We're holding a
3514 * tree lock on the buffer, which is why we cannot race with
3515 * other tree_mod_log users.
3517 ret
= insert_new_root(trans
, root
, path
, level
+ 1);
3521 ret
= push_nodes_for_insert(trans
, root
, path
, level
);
3522 c
= path
->nodes
[level
];
3523 if (!ret
&& btrfs_header_nritems(c
) <
3524 BTRFS_NODEPTRS_PER_BLOCK(fs_info
) - 3)
3530 c_nritems
= btrfs_header_nritems(c
);
3531 mid
= (c_nritems
+ 1) / 2;
3532 btrfs_node_key(c
, &disk_key
, mid
);
3534 split
= alloc_tree_block_no_bg_flush(trans
, root
, 0, &disk_key
, level
,
3537 return PTR_ERR(split
);
3539 root_add_used(root
, fs_info
->nodesize
);
3541 memzero_extent_buffer(split
, 0, sizeof(struct btrfs_header
));
3542 btrfs_set_header_level(split
, btrfs_header_level(c
));
3543 btrfs_set_header_bytenr(split
, split
->start
);
3544 btrfs_set_header_generation(split
, trans
->transid
);
3545 btrfs_set_header_backref_rev(split
, BTRFS_MIXED_BACKREF_REV
);
3546 btrfs_set_header_owner(split
, root
->root_key
.objectid
);
3547 write_extent_buffer_fsid(split
, fs_info
->fsid
);
3548 write_extent_buffer_chunk_tree_uuid(split
, fs_info
->chunk_tree_uuid
);
3550 ret
= tree_mod_log_eb_copy(fs_info
, split
, c
, 0, mid
, c_nritems
- mid
);
3552 btrfs_abort_transaction(trans
, ret
);
3555 copy_extent_buffer(split
, c
,
3556 btrfs_node_key_ptr_offset(0),
3557 btrfs_node_key_ptr_offset(mid
),
3558 (c_nritems
- mid
) * sizeof(struct btrfs_key_ptr
));
3559 btrfs_set_header_nritems(split
, c_nritems
- mid
);
3560 btrfs_set_header_nritems(c
, mid
);
3563 btrfs_mark_buffer_dirty(c
);
3564 btrfs_mark_buffer_dirty(split
);
3566 insert_ptr(trans
, fs_info
, path
, &disk_key
, split
->start
,
3567 path
->slots
[level
+ 1] + 1, level
+ 1);
3569 if (path
->slots
[level
] >= mid
) {
3570 path
->slots
[level
] -= mid
;
3571 btrfs_tree_unlock(c
);
3572 free_extent_buffer(c
);
3573 path
->nodes
[level
] = split
;
3574 path
->slots
[level
+ 1] += 1;
3576 btrfs_tree_unlock(split
);
3577 free_extent_buffer(split
);
3583 * how many bytes are required to store the items in a leaf. start
3584 * and nr indicate which items in the leaf to check. This totals up the
3585 * space used both by the item structs and the item data
/*
 * leaf_space_used: total bytes consumed by items [start, start+nr) of
 * leaf 'l' - both the struct btrfs_item headers and the item data (see
 * the header comment above, orig lines 3583-3585). The data span is
 * computed from the first item's offset+size down to the last item's
 * offset, using a map token to batch extent-buffer accesses.
 *
 * NOTE(review): the 'int data_len;' declaration, the upstream 'if (!nr)
 * return 0;' early-out and the final 'return data_len;' are missing from
 * this fragment - lost in extraction.
 */
3587 static int leaf_space_used(struct extent_buffer
*l
, int start
, int nr
)
3589 struct btrfs_item
*start_item
;
3590 struct btrfs_item
*end_item
;
3591 struct btrfs_map_token token
;
3593 int nritems
= btrfs_header_nritems(l
);
/* clamp the range so 'end' never indexes past the last item */
3594 int end
= min(nritems
, start
+ nr
) - 1;
3598 btrfs_init_map_token(&token
);
3599 start_item
= btrfs_item_nr(start
);
3600 end_item
= btrfs_item_nr(end
);
/* item data grows downward: (first offset + first size) - last offset
 * is the contiguous data span covered by the range */
3601 data_len
= btrfs_token_item_offset(l
, start_item
, &token
) +
3602 btrfs_token_item_size(l
, start_item
, &token
);
3603 data_len
= data_len
- btrfs_token_item_offset(l
, end_item
, &token
);
/* add the fixed-size item headers themselves */
3604 data_len
+= sizeof(struct btrfs_item
) * nr
;
3605 WARN_ON(data_len
< 0);
3610 * The space between the end of the leaf items and
3611 * the start of the leaf data. IOW, how much room
3612 * the leaf has left for both items and data
/*
 * btrfs_leaf_free_space: bytes left in 'leaf' between the end of the item
 * headers and the start of the item data (see the header comment above,
 * orig lines 3610-3612): leaf data size minus leaf_space_used over all
 * items.
 *
 * NOTE(review): the 'int ret;' declaration, the 'if (ret < 0)' check
 * around the diagnostic, the btrfs_crit() call that owns the message
 * string at orig line 3623, and the final 'return ret;' are missing from
 * this fragment - the bare string below is a leftover argument of that
 * lost call. Restore from the original file.
 */
3614 noinline
int btrfs_leaf_free_space(struct btrfs_fs_info
*fs_info
,
3615 struct extent_buffer
*leaf
)
3617 int nritems
= btrfs_header_nritems(leaf
);
3620 ret
= BTRFS_LEAF_DATA_SIZE(fs_info
) - leaf_space_used(leaf
, 0, nritems
);
3623 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3625 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info
),
3626 leaf_space_used(leaf
, 0, nritems
), nritems
);
3632 * min slot controls the lowest index we're willing to push to the
3633 * right. We'll push up to and including min_slot, but no lower
3635 static noinline
int __push_leaf_right(struct btrfs_fs_info
*fs_info
,
3636 struct btrfs_path
*path
,
3637 int data_size
, int empty
,
3638 struct extent_buffer
*right
,
3639 int free_space
, u32 left_nritems
,
3642 struct extent_buffer
*left
= path
->nodes
[0];
3643 struct extent_buffer
*upper
= path
->nodes
[1];
3644 struct btrfs_map_token token
;
3645 struct btrfs_disk_key disk_key
;
3650 struct btrfs_item
*item
;
3656 btrfs_init_map_token(&token
);
3661 nr
= max_t(u32
, 1, min_slot
);
3663 if (path
->slots
[0] >= left_nritems
)
3664 push_space
+= data_size
;
3666 slot
= path
->slots
[1];
3667 i
= left_nritems
- 1;
3669 item
= btrfs_item_nr(i
);
3671 if (!empty
&& push_items
> 0) {
3672 if (path
->slots
[0] > i
)
3674 if (path
->slots
[0] == i
) {
3675 int space
= btrfs_leaf_free_space(fs_info
, left
);
3676 if (space
+ push_space
* 2 > free_space
)
3681 if (path
->slots
[0] == i
)
3682 push_space
+= data_size
;
3684 this_item_size
= btrfs_item_size(left
, item
);
3685 if (this_item_size
+ sizeof(*item
) + push_space
> free_space
)
3689 push_space
+= this_item_size
+ sizeof(*item
);
3695 if (push_items
== 0)
3698 WARN_ON(!empty
&& push_items
== left_nritems
);
3700 /* push left to right */
3701 right_nritems
= btrfs_header_nritems(right
);
3703 push_space
= btrfs_item_end_nr(left
, left_nritems
- push_items
);
3704 push_space
-= leaf_data_end(fs_info
, left
);
3706 /* make room in the right data area */
3707 data_end
= leaf_data_end(fs_info
, right
);
3708 memmove_extent_buffer(right
,
3709 BTRFS_LEAF_DATA_OFFSET
+ data_end
- push_space
,
3710 BTRFS_LEAF_DATA_OFFSET
+ data_end
,
3711 BTRFS_LEAF_DATA_SIZE(fs_info
) - data_end
);
3713 /* copy from the left data area */
3714 copy_extent_buffer(right
, left
, BTRFS_LEAF_DATA_OFFSET
+
3715 BTRFS_LEAF_DATA_SIZE(fs_info
) - push_space
,
3716 BTRFS_LEAF_DATA_OFFSET
+ leaf_data_end(fs_info
, left
),
3719 memmove_extent_buffer(right
, btrfs_item_nr_offset(push_items
),
3720 btrfs_item_nr_offset(0),
3721 right_nritems
* sizeof(struct btrfs_item
));
3723 /* copy the items from left to right */
3724 copy_extent_buffer(right
, left
, btrfs_item_nr_offset(0),
3725 btrfs_item_nr_offset(left_nritems
- push_items
),
3726 push_items
* sizeof(struct btrfs_item
));
3728 /* update the item pointers */
3729 right_nritems
+= push_items
;
3730 btrfs_set_header_nritems(right
, right_nritems
);
3731 push_space
= BTRFS_LEAF_DATA_SIZE(fs_info
);
3732 for (i
= 0; i
< right_nritems
; i
++) {
3733 item
= btrfs_item_nr(i
);
3734 push_space
-= btrfs_token_item_size(right
, item
, &token
);
3735 btrfs_set_token_item_offset(right
, item
, push_space
, &token
);
3738 left_nritems
-= push_items
;
3739 btrfs_set_header_nritems(left
, left_nritems
);
3742 btrfs_mark_buffer_dirty(left
);
3744 clean_tree_block(fs_info
, left
);
3746 btrfs_mark_buffer_dirty(right
);
3748 btrfs_item_key(right
, &disk_key
, 0);
3749 btrfs_set_node_key(upper
, &disk_key
, slot
+ 1);
3750 btrfs_mark_buffer_dirty(upper
);
3752 /* then fixup the leaf pointer in the path */
3753 if (path
->slots
[0] >= left_nritems
) {
3754 path
->slots
[0] -= left_nritems
;
3755 if (btrfs_header_nritems(path
->nodes
[0]) == 0)
3756 clean_tree_block(fs_info
, path
->nodes
[0]);
3757 btrfs_tree_unlock(path
->nodes
[0]);
3758 free_extent_buffer(path
->nodes
[0]);
3759 path
->nodes
[0] = right
;
3760 path
->slots
[1] += 1;
3762 btrfs_tree_unlock(right
);
3763 free_extent_buffer(right
);
3768 btrfs_tree_unlock(right
);
3769 free_extent_buffer(right
);
3774 * push some data in the path leaf to the right, trying to free up at
3775 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3777 * returns 1 if the push failed because the other node didn't have enough
3778 * room, 0 if everything worked out and < 0 if there were major errors.
3780 * this will push starting from min_slot to the end of the leaf. It won't
3781 * push any slot lower than min_slot
3783 static int push_leaf_right(struct btrfs_trans_handle
*trans
, struct btrfs_root
3784 *root
, struct btrfs_path
*path
,
3785 int min_data_size
, int data_size
,
3786 int empty
, u32 min_slot
)
3788 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
3789 struct extent_buffer
*left
= path
->nodes
[0];
3790 struct extent_buffer
*right
;
3791 struct extent_buffer
*upper
;
3797 if (!path
->nodes
[1])
3800 slot
= path
->slots
[1];
3801 upper
= path
->nodes
[1];
3802 if (slot
>= btrfs_header_nritems(upper
) - 1)
3805 btrfs_assert_tree_locked(path
->nodes
[1]);
3807 right
= read_node_slot(fs_info
, upper
, slot
+ 1);
3809 * slot + 1 is not valid or we fail to read the right node,
3810 * no big deal, just return.
3815 btrfs_tree_lock(right
);
3816 btrfs_set_lock_blocking(right
);
3818 free_space
= btrfs_leaf_free_space(fs_info
, right
);
3819 if (free_space
< data_size
)
3822 /* cow and double check */
3823 ret
= btrfs_cow_block(trans
, root
, right
, upper
,
3828 free_space
= btrfs_leaf_free_space(fs_info
, right
);
3829 if (free_space
< data_size
)
3832 left_nritems
= btrfs_header_nritems(left
);
3833 if (left_nritems
== 0)
3836 if (path
->slots
[0] == left_nritems
&& !empty
) {
3837 /* Key greater than all keys in the leaf, right neighbor has
3838 * enough room for it and we're not emptying our leaf to delete
3839 * it, therefore use right neighbor to insert the new item and
3840 * no need to touch/dirty our left leaft. */
3841 btrfs_tree_unlock(left
);
3842 free_extent_buffer(left
);
3843 path
->nodes
[0] = right
;
3849 return __push_leaf_right(fs_info
, path
, min_data_size
, empty
,
3850 right
, free_space
, left_nritems
, min_slot
);
3852 btrfs_tree_unlock(right
);
3853 free_extent_buffer(right
);
3858 * push some data in the path leaf to the left, trying to free up at
3859 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3861 * max_slot can put a limit on how far into the leaf we'll push items. The
3862 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3865 static noinline
int __push_leaf_left(struct btrfs_fs_info
*fs_info
,
3866 struct btrfs_path
*path
, int data_size
,
3867 int empty
, struct extent_buffer
*left
,
3868 int free_space
, u32 right_nritems
,
3871 struct btrfs_disk_key disk_key
;
3872 struct extent_buffer
*right
= path
->nodes
[0];
3876 struct btrfs_item
*item
;
3877 u32 old_left_nritems
;
3881 u32 old_left_item_size
;
3882 struct btrfs_map_token token
;
3884 btrfs_init_map_token(&token
);
3887 nr
= min(right_nritems
, max_slot
);
3889 nr
= min(right_nritems
- 1, max_slot
);
3891 for (i
= 0; i
< nr
; i
++) {
3892 item
= btrfs_item_nr(i
);
3894 if (!empty
&& push_items
> 0) {
3895 if (path
->slots
[0] < i
)
3897 if (path
->slots
[0] == i
) {
3898 int space
= btrfs_leaf_free_space(fs_info
, right
);
3899 if (space
+ push_space
* 2 > free_space
)
3904 if (path
->slots
[0] == i
)
3905 push_space
+= data_size
;
3907 this_item_size
= btrfs_item_size(right
, item
);
3908 if (this_item_size
+ sizeof(*item
) + push_space
> free_space
)
3912 push_space
+= this_item_size
+ sizeof(*item
);
3915 if (push_items
== 0) {
3919 WARN_ON(!empty
&& push_items
== btrfs_header_nritems(right
));
3921 /* push data from right to left */
3922 copy_extent_buffer(left
, right
,
3923 btrfs_item_nr_offset(btrfs_header_nritems(left
)),
3924 btrfs_item_nr_offset(0),
3925 push_items
* sizeof(struct btrfs_item
));
3927 push_space
= BTRFS_LEAF_DATA_SIZE(fs_info
) -
3928 btrfs_item_offset_nr(right
, push_items
- 1);
3930 copy_extent_buffer(left
, right
, BTRFS_LEAF_DATA_OFFSET
+
3931 leaf_data_end(fs_info
, left
) - push_space
,
3932 BTRFS_LEAF_DATA_OFFSET
+
3933 btrfs_item_offset_nr(right
, push_items
- 1),
3935 old_left_nritems
= btrfs_header_nritems(left
);
3936 BUG_ON(old_left_nritems
<= 0);
3938 old_left_item_size
= btrfs_item_offset_nr(left
, old_left_nritems
- 1);
3939 for (i
= old_left_nritems
; i
< old_left_nritems
+ push_items
; i
++) {
3942 item
= btrfs_item_nr(i
);
3944 ioff
= btrfs_token_item_offset(left
, item
, &token
);
3945 btrfs_set_token_item_offset(left
, item
,
3946 ioff
- (BTRFS_LEAF_DATA_SIZE(fs_info
) - old_left_item_size
),
3949 btrfs_set_header_nritems(left
, old_left_nritems
+ push_items
);
3951 /* fixup right node */
3952 if (push_items
> right_nritems
)
3953 WARN(1, KERN_CRIT
"push items %d nr %u\n", push_items
,
3956 if (push_items
< right_nritems
) {
3957 push_space
= btrfs_item_offset_nr(right
, push_items
- 1) -
3958 leaf_data_end(fs_info
, right
);
3959 memmove_extent_buffer(right
, BTRFS_LEAF_DATA_OFFSET
+
3960 BTRFS_LEAF_DATA_SIZE(fs_info
) - push_space
,
3961 BTRFS_LEAF_DATA_OFFSET
+
3962 leaf_data_end(fs_info
, right
), push_space
);
3964 memmove_extent_buffer(right
, btrfs_item_nr_offset(0),
3965 btrfs_item_nr_offset(push_items
),
3966 (btrfs_header_nritems(right
) - push_items
) *
3967 sizeof(struct btrfs_item
));
3969 right_nritems
-= push_items
;
3970 btrfs_set_header_nritems(right
, right_nritems
);
3971 push_space
= BTRFS_LEAF_DATA_SIZE(fs_info
);
3972 for (i
= 0; i
< right_nritems
; i
++) {
3973 item
= btrfs_item_nr(i
);
3975 push_space
= push_space
- btrfs_token_item_size(right
,
3977 btrfs_set_token_item_offset(right
, item
, push_space
, &token
);
3980 btrfs_mark_buffer_dirty(left
);
3982 btrfs_mark_buffer_dirty(right
);
3984 clean_tree_block(fs_info
, right
);
3986 btrfs_item_key(right
, &disk_key
, 0);
3987 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
3989 /* then fixup the leaf pointer in the path */
3990 if (path
->slots
[0] < push_items
) {
3991 path
->slots
[0] += old_left_nritems
;
3992 btrfs_tree_unlock(path
->nodes
[0]);
3993 free_extent_buffer(path
->nodes
[0]);
3994 path
->nodes
[0] = left
;
3995 path
->slots
[1] -= 1;
3997 btrfs_tree_unlock(left
);
3998 free_extent_buffer(left
);
3999 path
->slots
[0] -= push_items
;
4001 BUG_ON(path
->slots
[0] < 0);
4004 btrfs_tree_unlock(left
);
4005 free_extent_buffer(left
);
4010 * push some data in the path leaf to the left, trying to free up at
4011 * least data_size bytes. returns zero if the push worked, nonzero otherwise
4013 * max_slot can put a limit on how far into the leaf we'll push items. The
4014 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
4017 static int push_leaf_left(struct btrfs_trans_handle
*trans
, struct btrfs_root
4018 *root
, struct btrfs_path
*path
, int min_data_size
,
4019 int data_size
, int empty
, u32 max_slot
)
4021 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4022 struct extent_buffer
*right
= path
->nodes
[0];
4023 struct extent_buffer
*left
;
4029 slot
= path
->slots
[1];
4032 if (!path
->nodes
[1])
4035 right_nritems
= btrfs_header_nritems(right
);
4036 if (right_nritems
== 0)
4039 btrfs_assert_tree_locked(path
->nodes
[1]);
4041 left
= read_node_slot(fs_info
, path
->nodes
[1], slot
- 1);
4043 * slot - 1 is not valid or we fail to read the left node,
4044 * no big deal, just return.
4049 btrfs_tree_lock(left
);
4050 btrfs_set_lock_blocking(left
);
4052 free_space
= btrfs_leaf_free_space(fs_info
, left
);
4053 if (free_space
< data_size
) {
4058 /* cow and double check */
4059 ret
= btrfs_cow_block(trans
, root
, left
,
4060 path
->nodes
[1], slot
- 1, &left
);
4062 /* we hit -ENOSPC, but it isn't fatal here */
4068 free_space
= btrfs_leaf_free_space(fs_info
, left
);
4069 if (free_space
< data_size
) {
4074 return __push_leaf_left(fs_info
, path
, min_data_size
,
4075 empty
, left
, free_space
, right_nritems
,
4078 btrfs_tree_unlock(left
);
4079 free_extent_buffer(left
);
4084 * split the path's leaf in two, making sure there is at least data_size
4085 * available for the resulting leaf level of the path.
4087 static noinline
void copy_for_split(struct btrfs_trans_handle
*trans
,
4088 struct btrfs_fs_info
*fs_info
,
4089 struct btrfs_path
*path
,
4090 struct extent_buffer
*l
,
4091 struct extent_buffer
*right
,
4092 int slot
, int mid
, int nritems
)
4097 struct btrfs_disk_key disk_key
;
4098 struct btrfs_map_token token
;
4100 btrfs_init_map_token(&token
);
4102 nritems
= nritems
- mid
;
4103 btrfs_set_header_nritems(right
, nritems
);
4104 data_copy_size
= btrfs_item_end_nr(l
, mid
) - leaf_data_end(fs_info
, l
);
4106 copy_extent_buffer(right
, l
, btrfs_item_nr_offset(0),
4107 btrfs_item_nr_offset(mid
),
4108 nritems
* sizeof(struct btrfs_item
));
4110 copy_extent_buffer(right
, l
,
4111 BTRFS_LEAF_DATA_OFFSET
+ BTRFS_LEAF_DATA_SIZE(fs_info
) -
4112 data_copy_size
, BTRFS_LEAF_DATA_OFFSET
+
4113 leaf_data_end(fs_info
, l
), data_copy_size
);
4115 rt_data_off
= BTRFS_LEAF_DATA_SIZE(fs_info
) - btrfs_item_end_nr(l
, mid
);
4117 for (i
= 0; i
< nritems
; i
++) {
4118 struct btrfs_item
*item
= btrfs_item_nr(i
);
4121 ioff
= btrfs_token_item_offset(right
, item
, &token
);
4122 btrfs_set_token_item_offset(right
, item
,
4123 ioff
+ rt_data_off
, &token
);
4126 btrfs_set_header_nritems(l
, mid
);
4127 btrfs_item_key(right
, &disk_key
, 0);
4128 insert_ptr(trans
, fs_info
, path
, &disk_key
, right
->start
,
4129 path
->slots
[1] + 1, 1);
4131 btrfs_mark_buffer_dirty(right
);
4132 btrfs_mark_buffer_dirty(l
);
4133 BUG_ON(path
->slots
[0] != slot
);
4136 btrfs_tree_unlock(path
->nodes
[0]);
4137 free_extent_buffer(path
->nodes
[0]);
4138 path
->nodes
[0] = right
;
4139 path
->slots
[0] -= mid
;
4140 path
->slots
[1] += 1;
4142 btrfs_tree_unlock(right
);
4143 free_extent_buffer(right
);
4146 BUG_ON(path
->slots
[0] < 0);
4150 * double splits happen when we need to insert a big item in the middle
4151 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4152 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4155 * We avoid this by trying to push the items on either side of our target
4156 * into the adjacent leaves. If all goes well we can avoid the double split
4159 static noinline
int push_for_double_split(struct btrfs_trans_handle
*trans
,
4160 struct btrfs_root
*root
,
4161 struct btrfs_path
*path
,
4164 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4169 int space_needed
= data_size
;
4171 slot
= path
->slots
[0];
4172 if (slot
< btrfs_header_nritems(path
->nodes
[0]))
4173 space_needed
-= btrfs_leaf_free_space(fs_info
, path
->nodes
[0]);
4176 * try to push all the items after our slot into the
4179 ret
= push_leaf_right(trans
, root
, path
, 1, space_needed
, 0, slot
);
4186 nritems
= btrfs_header_nritems(path
->nodes
[0]);
4188 * our goal is to get our slot at the start or end of a leaf. If
4189 * we've done so we're done
4191 if (path
->slots
[0] == 0 || path
->slots
[0] == nritems
)
4194 if (btrfs_leaf_free_space(fs_info
, path
->nodes
[0]) >= data_size
)
4197 /* try to push all the items before our slot into the next leaf */
4198 slot
= path
->slots
[0];
4199 space_needed
= data_size
;
4201 space_needed
-= btrfs_leaf_free_space(fs_info
, path
->nodes
[0]);
4202 ret
= push_leaf_left(trans
, root
, path
, 1, space_needed
, 0, slot
);
4215 * split the path's leaf in two, making sure there is at least data_size
4216 * available for the resulting leaf level of the path.
4218 * returns 0 if all went well and < 0 on failure.
4220 static noinline
int split_leaf(struct btrfs_trans_handle
*trans
,
4221 struct btrfs_root
*root
,
4222 const struct btrfs_key
*ins_key
,
4223 struct btrfs_path
*path
, int data_size
,
4226 struct btrfs_disk_key disk_key
;
4227 struct extent_buffer
*l
;
4231 struct extent_buffer
*right
;
4232 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4236 int num_doubles
= 0;
4237 int tried_avoid_double
= 0;
4240 slot
= path
->slots
[0];
4241 if (extend
&& data_size
+ btrfs_item_size_nr(l
, slot
) +
4242 sizeof(struct btrfs_item
) > BTRFS_LEAF_DATA_SIZE(fs_info
))
4245 /* first try to make some room by pushing left and right */
4246 if (data_size
&& path
->nodes
[1]) {
4247 int space_needed
= data_size
;
4249 if (slot
< btrfs_header_nritems(l
))
4250 space_needed
-= btrfs_leaf_free_space(fs_info
, l
);
4252 wret
= push_leaf_right(trans
, root
, path
, space_needed
,
4253 space_needed
, 0, 0);
4257 space_needed
= data_size
;
4259 space_needed
-= btrfs_leaf_free_space(fs_info
,
4261 wret
= push_leaf_left(trans
, root
, path
, space_needed
,
4262 space_needed
, 0, (u32
)-1);
4268 /* did the pushes work? */
4269 if (btrfs_leaf_free_space(fs_info
, l
) >= data_size
)
4273 if (!path
->nodes
[1]) {
4274 ret
= insert_new_root(trans
, root
, path
, 1);
4281 slot
= path
->slots
[0];
4282 nritems
= btrfs_header_nritems(l
);
4283 mid
= (nritems
+ 1) / 2;
4287 leaf_space_used(l
, mid
, nritems
- mid
) + data_size
>
4288 BTRFS_LEAF_DATA_SIZE(fs_info
)) {
4289 if (slot
>= nritems
) {
4293 if (mid
!= nritems
&&
4294 leaf_space_used(l
, mid
, nritems
- mid
) +
4295 data_size
> BTRFS_LEAF_DATA_SIZE(fs_info
)) {
4296 if (data_size
&& !tried_avoid_double
)
4297 goto push_for_double
;
4303 if (leaf_space_used(l
, 0, mid
) + data_size
>
4304 BTRFS_LEAF_DATA_SIZE(fs_info
)) {
4305 if (!extend
&& data_size
&& slot
== 0) {
4307 } else if ((extend
|| !data_size
) && slot
== 0) {
4311 if (mid
!= nritems
&&
4312 leaf_space_used(l
, mid
, nritems
- mid
) +
4313 data_size
> BTRFS_LEAF_DATA_SIZE(fs_info
)) {
4314 if (data_size
&& !tried_avoid_double
)
4315 goto push_for_double
;
4323 btrfs_cpu_key_to_disk(&disk_key
, ins_key
);
4325 btrfs_item_key(l
, &disk_key
, mid
);
4327 right
= alloc_tree_block_no_bg_flush(trans
, root
, 0, &disk_key
, 0,
4330 return PTR_ERR(right
);
4332 root_add_used(root
, fs_info
->nodesize
);
4334 memzero_extent_buffer(right
, 0, sizeof(struct btrfs_header
));
4335 btrfs_set_header_bytenr(right
, right
->start
);
4336 btrfs_set_header_generation(right
, trans
->transid
);
4337 btrfs_set_header_backref_rev(right
, BTRFS_MIXED_BACKREF_REV
);
4338 btrfs_set_header_owner(right
, root
->root_key
.objectid
);
4339 btrfs_set_header_level(right
, 0);
4340 write_extent_buffer_fsid(right
, fs_info
->fsid
);
4341 write_extent_buffer_chunk_tree_uuid(right
, fs_info
->chunk_tree_uuid
);
4345 btrfs_set_header_nritems(right
, 0);
4346 insert_ptr(trans
, fs_info
, path
, &disk_key
,
4347 right
->start
, path
->slots
[1] + 1, 1);
4348 btrfs_tree_unlock(path
->nodes
[0]);
4349 free_extent_buffer(path
->nodes
[0]);
4350 path
->nodes
[0] = right
;
4352 path
->slots
[1] += 1;
4354 btrfs_set_header_nritems(right
, 0);
4355 insert_ptr(trans
, fs_info
, path
, &disk_key
,
4356 right
->start
, path
->slots
[1], 1);
4357 btrfs_tree_unlock(path
->nodes
[0]);
4358 free_extent_buffer(path
->nodes
[0]);
4359 path
->nodes
[0] = right
;
4361 if (path
->slots
[1] == 0)
4362 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
4365 * We create a new leaf 'right' for the required ins_len and
4366 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4367 * the content of ins_len to 'right'.
4372 copy_for_split(trans
, fs_info
, path
, l
, right
, slot
, mid
, nritems
);
4375 BUG_ON(num_doubles
!= 0);
4383 push_for_double_split(trans
, root
, path
, data_size
);
4384 tried_avoid_double
= 1;
4385 if (btrfs_leaf_free_space(fs_info
, path
->nodes
[0]) >= data_size
)
4390 static noinline
int setup_leaf_for_split(struct btrfs_trans_handle
*trans
,
4391 struct btrfs_root
*root
,
4392 struct btrfs_path
*path
, int ins_len
)
4394 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4395 struct btrfs_key key
;
4396 struct extent_buffer
*leaf
;
4397 struct btrfs_file_extent_item
*fi
;
4402 leaf
= path
->nodes
[0];
4403 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
4405 BUG_ON(key
.type
!= BTRFS_EXTENT_DATA_KEY
&&
4406 key
.type
!= BTRFS_EXTENT_CSUM_KEY
);
4408 if (btrfs_leaf_free_space(fs_info
, leaf
) >= ins_len
)
4411 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
4412 if (key
.type
== BTRFS_EXTENT_DATA_KEY
) {
4413 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
4414 struct btrfs_file_extent_item
);
4415 extent_len
= btrfs_file_extent_num_bytes(leaf
, fi
);
4417 btrfs_release_path(path
);
4419 path
->keep_locks
= 1;
4420 path
->search_for_split
= 1;
4421 ret
= btrfs_search_slot(trans
, root
, &key
, path
, 0, 1);
4422 path
->search_for_split
= 0;
4429 leaf
= path
->nodes
[0];
4430 /* if our item isn't there, return now */
4431 if (item_size
!= btrfs_item_size_nr(leaf
, path
->slots
[0]))
4434 /* the leaf has changed, it now has room. return now */
4435 if (btrfs_leaf_free_space(fs_info
, path
->nodes
[0]) >= ins_len
)
4438 if (key
.type
== BTRFS_EXTENT_DATA_KEY
) {
4439 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
4440 struct btrfs_file_extent_item
);
4441 if (extent_len
!= btrfs_file_extent_num_bytes(leaf
, fi
))
4445 btrfs_set_path_blocking(path
);
4446 ret
= split_leaf(trans
, root
, &key
, path
, ins_len
, 1);
4450 path
->keep_locks
= 0;
4451 btrfs_unlock_up_safe(path
, 1);
4454 path
->keep_locks
= 0;
4458 static noinline
int split_item(struct btrfs_fs_info
*fs_info
,
4459 struct btrfs_path
*path
,
4460 const struct btrfs_key
*new_key
,
4461 unsigned long split_offset
)
4463 struct extent_buffer
*leaf
;
4464 struct btrfs_item
*item
;
4465 struct btrfs_item
*new_item
;
4471 struct btrfs_disk_key disk_key
;
4473 leaf
= path
->nodes
[0];
4474 BUG_ON(btrfs_leaf_free_space(fs_info
, leaf
) < sizeof(struct btrfs_item
));
4476 btrfs_set_path_blocking(path
);
4478 item
= btrfs_item_nr(path
->slots
[0]);
4479 orig_offset
= btrfs_item_offset(leaf
, item
);
4480 item_size
= btrfs_item_size(leaf
, item
);
4482 buf
= kmalloc(item_size
, GFP_NOFS
);
4486 read_extent_buffer(leaf
, buf
, btrfs_item_ptr_offset(leaf
,
4487 path
->slots
[0]), item_size
);
4489 slot
= path
->slots
[0] + 1;
4490 nritems
= btrfs_header_nritems(leaf
);
4491 if (slot
!= nritems
) {
4492 /* shift the items */
4493 memmove_extent_buffer(leaf
, btrfs_item_nr_offset(slot
+ 1),
4494 btrfs_item_nr_offset(slot
),
4495 (nritems
- slot
) * sizeof(struct btrfs_item
));
4498 btrfs_cpu_key_to_disk(&disk_key
, new_key
);
4499 btrfs_set_item_key(leaf
, &disk_key
, slot
);
4501 new_item
= btrfs_item_nr(slot
);
4503 btrfs_set_item_offset(leaf
, new_item
, orig_offset
);
4504 btrfs_set_item_size(leaf
, new_item
, item_size
- split_offset
);
4506 btrfs_set_item_offset(leaf
, item
,
4507 orig_offset
+ item_size
- split_offset
);
4508 btrfs_set_item_size(leaf
, item
, split_offset
);
4510 btrfs_set_header_nritems(leaf
, nritems
+ 1);
4512 /* write the data for the start of the original item */
4513 write_extent_buffer(leaf
, buf
,
4514 btrfs_item_ptr_offset(leaf
, path
->slots
[0]),
4517 /* write the data for the new item */
4518 write_extent_buffer(leaf
, buf
+ split_offset
,
4519 btrfs_item_ptr_offset(leaf
, slot
),
4520 item_size
- split_offset
);
4521 btrfs_mark_buffer_dirty(leaf
);
4523 BUG_ON(btrfs_leaf_free_space(fs_info
, leaf
) < 0);
4529 * This function splits a single item into two items,
4530 * giving 'new_key' to the new item and splitting the
4531 * old one at split_offset (from the start of the item).
4533 * The path may be released by this operation. After
4534 * the split, the path is pointing to the old item. The
4535 * new item is going to be in the same node as the old one.
4537 * Note, the item being split must be smaller enough to live alone on
4538 * a tree block with room for one extra struct btrfs_item
4540 * This allows us to split the item in place, keeping a lock on the
4541 * leaf the entire time.
4543 int btrfs_split_item(struct btrfs_trans_handle
*trans
,
4544 struct btrfs_root
*root
,
4545 struct btrfs_path
*path
,
4546 const struct btrfs_key
*new_key
,
4547 unsigned long split_offset
)
4550 ret
= setup_leaf_for_split(trans
, root
, path
,
4551 sizeof(struct btrfs_item
));
4555 ret
= split_item(root
->fs_info
, path
, new_key
, split_offset
);
4560 * This function duplicate a item, giving 'new_key' to the new item.
4561 * It guarantees both items live in the same tree leaf and the new item
4562 * is contiguous with the original item.
4564 * This allows us to split file extent in place, keeping a lock on the
4565 * leaf the entire time.
4567 int btrfs_duplicate_item(struct btrfs_trans_handle
*trans
,
4568 struct btrfs_root
*root
,
4569 struct btrfs_path
*path
,
4570 const struct btrfs_key
*new_key
)
4572 struct extent_buffer
*leaf
;
4576 leaf
= path
->nodes
[0];
4577 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
4578 ret
= setup_leaf_for_split(trans
, root
, path
,
4579 item_size
+ sizeof(struct btrfs_item
));
4584 setup_items_for_insert(root
, path
, new_key
, &item_size
,
4585 item_size
, item_size
+
4586 sizeof(struct btrfs_item
), 1);
4587 leaf
= path
->nodes
[0];
4588 memcpy_extent_buffer(leaf
,
4589 btrfs_item_ptr_offset(leaf
, path
->slots
[0]),
4590 btrfs_item_ptr_offset(leaf
, path
->slots
[0] - 1),
4596 * make the item pointed to by the path smaller. new_size indicates
4597 * how small to make it, and from_end tells us if we just chop bytes
4598 * off the end of the item or if we shift the item to chop bytes off
4601 void btrfs_truncate_item(struct btrfs_fs_info
*fs_info
,
4602 struct btrfs_path
*path
, u32 new_size
, int from_end
)
4605 struct extent_buffer
*leaf
;
4606 struct btrfs_item
*item
;
4608 unsigned int data_end
;
4609 unsigned int old_data_start
;
4610 unsigned int old_size
;
4611 unsigned int size_diff
;
4613 struct btrfs_map_token token
;
4615 btrfs_init_map_token(&token
);
4617 leaf
= path
->nodes
[0];
4618 slot
= path
->slots
[0];
4620 old_size
= btrfs_item_size_nr(leaf
, slot
);
4621 if (old_size
== new_size
)
4624 nritems
= btrfs_header_nritems(leaf
);
4625 data_end
= leaf_data_end(fs_info
, leaf
);
4627 old_data_start
= btrfs_item_offset_nr(leaf
, slot
);
4629 size_diff
= old_size
- new_size
;
4632 BUG_ON(slot
>= nritems
);
4635 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4637 /* first correct the data pointers */
4638 for (i
= slot
; i
< nritems
; i
++) {
4640 item
= btrfs_item_nr(i
);
4642 ioff
= btrfs_token_item_offset(leaf
, item
, &token
);
4643 btrfs_set_token_item_offset(leaf
, item
,
4644 ioff
+ size_diff
, &token
);
4647 /* shift the data */
4649 memmove_extent_buffer(leaf
, BTRFS_LEAF_DATA_OFFSET
+
4650 data_end
+ size_diff
, BTRFS_LEAF_DATA_OFFSET
+
4651 data_end
, old_data_start
+ new_size
- data_end
);
4653 struct btrfs_disk_key disk_key
;
4656 btrfs_item_key(leaf
, &disk_key
, slot
);
4658 if (btrfs_disk_key_type(&disk_key
) == BTRFS_EXTENT_DATA_KEY
) {
4660 struct btrfs_file_extent_item
*fi
;
4662 fi
= btrfs_item_ptr(leaf
, slot
,
4663 struct btrfs_file_extent_item
);
4664 fi
= (struct btrfs_file_extent_item
*)(
4665 (unsigned long)fi
- size_diff
);
4667 if (btrfs_file_extent_type(leaf
, fi
) ==
4668 BTRFS_FILE_EXTENT_INLINE
) {
4669 ptr
= btrfs_item_ptr_offset(leaf
, slot
);
4670 memmove_extent_buffer(leaf
, ptr
,
4672 BTRFS_FILE_EXTENT_INLINE_DATA_START
);
4676 memmove_extent_buffer(leaf
, BTRFS_LEAF_DATA_OFFSET
+
4677 data_end
+ size_diff
, BTRFS_LEAF_DATA_OFFSET
+
4678 data_end
, old_data_start
- data_end
);
4680 offset
= btrfs_disk_key_offset(&disk_key
);
4681 btrfs_set_disk_key_offset(&disk_key
, offset
+ size_diff
);
4682 btrfs_set_item_key(leaf
, &disk_key
, slot
);
4684 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
4687 item
= btrfs_item_nr(slot
);
4688 btrfs_set_item_size(leaf
, item
, new_size
);
4689 btrfs_mark_buffer_dirty(leaf
);
4691 if (btrfs_leaf_free_space(fs_info
, leaf
) < 0) {
4692 btrfs_print_leaf(leaf
);
4698 * make the item pointed to by the path bigger, data_size is the added size.
4700 void btrfs_extend_item(struct btrfs_fs_info
*fs_info
, struct btrfs_path
*path
,
4704 struct extent_buffer
*leaf
;
4705 struct btrfs_item
*item
;
4707 unsigned int data_end
;
4708 unsigned int old_data
;
4709 unsigned int old_size
;
4711 struct btrfs_map_token token
;
4713 btrfs_init_map_token(&token
);
4715 leaf
= path
->nodes
[0];
4717 nritems
= btrfs_header_nritems(leaf
);
4718 data_end
= leaf_data_end(fs_info
, leaf
);
4720 if (btrfs_leaf_free_space(fs_info
, leaf
) < data_size
) {
4721 btrfs_print_leaf(leaf
);
4724 slot
= path
->slots
[0];
4725 old_data
= btrfs_item_end_nr(leaf
, slot
);
4728 if (slot
>= nritems
) {
4729 btrfs_print_leaf(leaf
);
4730 btrfs_crit(fs_info
, "slot %d too large, nritems %d",
4736 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4738 /* first correct the data pointers */
4739 for (i
= slot
; i
< nritems
; i
++) {
4741 item
= btrfs_item_nr(i
);
4743 ioff
= btrfs_token_item_offset(leaf
, item
, &token
);
4744 btrfs_set_token_item_offset(leaf
, item
,
4745 ioff
- data_size
, &token
);
4748 /* shift the data */
4749 memmove_extent_buffer(leaf
, BTRFS_LEAF_DATA_OFFSET
+
4750 data_end
- data_size
, BTRFS_LEAF_DATA_OFFSET
+
4751 data_end
, old_data
- data_end
);
4753 data_end
= old_data
;
4754 old_size
= btrfs_item_size_nr(leaf
, slot
);
4755 item
= btrfs_item_nr(slot
);
4756 btrfs_set_item_size(leaf
, item
, old_size
+ data_size
);
4757 btrfs_mark_buffer_dirty(leaf
);
4759 if (btrfs_leaf_free_space(fs_info
, leaf
) < 0) {
4760 btrfs_print_leaf(leaf
);
4766 * this is a helper for btrfs_insert_empty_items, the main goal here is
4767 * to save stack depth by doing the bulk of the work in a function
4768 * that doesn't call btrfs_search_slot
4770 void setup_items_for_insert(struct btrfs_root
*root
, struct btrfs_path
*path
,
4771 const struct btrfs_key
*cpu_key
, u32
*data_size
,
4772 u32 total_data
, u32 total_size
, int nr
)
4774 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4775 struct btrfs_item
*item
;
4778 unsigned int data_end
;
4779 struct btrfs_disk_key disk_key
;
4780 struct extent_buffer
*leaf
;
4782 struct btrfs_map_token token
;
4784 if (path
->slots
[0] == 0) {
4785 btrfs_cpu_key_to_disk(&disk_key
, cpu_key
);
4786 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
4788 btrfs_unlock_up_safe(path
, 1);
4790 btrfs_init_map_token(&token
);
4792 leaf
= path
->nodes
[0];
4793 slot
= path
->slots
[0];
4795 nritems
= btrfs_header_nritems(leaf
);
4796 data_end
= leaf_data_end(fs_info
, leaf
);
4798 if (btrfs_leaf_free_space(fs_info
, leaf
) < total_size
) {
4799 btrfs_print_leaf(leaf
);
4800 btrfs_crit(fs_info
, "not enough freespace need %u have %d",
4801 total_size
, btrfs_leaf_free_space(fs_info
, leaf
));
4805 if (slot
!= nritems
) {
4806 unsigned int old_data
= btrfs_item_end_nr(leaf
, slot
);
4808 if (old_data
< data_end
) {
4809 btrfs_print_leaf(leaf
);
4810 btrfs_crit(fs_info
, "slot %d old_data %d data_end %d",
4811 slot
, old_data
, data_end
);
4815 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4817 /* first correct the data pointers */
4818 for (i
= slot
; i
< nritems
; i
++) {
4821 item
= btrfs_item_nr(i
);
4822 ioff
= btrfs_token_item_offset(leaf
, item
, &token
);
4823 btrfs_set_token_item_offset(leaf
, item
,
4824 ioff
- total_data
, &token
);
4826 /* shift the items */
4827 memmove_extent_buffer(leaf
, btrfs_item_nr_offset(slot
+ nr
),
4828 btrfs_item_nr_offset(slot
),
4829 (nritems
- slot
) * sizeof(struct btrfs_item
));
4831 /* shift the data */
4832 memmove_extent_buffer(leaf
, BTRFS_LEAF_DATA_OFFSET
+
4833 data_end
- total_data
, BTRFS_LEAF_DATA_OFFSET
+
4834 data_end
, old_data
- data_end
);
4835 data_end
= old_data
;
4838 /* setup the item for the new data */
4839 for (i
= 0; i
< nr
; i
++) {
4840 btrfs_cpu_key_to_disk(&disk_key
, cpu_key
+ i
);
4841 btrfs_set_item_key(leaf
, &disk_key
, slot
+ i
);
4842 item
= btrfs_item_nr(slot
+ i
);
4843 btrfs_set_token_item_offset(leaf
, item
,
4844 data_end
- data_size
[i
], &token
);
4845 data_end
-= data_size
[i
];
4846 btrfs_set_token_item_size(leaf
, item
, data_size
[i
], &token
);
4849 btrfs_set_header_nritems(leaf
, nritems
+ nr
);
4850 btrfs_mark_buffer_dirty(leaf
);
4852 if (btrfs_leaf_free_space(fs_info
, leaf
) < 0) {
4853 btrfs_print_leaf(leaf
);
4859 * Given a key and some data, insert items into the tree.
4860 * This does all the path init required, making room in the tree if needed.
4862 int btrfs_insert_empty_items(struct btrfs_trans_handle
*trans
,
4863 struct btrfs_root
*root
,
4864 struct btrfs_path
*path
,
4865 const struct btrfs_key
*cpu_key
, u32
*data_size
,
4874 for (i
= 0; i
< nr
; i
++)
4875 total_data
+= data_size
[i
];
4877 total_size
= total_data
+ (nr
* sizeof(struct btrfs_item
));
4878 ret
= btrfs_search_slot(trans
, root
, cpu_key
, path
, total_size
, 1);
4884 slot
= path
->slots
[0];
4887 setup_items_for_insert(root
, path
, cpu_key
, data_size
,
4888 total_data
, total_size
, nr
);
4893 * Given a key and some data, insert an item into the tree.
4894 * This does all the path init required, making room in the tree if needed.
4896 int btrfs_insert_item(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
4897 const struct btrfs_key
*cpu_key
, void *data
,
4901 struct btrfs_path
*path
;
4902 struct extent_buffer
*leaf
;
4905 path
= btrfs_alloc_path();
4908 ret
= btrfs_insert_empty_item(trans
, root
, path
, cpu_key
, data_size
);
4910 leaf
= path
->nodes
[0];
4911 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
4912 write_extent_buffer(leaf
, data
, ptr
, data_size
);
4913 btrfs_mark_buffer_dirty(leaf
);
4915 btrfs_free_path(path
);
4920 * delete the pointer from a given node.
4922 * the tree should have been previously balanced so the deletion does not
4925 static void del_ptr(struct btrfs_root
*root
, struct btrfs_path
*path
,
4926 int level
, int slot
)
4928 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
4929 struct extent_buffer
*parent
= path
->nodes
[level
];
4933 nritems
= btrfs_header_nritems(parent
);
4934 if (slot
!= nritems
- 1) {
4936 tree_mod_log_eb_move(fs_info
, parent
, slot
,
4937 slot
+ 1, nritems
- slot
- 1);
4938 memmove_extent_buffer(parent
,
4939 btrfs_node_key_ptr_offset(slot
),
4940 btrfs_node_key_ptr_offset(slot
+ 1),
4941 sizeof(struct btrfs_key_ptr
) *
4942 (nritems
- slot
- 1));
4944 ret
= tree_mod_log_insert_key(fs_info
, parent
, slot
,
4945 MOD_LOG_KEY_REMOVE
, GFP_NOFS
);
4950 btrfs_set_header_nritems(parent
, nritems
);
4951 if (nritems
== 0 && parent
== root
->node
) {
4952 BUG_ON(btrfs_header_level(root
->node
) != 1);
4953 /* just turn the root into a leaf and break */
4954 btrfs_set_header_level(root
->node
, 0);
4955 } else if (slot
== 0) {
4956 struct btrfs_disk_key disk_key
;
4958 btrfs_node_key(parent
, &disk_key
, 0);
4959 fixup_low_keys(fs_info
, path
, &disk_key
, level
+ 1);
4961 btrfs_mark_buffer_dirty(parent
);
4965 * a helper function to delete the leaf pointed to by path->slots[1] and
4968 * This deletes the pointer in path->nodes[1] and frees the leaf
4969 * block extent. zero is returned if it all worked out, < 0 otherwise.
4971 * The path must have already been setup for deleting the leaf, including
4972 * all the proper balancing. path->nodes[1] must be locked.
4974 static noinline
void btrfs_del_leaf(struct btrfs_trans_handle
*trans
,
4975 struct btrfs_root
*root
,
4976 struct btrfs_path
*path
,
4977 struct extent_buffer
*leaf
)
4979 WARN_ON(btrfs_header_generation(leaf
) != trans
->transid
);
4980 del_ptr(root
, path
, 1, path
->slots
[1]);
4983 * btrfs_free_extent is expensive, we want to make sure we
4984 * aren't holding any locks when we call it
4986 btrfs_unlock_up_safe(path
, 0);
4988 root_sub_used(root
, leaf
->len
);
4990 extent_buffer_get(leaf
);
4991 btrfs_free_tree_block(trans
, root
, leaf
, 0, 1);
4992 free_extent_buffer_stale(leaf
);
4995 * delete the item at the leaf level in path. If that empties
4996 * the leaf, remove it from the tree
4998 int btrfs_del_items(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
,
4999 struct btrfs_path
*path
, int slot
, int nr
)
5001 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5002 struct extent_buffer
*leaf
;
5003 struct btrfs_item
*item
;
5010 struct btrfs_map_token token
;
5012 btrfs_init_map_token(&token
);
5014 leaf
= path
->nodes
[0];
5015 last_off
= btrfs_item_offset_nr(leaf
, slot
+ nr
- 1);
5017 for (i
= 0; i
< nr
; i
++)
5018 dsize
+= btrfs_item_size_nr(leaf
, slot
+ i
);
5020 nritems
= btrfs_header_nritems(leaf
);
5022 if (slot
+ nr
!= nritems
) {
5023 int data_end
= leaf_data_end(fs_info
, leaf
);
5025 memmove_extent_buffer(leaf
, BTRFS_LEAF_DATA_OFFSET
+
5027 BTRFS_LEAF_DATA_OFFSET
+ data_end
,
5028 last_off
- data_end
);
5030 for (i
= slot
+ nr
; i
< nritems
; i
++) {
5033 item
= btrfs_item_nr(i
);
5034 ioff
= btrfs_token_item_offset(leaf
, item
, &token
);
5035 btrfs_set_token_item_offset(leaf
, item
,
5036 ioff
+ dsize
, &token
);
5039 memmove_extent_buffer(leaf
, btrfs_item_nr_offset(slot
),
5040 btrfs_item_nr_offset(slot
+ nr
),
5041 sizeof(struct btrfs_item
) *
5042 (nritems
- slot
- nr
));
5044 btrfs_set_header_nritems(leaf
, nritems
- nr
);
5047 /* delete the leaf if we've emptied it */
5049 if (leaf
== root
->node
) {
5050 btrfs_set_header_level(leaf
, 0);
5052 btrfs_set_path_blocking(path
);
5053 clean_tree_block(fs_info
, leaf
);
5054 btrfs_del_leaf(trans
, root
, path
, leaf
);
5057 int used
= leaf_space_used(leaf
, 0, nritems
);
5059 struct btrfs_disk_key disk_key
;
5061 btrfs_item_key(leaf
, &disk_key
, 0);
5062 fixup_low_keys(fs_info
, path
, &disk_key
, 1);
5065 /* delete the leaf if it is mostly empty */
5066 if (used
< BTRFS_LEAF_DATA_SIZE(fs_info
) / 3) {
5067 /* push_leaf_left fixes the path.
5068 * make sure the path still points to our leaf
5069 * for possible call to del_ptr below
5071 slot
= path
->slots
[1];
5072 extent_buffer_get(leaf
);
5074 btrfs_set_path_blocking(path
);
5075 wret
= push_leaf_left(trans
, root
, path
, 1, 1,
5077 if (wret
< 0 && wret
!= -ENOSPC
)
5080 if (path
->nodes
[0] == leaf
&&
5081 btrfs_header_nritems(leaf
)) {
5082 wret
= push_leaf_right(trans
, root
, path
, 1,
5084 if (wret
< 0 && wret
!= -ENOSPC
)
5088 if (btrfs_header_nritems(leaf
) == 0) {
5089 path
->slots
[1] = slot
;
5090 btrfs_del_leaf(trans
, root
, path
, leaf
);
5091 free_extent_buffer(leaf
);
5094 /* if we're still in the path, make sure
5095 * we're dirty. Otherwise, one of the
5096 * push_leaf functions must have already
5097 * dirtied this buffer
5099 if (path
->nodes
[0] == leaf
)
5100 btrfs_mark_buffer_dirty(leaf
);
5101 free_extent_buffer(leaf
);
5104 btrfs_mark_buffer_dirty(leaf
);
5111 * search the tree again to find a leaf with lesser keys
5112 * returns 0 if it found something or 1 if there are no lesser leaves.
5113 * returns < 0 on io errors.
5115 * This may release the path, and so you may lose any locks held at the
5118 int btrfs_prev_leaf(struct btrfs_root
*root
, struct btrfs_path
*path
)
5120 struct btrfs_key key
;
5121 struct btrfs_disk_key found_key
;
5124 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, 0);
5126 if (key
.offset
> 0) {
5128 } else if (key
.type
> 0) {
5130 key
.offset
= (u64
)-1;
5131 } else if (key
.objectid
> 0) {
5134 key
.offset
= (u64
)-1;
5139 btrfs_release_path(path
);
5140 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
5143 btrfs_item_key(path
->nodes
[0], &found_key
, 0);
5144 ret
= comp_keys(&found_key
, &key
);
5146 * We might have had an item with the previous key in the tree right
5147 * before we released our path. And after we released our path, that
5148 * item might have been pushed to the first slot (0) of the leaf we
5149 * were holding due to a tree balance. Alternatively, an item with the
5150 * previous key can exist as the only element of a leaf (big fat item).
5151 * Therefore account for these 2 cases, so that our callers (like
5152 * btrfs_previous_item) don't miss an existing item with a key matching
5153 * the previous key we computed above.
5161 * A helper function to walk down the tree starting at min_key, and looking
5162 * for nodes or leaves that are have a minimum transaction id.
5163 * This is used by the btree defrag code, and tree logging
5165 * This does not cow, but it does stuff the starting key it finds back
5166 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5167 * key and get a writable path.
5169 * This does lock as it descends, and path->keep_locks should be set
5170 * to 1 by the caller.
5172 * This honors path->lowest_level to prevent descent past a given level
5175 * min_trans indicates the oldest transaction that you are interested
5176 * in walking through. Any nodes or leaves older than min_trans are
5177 * skipped over (without reading them).
5179 * returns zero if something useful was found, < 0 on error and 1 if there
5180 * was nothing in the tree that matched the search criteria.
5182 int btrfs_search_forward(struct btrfs_root
*root
, struct btrfs_key
*min_key
,
5183 struct btrfs_path
*path
,
5186 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
5187 struct extent_buffer
*cur
;
5188 struct btrfs_key found_key
;
5194 int keep_locks
= path
->keep_locks
;
5196 path
->keep_locks
= 1;
5198 cur
= btrfs_read_lock_root_node(root
);
5199 level
= btrfs_header_level(cur
);
5200 WARN_ON(path
->nodes
[level
]);
5201 path
->nodes
[level
] = cur
;
5202 path
->locks
[level
] = BTRFS_READ_LOCK
;
5204 if (btrfs_header_generation(cur
) < min_trans
) {
5209 nritems
= btrfs_header_nritems(cur
);
5210 level
= btrfs_header_level(cur
);
5211 sret
= bin_search(cur
, min_key
, level
, &slot
);
5213 /* at the lowest level, we're done, setup the path and exit */
5214 if (level
== path
->lowest_level
) {
5215 if (slot
>= nritems
)
5218 path
->slots
[level
] = slot
;
5219 btrfs_item_key_to_cpu(cur
, &found_key
, slot
);
5222 if (sret
&& slot
> 0)
5225 * check this node pointer against the min_trans parameters.
5226 * If it is too old, old, skip to the next one.
5228 while (slot
< nritems
) {
5231 gen
= btrfs_node_ptr_generation(cur
, slot
);
5232 if (gen
< min_trans
) {
5240 * we didn't find a candidate key in this node, walk forward
5241 * and find another one
5243 if (slot
>= nritems
) {
5244 path
->slots
[level
] = slot
;
5245 btrfs_set_path_blocking(path
);
5246 sret
= btrfs_find_next_key(root
, path
, min_key
, level
,
5249 btrfs_release_path(path
);
5255 /* save our key for returning back */
5256 btrfs_node_key_to_cpu(cur
, &found_key
, slot
);
5257 path
->slots
[level
] = slot
;
5258 if (level
== path
->lowest_level
) {
5262 btrfs_set_path_blocking(path
);
5263 cur
= read_node_slot(fs_info
, cur
, slot
);
5269 btrfs_tree_read_lock(cur
);
5271 path
->locks
[level
- 1] = BTRFS_READ_LOCK
;
5272 path
->nodes
[level
- 1] = cur
;
5273 unlock_up(path
, level
, 1, 0, NULL
);
5274 btrfs_clear_path_blocking(path
, NULL
, 0);
5277 path
->keep_locks
= keep_locks
;
5279 btrfs_unlock_up_safe(path
, path
->lowest_level
+ 1);
5280 btrfs_set_path_blocking(path
);
5281 memcpy(min_key
, &found_key
, sizeof(found_key
));
5286 static int tree_move_down(struct btrfs_fs_info
*fs_info
,
5287 struct btrfs_path
*path
,
5290 struct extent_buffer
*eb
;
5292 BUG_ON(*level
== 0);
5293 eb
= read_node_slot(fs_info
, path
->nodes
[*level
], path
->slots
[*level
]);
5297 path
->nodes
[*level
- 1] = eb
;
5298 path
->slots
[*level
- 1] = 0;
5303 static int tree_move_next_or_upnext(struct btrfs_path
*path
,
5304 int *level
, int root_level
)
5308 nritems
= btrfs_header_nritems(path
->nodes
[*level
]);
5310 path
->slots
[*level
]++;
5312 while (path
->slots
[*level
] >= nritems
) {
5313 if (*level
== root_level
)
5317 path
->slots
[*level
] = 0;
5318 free_extent_buffer(path
->nodes
[*level
]);
5319 path
->nodes
[*level
] = NULL
;
5321 path
->slots
[*level
]++;
5323 nritems
= btrfs_header_nritems(path
->nodes
[*level
]);
5330 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5333 static int tree_advance(struct btrfs_fs_info
*fs_info
,
5334 struct btrfs_path
*path
,
5335 int *level
, int root_level
,
5337 struct btrfs_key
*key
)
5341 if (*level
== 0 || !allow_down
) {
5342 ret
= tree_move_next_or_upnext(path
, level
, root_level
);
5344 ret
= tree_move_down(fs_info
, path
, level
);
5348 btrfs_item_key_to_cpu(path
->nodes
[*level
], key
,
5349 path
->slots
[*level
]);
5351 btrfs_node_key_to_cpu(path
->nodes
[*level
], key
,
5352 path
->slots
[*level
]);
5357 static int tree_compare_item(struct btrfs_path
*left_path
,
5358 struct btrfs_path
*right_path
,
5363 unsigned long off1
, off2
;
5365 len1
= btrfs_item_size_nr(left_path
->nodes
[0], left_path
->slots
[0]);
5366 len2
= btrfs_item_size_nr(right_path
->nodes
[0], right_path
->slots
[0]);
5370 off1
= btrfs_item_ptr_offset(left_path
->nodes
[0], left_path
->slots
[0]);
5371 off2
= btrfs_item_ptr_offset(right_path
->nodes
[0],
5372 right_path
->slots
[0]);
5374 read_extent_buffer(left_path
->nodes
[0], tmp_buf
, off1
, len1
);
5376 cmp
= memcmp_extent_buffer(right_path
->nodes
[0], tmp_buf
, off2
, len1
);
5383 #define ADVANCE_ONLY_NEXT -1
5386 * This function compares two trees and calls the provided callback for
5387 * every changed/new/deleted item it finds.
5388 * If shared tree blocks are encountered, whole subtrees are skipped, making
5389 * the compare pretty fast on snapshotted subvolumes.
5391 * This currently works on commit roots only. As commit roots are read only,
5392 * we don't do any locking. The commit roots are protected with transactions.
5393 * Transactions are ended and rejoined when a commit is tried in between.
5395 * This function checks for modifications done to the trees while comparing.
5396 * If it detects a change, it aborts immediately.
5398 int btrfs_compare_trees(struct btrfs_root
*left_root
,
5399 struct btrfs_root
*right_root
,
5400 btrfs_changed_cb_t changed_cb
, void *ctx
)
5402 struct btrfs_fs_info
*fs_info
= left_root
->fs_info
;
5405 struct btrfs_path
*left_path
= NULL
;
5406 struct btrfs_path
*right_path
= NULL
;
5407 struct btrfs_key left_key
;
5408 struct btrfs_key right_key
;
5409 char *tmp_buf
= NULL
;
5410 int left_root_level
;
5411 int right_root_level
;
5414 int left_end_reached
;
5415 int right_end_reached
;
5423 left_path
= btrfs_alloc_path();
5428 right_path
= btrfs_alloc_path();
5434 tmp_buf
= kvmalloc(fs_info
->nodesize
, GFP_KERNEL
);
5440 left_path
->search_commit_root
= 1;
5441 left_path
->skip_locking
= 1;
5442 right_path
->search_commit_root
= 1;
5443 right_path
->skip_locking
= 1;
5446 * Strategy: Go to the first items of both trees. Then do
5448 * If both trees are at level 0
5449 * Compare keys of current items
5450 * If left < right treat left item as new, advance left tree
5452 * If left > right treat right item as deleted, advance right tree
5454 * If left == right do deep compare of items, treat as changed if
5455 * needed, advance both trees and repeat
5456 * If both trees are at the same level but not at level 0
5457 * Compare keys of current nodes/leafs
5458 * If left < right advance left tree and repeat
5459 * If left > right advance right tree and repeat
5460 * If left == right compare blockptrs of the next nodes/leafs
5461 * If they match advance both trees but stay at the same level
5463 * If they don't match advance both trees while allowing to go
5465 * If tree levels are different
5466 * Advance the tree that needs it and repeat
5468 * Advancing a tree means:
5469 * If we are at level 0, try to go to the next slot. If that's not
5470 * possible, go one level up and repeat. Stop when we found a level
5471 * where we could go to the next slot. We may at this point be on a
5474 * If we are not at level 0 and not on shared tree blocks, go one
5477 * If we are not at level 0 and on shared tree blocks, go one slot to
5478 * the right if possible or go up and right.
5481 down_read(&fs_info
->commit_root_sem
);
5482 left_level
= btrfs_header_level(left_root
->commit_root
);
5483 left_root_level
= left_level
;
5484 left_path
->nodes
[left_level
] =
5485 btrfs_clone_extent_buffer(left_root
->commit_root
);
5486 if (!left_path
->nodes
[left_level
]) {
5487 up_read(&fs_info
->commit_root_sem
);
5491 extent_buffer_get(left_path
->nodes
[left_level
]);
5493 right_level
= btrfs_header_level(right_root
->commit_root
);
5494 right_root_level
= right_level
;
5495 right_path
->nodes
[right_level
] =
5496 btrfs_clone_extent_buffer(right_root
->commit_root
);
5497 if (!right_path
->nodes
[right_level
]) {
5498 up_read(&fs_info
->commit_root_sem
);
5502 extent_buffer_get(right_path
->nodes
[right_level
]);
5503 up_read(&fs_info
->commit_root_sem
);
5505 if (left_level
== 0)
5506 btrfs_item_key_to_cpu(left_path
->nodes
[left_level
],
5507 &left_key
, left_path
->slots
[left_level
]);
5509 btrfs_node_key_to_cpu(left_path
->nodes
[left_level
],
5510 &left_key
, left_path
->slots
[left_level
]);
5511 if (right_level
== 0)
5512 btrfs_item_key_to_cpu(right_path
->nodes
[right_level
],
5513 &right_key
, right_path
->slots
[right_level
]);
5515 btrfs_node_key_to_cpu(right_path
->nodes
[right_level
],
5516 &right_key
, right_path
->slots
[right_level
]);
5518 left_end_reached
= right_end_reached
= 0;
5519 advance_left
= advance_right
= 0;
5523 if (advance_left
&& !left_end_reached
) {
5524 ret
= tree_advance(fs_info
, left_path
, &left_level
,
5526 advance_left
!= ADVANCE_ONLY_NEXT
,
5529 left_end_reached
= ADVANCE
;
5534 if (advance_right
&& !right_end_reached
) {
5535 ret
= tree_advance(fs_info
, right_path
, &right_level
,
5537 advance_right
!= ADVANCE_ONLY_NEXT
,
5540 right_end_reached
= ADVANCE
;
5546 if (left_end_reached
&& right_end_reached
) {
5549 } else if (left_end_reached
) {
5550 if (right_level
== 0) {
5551 ret
= changed_cb(left_path
, right_path
,
5553 BTRFS_COMPARE_TREE_DELETED
,
5558 advance_right
= ADVANCE
;
5560 } else if (right_end_reached
) {
5561 if (left_level
== 0) {
5562 ret
= changed_cb(left_path
, right_path
,
5564 BTRFS_COMPARE_TREE_NEW
,
5569 advance_left
= ADVANCE
;
5573 if (left_level
== 0 && right_level
== 0) {
5574 cmp
= btrfs_comp_cpu_keys(&left_key
, &right_key
);
5576 ret
= changed_cb(left_path
, right_path
,
5578 BTRFS_COMPARE_TREE_NEW
,
5582 advance_left
= ADVANCE
;
5583 } else if (cmp
> 0) {
5584 ret
= changed_cb(left_path
, right_path
,
5586 BTRFS_COMPARE_TREE_DELETED
,
5590 advance_right
= ADVANCE
;
5592 enum btrfs_compare_tree_result result
;
5594 WARN_ON(!extent_buffer_uptodate(left_path
->nodes
[0]));
5595 ret
= tree_compare_item(left_path
, right_path
,
5598 result
= BTRFS_COMPARE_TREE_CHANGED
;
5600 result
= BTRFS_COMPARE_TREE_SAME
;
5601 ret
= changed_cb(left_path
, right_path
,
5602 &left_key
, result
, ctx
);
5605 advance_left
= ADVANCE
;
5606 advance_right
= ADVANCE
;
5608 } else if (left_level
== right_level
) {
5609 cmp
= btrfs_comp_cpu_keys(&left_key
, &right_key
);
5611 advance_left
= ADVANCE
;
5612 } else if (cmp
> 0) {
5613 advance_right
= ADVANCE
;
5615 left_blockptr
= btrfs_node_blockptr(
5616 left_path
->nodes
[left_level
],
5617 left_path
->slots
[left_level
]);
5618 right_blockptr
= btrfs_node_blockptr(
5619 right_path
->nodes
[right_level
],
5620 right_path
->slots
[right_level
]);
5621 left_gen
= btrfs_node_ptr_generation(
5622 left_path
->nodes
[left_level
],
5623 left_path
->slots
[left_level
]);
5624 right_gen
= btrfs_node_ptr_generation(
5625 right_path
->nodes
[right_level
],
5626 right_path
->slots
[right_level
]);
5627 if (left_blockptr
== right_blockptr
&&
5628 left_gen
== right_gen
) {
5630 * As we're on a shared block, don't
5631 * allow to go deeper.
5633 advance_left
= ADVANCE_ONLY_NEXT
;
5634 advance_right
= ADVANCE_ONLY_NEXT
;
5636 advance_left
= ADVANCE
;
5637 advance_right
= ADVANCE
;
5640 } else if (left_level
< right_level
) {
5641 advance_right
= ADVANCE
;
5643 advance_left
= ADVANCE
;
5648 btrfs_free_path(left_path
);
5649 btrfs_free_path(right_path
);
5655 * this is similar to btrfs_next_leaf, but does not try to preserve
5656 * and fixup the path. It looks for and returns the next key in the
5657 * tree based on the current path and the min_trans parameters.
5659 * 0 is returned if another key is found, < 0 if there are any errors
5660 * and 1 is returned if there are no higher keys in the tree
5662 * path->keep_locks should be set to 1 on the search made before
5663 * calling this function.
5665 int btrfs_find_next_key(struct btrfs_root
*root
, struct btrfs_path
*path
,
5666 struct btrfs_key
*key
, int level
, u64 min_trans
)
5669 struct extent_buffer
*c
;
5671 WARN_ON(!path
->keep_locks
);
5672 while (level
< BTRFS_MAX_LEVEL
) {
5673 if (!path
->nodes
[level
])
5676 slot
= path
->slots
[level
] + 1;
5677 c
= path
->nodes
[level
];
5679 if (slot
>= btrfs_header_nritems(c
)) {
5682 struct btrfs_key cur_key
;
5683 if (level
+ 1 >= BTRFS_MAX_LEVEL
||
5684 !path
->nodes
[level
+ 1])
5687 if (path
->locks
[level
+ 1]) {
5692 slot
= btrfs_header_nritems(c
) - 1;
5694 btrfs_item_key_to_cpu(c
, &cur_key
, slot
);
5696 btrfs_node_key_to_cpu(c
, &cur_key
, slot
);
5698 orig_lowest
= path
->lowest_level
;
5699 btrfs_release_path(path
);
5700 path
->lowest_level
= level
;
5701 ret
= btrfs_search_slot(NULL
, root
, &cur_key
, path
,
5703 path
->lowest_level
= orig_lowest
;
5707 c
= path
->nodes
[level
];
5708 slot
= path
->slots
[level
];
5715 btrfs_item_key_to_cpu(c
, key
, slot
);
5717 u64 gen
= btrfs_node_ptr_generation(c
, slot
);
5719 if (gen
< min_trans
) {
5723 btrfs_node_key_to_cpu(c
, key
, slot
);
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}
5740 int btrfs_next_old_leaf(struct btrfs_root
*root
, struct btrfs_path
*path
,
5745 struct extent_buffer
*c
;
5746 struct extent_buffer
*next
;
5747 struct btrfs_key key
;
5750 int old_spinning
= path
->leave_spinning
;
5751 int next_rw_lock
= 0;
5753 nritems
= btrfs_header_nritems(path
->nodes
[0]);
5757 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, nritems
- 1);
5762 btrfs_release_path(path
);
5764 path
->keep_locks
= 1;
5765 path
->leave_spinning
= 1;
5768 ret
= btrfs_search_old_slot(root
, &key
, path
, time_seq
);
5770 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
5771 path
->keep_locks
= 0;
5776 nritems
= btrfs_header_nritems(path
->nodes
[0]);
5778 * by releasing the path above we dropped all our locks. A balance
5779 * could have added more items next to the key that used to be
5780 * at the very end of the block. So, check again here and
5781 * advance the path if there are now more items available.
5783 if (nritems
> 0 && path
->slots
[0] < nritems
- 1) {
5790 * So the above check misses one case:
5791 * - after releasing the path above, someone has removed the item that
5792 * used to be at the very end of the block, and balance between leafs
5793 * gets another one with bigger key.offset to replace it.
5795 * This one should be returned as well, or we can get leaf corruption
5796 * later(esp. in __btrfs_drop_extents()).
5798 * And a bit more explanation about this check,
5799 * with ret > 0, the key isn't found, the path points to the slot
5800 * where it should be inserted, so the path->slots[0] item must be the
5803 if (nritems
> 0 && ret
> 0 && path
->slots
[0] == nritems
- 1) {
5808 while (level
< BTRFS_MAX_LEVEL
) {
5809 if (!path
->nodes
[level
]) {
5814 slot
= path
->slots
[level
] + 1;
5815 c
= path
->nodes
[level
];
5816 if (slot
>= btrfs_header_nritems(c
)) {
5818 if (level
== BTRFS_MAX_LEVEL
) {
5826 btrfs_tree_unlock_rw(next
, next_rw_lock
);
5827 free_extent_buffer(next
);
5831 next_rw_lock
= path
->locks
[level
];
5832 ret
= read_block_for_search(root
, path
, &next
, level
,
5838 btrfs_release_path(path
);
5842 if (!path
->skip_locking
) {
5843 ret
= btrfs_try_tree_read_lock(next
);
5844 if (!ret
&& time_seq
) {
5846 * If we don't get the lock, we may be racing
5847 * with push_leaf_left, holding that lock while
5848 * itself waiting for the leaf we've currently
5849 * locked. To solve this situation, we give up
5850 * on our lock and cycle.
5852 free_extent_buffer(next
);
5853 btrfs_release_path(path
);
5858 btrfs_set_path_blocking(path
);
5859 btrfs_tree_read_lock(next
);
5860 btrfs_clear_path_blocking(path
, next
,
5863 next_rw_lock
= BTRFS_READ_LOCK
;
5867 path
->slots
[level
] = slot
;
5870 c
= path
->nodes
[level
];
5871 if (path
->locks
[level
])
5872 btrfs_tree_unlock_rw(c
, path
->locks
[level
]);
5874 free_extent_buffer(c
);
5875 path
->nodes
[level
] = next
;
5876 path
->slots
[level
] = 0;
5877 if (!path
->skip_locking
)
5878 path
->locks
[level
] = next_rw_lock
;
5882 ret
= read_block_for_search(root
, path
, &next
, level
,
5888 btrfs_release_path(path
);
5892 if (!path
->skip_locking
) {
5893 ret
= btrfs_try_tree_read_lock(next
);
5895 btrfs_set_path_blocking(path
);
5896 btrfs_tree_read_lock(next
);
5897 btrfs_clear_path_blocking(path
, next
,
5900 next_rw_lock
= BTRFS_READ_LOCK
;
5905 unlock_up(path
, 0, 1, 0, NULL
);
5906 path
->leave_spinning
= old_spinning
;
5908 btrfs_set_path_blocking(path
);
5914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5915 * searching until it gets past min_objectid or finds an item of 'type'
5917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5919 int btrfs_previous_item(struct btrfs_root
*root
,
5920 struct btrfs_path
*path
, u64 min_objectid
,
5923 struct btrfs_key found_key
;
5924 struct extent_buffer
*leaf
;
5929 if (path
->slots
[0] == 0) {
5930 btrfs_set_path_blocking(path
);
5931 ret
= btrfs_prev_leaf(root
, path
);
5937 leaf
= path
->nodes
[0];
5938 nritems
= btrfs_header_nritems(leaf
);
5941 if (path
->slots
[0] == nritems
)
5944 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
5945 if (found_key
.objectid
< min_objectid
)
5947 if (found_key
.type
== type
)
5949 if (found_key
.objectid
== min_objectid
&&
5950 found_key
.type
< type
)
5957 * search in extent tree to find a previous Metadata/Data extent item with
5960 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5962 int btrfs_previous_extent_item(struct btrfs_root
*root
,
5963 struct btrfs_path
*path
, u64 min_objectid
)
5965 struct btrfs_key found_key
;
5966 struct extent_buffer
*leaf
;
5971 if (path
->slots
[0] == 0) {
5972 btrfs_set_path_blocking(path
);
5973 ret
= btrfs_prev_leaf(root
, path
);
5979 leaf
= path
->nodes
[0];
5980 nritems
= btrfs_header_nritems(leaf
);
5983 if (path
->slots
[0] == nritems
)
5986 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
5987 if (found_key
.objectid
< min_objectid
)
5989 if (found_key
.type
== BTRFS_EXTENT_ITEM_KEY
||
5990 found_key
.type
== BTRFS_METADATA_ITEM_KEY
)
5992 if (found_key
.objectid
== min_objectid
&&
5993 found_key
.type
< BTRFS_EXTENT_ITEM_KEY
)