/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled, we set
 * held to a blocking lock before we go around and retake all the
 * spinlocks in the path.  It is safe to pass NULL for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A locked
 * buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cow-only roots (everything that is not a reference-counted cow
 * subvolume) just get put onto a simple dirty list.  transaction.c walks
 * this to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

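/*
 * The tree modification log keeps an rb tree of operations performed on
 * b-tree nodes: key adds, removes and replaces, pointer moves and root
 * replacements, each stamped with a sequence number.  While at least one
 * blocker holds a sequence number (see btrfs_get_tree_mod_seq), operations
 * are recorded so that a buffer can later be rewound to the state it had
 * at a given sequence number (see tree_mod_log_rewind and get_old_root).
 */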
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller
 * expects to record tree modifications, it should ensure elem->seq is set
 * to zero before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if
 * no new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted.  Returns 1 if it can.  Otherwise,
 * it returns zero with the tree_mod_log_lock acquired.  The caller must
 * hold the lock until all tree mod log insertions are recorded in the
 * rb tree and then write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

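/*
 * Allocate a tree mod log element for the pointer at @slot of @eb.  For
 * everything except a plain key addition, the key and block pointer
 * currently stored in that slot are recorded so a rewind can restore them.
 */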
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

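/*
 * Log a single key operation (@op) on @eb at @slot.  This is a no-op when
 * nobody currently blocks the log; otherwise the element is inserted into
 * the rb tree under tree_mod_log_lock.
 */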
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

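/*
 * Log a move of @nr_items node pointers within @eb from @src_slot to
 * @dst_slot.  Slots that get overwritten by the move (possible only when
 * moving towards the front of the buffer) additionally get a
 * MOD_LOG_KEY_REMOVE_WHILE_MOVING record.
 */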
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

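/*
 * Insert the prepared removal records for all @nritems slots, newest slot
 * first.  On failure, any records inserted so far are unlinked from the
 * rb tree again; freeing them is left to the caller.
 */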
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

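/*
 * Log the replacement of the root node @old_root by @new_root.  With
 * @log_removal set, removal records are first logged for all pointers of
 * the old root (when it is not a leaf), so a rewind can reconstruct its
 * contents.
 */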
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

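/*
 * Walk the log looking for operations on the block at logical address
 * @start with a sequence number of at least @min_seq.  With @smallest set,
 * the matching operation with the lowest sequence number is returned,
 * otherwise the one with the highest.
 */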
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

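/*
 * Log a copy of @nr_items node pointers from @src into @dst as paired
 * operations: a MOD_LOG_KEY_REMOVE for each source slot and a
 * MOD_LOG_KEY_ADD for each destination slot.
 */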
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
				      MOD_LOG_KEY_REPLACE,
				      atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

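/*
 * Log the removal of all pointers of @eb before the buffer is freed, so
 * that a rewind can bring its contents back.  Leaves are skipped, as their
 * contents are not tracked by the log.
 */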
static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups.  We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk.  Creation of a pending
	 * block group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups()
	 * may try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is
 * marked dirty and returned locked.  If you modify the block it needs to
 * be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size
 * in bytes the allocator should try to find free next to the block it
 * returns.  This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all).  this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it.  this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root.  normally, we'll find
		 * a MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb.  then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification.  as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special.  for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root.  in the mod log,
			 * we'll have a root-replace operation for B, a tree
			 * block that is not a root.  we simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked.  If the buffer cannot be rewound, the same
 * buffer is returned.  If rewind operations happen, a fresh buffer is
 * returned.  The returned buffer is always read-locked.  If the returned
 * buffer is not the input buffer, the lock on the input buffer is released
 * and the input buffer is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given
 * @time_seq value.  If there are no changes, the current root->root_node
 * is returned.  If anything changed in between, there's a fresh buffer
 * allocated on which the rewind operations are done.  In any case, the
 * returned buffer is read locked.  Returns NULL on error (with no locks
 * held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

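/* update the bytes-used accounting kept in this tree's root item */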
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
	       int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot));
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
2064 }
2065 if (wret == 1) {
2066 wret = push_node_left(trans, fs_info, left, mid, 1);
2067 if (wret < 0)
2068 ret = wret;
2069 }
2070 BUG_ON(wret == 1);
2071 }
2072 if (btrfs_header_nritems(mid) == 0) {
2073 clean_tree_block(fs_info, mid);
2074 btrfs_tree_unlock(mid);
2075 del_ptr(root, path, level + 1, pslot);
2076 root_sub_used(root, mid->len);
2077 btrfs_free_tree_block(trans, root, mid, 0, 1);
2078 free_extent_buffer_stale(mid);
2079 mid = NULL;
2080 } else {
2081 /* update the parent key to reflect our changes */
2082 struct btrfs_disk_key mid_key;
2083 btrfs_node_key(mid, &mid_key, 0);
2084 tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
2085 btrfs_set_node_key(parent, &mid_key, pslot);
2086 btrfs_mark_buffer_dirty(parent);
2087 }
2088
2089 /* update the path */
2090 if (left) {
2091 if (btrfs_header_nritems(left) > orig_slot) {
2092 extent_buffer_get(left);
2093 /* left was locked after cow */
2094 path->nodes[level] = left;
2095 path->slots[level + 1] -= 1;
2096 path->slots[level] = orig_slot;
2097 if (mid) {
2098 btrfs_tree_unlock(mid);
2099 free_extent_buffer(mid);
2100 }
2101 } else {
2102 orig_slot -= btrfs_header_nritems(left);
2103 path->slots[level] = orig_slot;
2104 }
2105 }
2106 /* double check we haven't messed things up */
2107 if (orig_ptr !=
2108 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2109 BUG();
2110 enospc:
2111 if (right) {
2112 btrfs_tree_unlock(right);
2113 free_extent_buffer(right);
2114 }
2115 if (left) {
2116 if (path->nodes[level] != left)
2117 btrfs_tree_unlock(left);
2118 free_extent_buffer(left);
2119 }
2120 return ret;
2121 }
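
/*
 * Editorial sketch: the decision logic of balance_level above, reduced to
 * plain integers. A one-pointer root is collapsed into its only child;
 * non-root nodes above a quarter of capacity are left alone; anything
 * else tries to push into or pull from its siblings. Illustrative only.
 */
enum balance_action { PROMOTE_CHILD, LEAVE_ALONE, MERGE_OR_PUSH };

static enum balance_action balance_decision(int is_root, int nritems,
					    int nodeptrs_per_block)
{
	if (is_root)
		return nritems == 1 ? PROMOTE_CHILD : LEAVE_ALONE;
	if (nritems > nodeptrs_per_block / 4)
		return LEAVE_ALONE;	/* node is still full enough */
	return MERGE_OR_PUSH;		/* rebalance with siblings */
}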
2122
2123 /* Node balancing for insertion. Here we only split or push nodes around
2124 * when they are completely full. This is also done top down, so we
2125 * have to be pessimistic.
2126 */
2127 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2128 struct btrfs_root *root,
2129 struct btrfs_path *path, int level)
2130 {
2131 struct btrfs_fs_info *fs_info = root->fs_info;
2132 struct extent_buffer *right = NULL;
2133 struct extent_buffer *mid;
2134 struct extent_buffer *left = NULL;
2135 struct extent_buffer *parent = NULL;
2136 int ret = 0;
2137 int wret;
2138 int pslot;
2139 int orig_slot = path->slots[level];
2140
2141 if (level == 0)
2142 return 1;
2143
2144 mid = path->nodes[level];
2145 WARN_ON(btrfs_header_generation(mid) != trans->transid);
2146
2147 if (level < BTRFS_MAX_LEVEL - 1) {
2148 parent = path->nodes[level + 1];
2149 pslot = path->slots[level + 1];
2150 }
2151
2152 if (!parent)
2153 return 1;
2154
2155 left = read_node_slot(fs_info, parent, pslot - 1);
2156 if (IS_ERR(left))
2157 left = NULL;
2158
2159 /* first, try to make some room in the middle buffer */
2160 if (left) {
2161 u32 left_nr;
2162
2163 btrfs_tree_lock(left);
2164 btrfs_set_lock_blocking(left);
2165
2166 left_nr = btrfs_header_nritems(left);
2167 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2168 wret = 1;
2169 } else {
2170 ret = btrfs_cow_block(trans, root, left, parent,
2171 pslot - 1, &left);
2172 if (ret)
2173 wret = 1;
2174 else {
2175 wret = push_node_left(trans, fs_info,
2176 left, mid, 0);
2177 }
2178 }
2179 if (wret < 0)
2180 ret = wret;
2181 if (wret == 0) {
2182 struct btrfs_disk_key disk_key;
2183 orig_slot += left_nr;
2184 btrfs_node_key(mid, &disk_key, 0);
2185 tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
2186 btrfs_set_node_key(parent, &disk_key, pslot);
2187 btrfs_mark_buffer_dirty(parent);
2188 if (btrfs_header_nritems(left) > orig_slot) {
2189 path->nodes[level] = left;
2190 path->slots[level + 1] -= 1;
2191 path->slots[level] = orig_slot;
2192 btrfs_tree_unlock(mid);
2193 free_extent_buffer(mid);
2194 } else {
2195 orig_slot -=
2196 btrfs_header_nritems(left);
2197 path->slots[level] = orig_slot;
2198 btrfs_tree_unlock(left);
2199 free_extent_buffer(left);
2200 }
2201 return 0;
2202 }
2203 btrfs_tree_unlock(left);
2204 free_extent_buffer(left);
2205 }
2206 right = read_node_slot(fs_info, parent, pslot + 1);
2207 if (IS_ERR(right))
2208 right = NULL;
2209
2210 /*
2211 * then try to empty the right most buffer into the middle
2212 */
2213 if (right) {
2214 u32 right_nr;
2215
2216 btrfs_tree_lock(right);
2217 btrfs_set_lock_blocking(right);
2218
2219 right_nr = btrfs_header_nritems(right);
2220 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2221 wret = 1;
2222 } else {
2223 ret = btrfs_cow_block(trans, root, right,
2224 parent, pslot + 1,
2225 &right);
2226 if (ret)
2227 wret = 1;
2228 else {
2229 wret = balance_node_right(trans, fs_info,
2230 right, mid);
2231 }
2232 }
2233 if (wret < 0)
2234 ret = wret;
2235 if (wret == 0) {
2236 struct btrfs_disk_key disk_key;
2237
2238 btrfs_node_key(right, &disk_key, 0);
2239 tree_mod_log_set_node_key(fs_info, parent,
2240 pslot + 1, 0);
2241 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2242 btrfs_mark_buffer_dirty(parent);
2243
2244 if (btrfs_header_nritems(mid) <= orig_slot) {
2245 path->nodes[level] = right;
2246 path->slots[level + 1] += 1;
2247 path->slots[level] = orig_slot -
2248 btrfs_header_nritems(mid);
2249 btrfs_tree_unlock(mid);
2250 free_extent_buffer(mid);
2251 } else {
2252 btrfs_tree_unlock(right);
2253 free_extent_buffer(right);
2254 }
2255 return 0;
2256 }
2257 btrfs_tree_unlock(right);
2258 free_extent_buffer(right);
2259 }
2260 return 1;
2261 }
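
/*
 * Editorial sketch: the sibling test used by push_nodes_for_insert above.
 * A sibling can absorb pointers only while it still has at least two free
 * slots (nritems < capacity - 1); a sibling that is itself nearly full is
 * not worth pushing into. Plain-integer model.
 */
#include <stdbool.h>

static bool sibling_can_take_ptrs(int sibling_nritems, int nodeptrs_per_block)
{
	return sibling_nritems < nodeptrs_per_block - 1;
}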
2262
2263 /*
2264 * readahead one full node of leaves, finding things that are close
2265 * to the block in 'slot', and triggering readahead on them.
2266 */
2267 static void reada_for_search(struct btrfs_fs_info *fs_info,
2268 struct btrfs_path *path,
2269 int level, int slot, u64 objectid)
2270 {
2271 struct extent_buffer *node;
2272 struct btrfs_disk_key disk_key;
2273 u32 nritems;
2274 u64 search;
2275 u64 target;
2276 u64 nread = 0;
2277 struct extent_buffer *eb;
2278 u32 nr;
2279 u32 blocksize;
2280 u32 nscan = 0;
2281
2282 if (level != 1)
2283 return;
2284
2285 if (!path->nodes[level])
2286 return;
2287
2288 node = path->nodes[level];
2289
2290 search = btrfs_node_blockptr(node, slot);
2291 blocksize = fs_info->nodesize;
2292 eb = find_extent_buffer(fs_info, search);
2293 if (eb) {
2294 free_extent_buffer(eb);
2295 return;
2296 }
2297
2298 target = search;
2299
2300 nritems = btrfs_header_nritems(node);
2301 nr = slot;
2302
2303 while (1) {
2304 if (path->reada == READA_BACK) {
2305 if (nr == 0)
2306 break;
2307 nr--;
2308 } else if (path->reada == READA_FORWARD) {
2309 nr++;
2310 if (nr >= nritems)
2311 break;
2312 }
2313 if (path->reada == READA_BACK && objectid) {
2314 btrfs_node_key(node, &disk_key, nr);
2315 if (btrfs_disk_key_objectid(&disk_key) != objectid)
2316 break;
2317 }
2318 search = btrfs_node_blockptr(node, nr);
2319 if ((search <= target && target - search <= 65536) ||
2320 (search > target && search - target <= 65536)) {
2321 readahead_tree_block(fs_info, search);
2322 nread += blocksize;
2323 }
2324 nscan++;
2325 if ((nread > 65536 || nscan > 32))
2326 break;
2327 }
2328 }
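
/*
 * Editorial sketch: the readahead window test used above. Neighbouring
 * blocks are read ahead only while they sit within 64KiB of the block we
 * actually need, and the scan stops after 64KiB of issued reads or 32
 * inspected slots. Standalone model of the distance check.
 */
#include <stdint.h>
#include <stdbool.h>

static bool within_reada_window(uint64_t search, uint64_t target)
{
	/* symmetric 64KiB window around the target block number */
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}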
2329
2330 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2331 struct btrfs_path *path, int level)
2332 {
2333 int slot;
2334 int nritems;
2335 struct extent_buffer *parent;
2336 struct extent_buffer *eb;
2337 u64 gen;
2338 u64 block1 = 0;
2339 u64 block2 = 0;
2340
2341 parent = path->nodes[level + 1];
2342 if (!parent)
2343 return;
2344
2345 nritems = btrfs_header_nritems(parent);
2346 slot = path->slots[level + 1];
2347
2348 if (slot > 0) {
2349 block1 = btrfs_node_blockptr(parent, slot - 1);
2350 gen = btrfs_node_ptr_generation(parent, slot - 1);
2351 eb = find_extent_buffer(fs_info, block1);
2352 /*
2353 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2354 * don't want to return EAGAIN here. That would loop
2355 * forever
2356 */
2357 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2358 block1 = 0;
2359 free_extent_buffer(eb);
2360 }
2361 if (slot + 1 < nritems) {
2362 block2 = btrfs_node_blockptr(parent, slot + 1);
2363 gen = btrfs_node_ptr_generation(parent, slot + 1);
2364 eb = find_extent_buffer(fs_info, block2);
2365 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2366 block2 = 0;
2367 free_extent_buffer(eb);
2368 }
2369
2370 if (block1)
2371 readahead_tree_block(fs_info, block1);
2372 if (block2)
2373 readahead_tree_block(fs_info, block2);
2374 }
2375
2376
2377 /*
2378 * when we walk down the tree, it is usually safe to unlock the higher layers
2379 * in the tree. The exceptions are when our path goes through slot 0, because
2380 * operations on the tree might require changing key pointers higher up in the
2381 * tree.
2382 *
2383 * callers might also have set path->keep_locks, which tells this code to keep
2384 * the lock if the path points to the last slot in the block. This is part of
2385 * walking through the tree, and selecting the next slot in the higher block.
2386 *
2387 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2388 * if lowest_unlock is 1, level 0 won't be unlocked
2389 */
2390 static noinline void unlock_up(struct btrfs_path *path, int level,
2391 int lowest_unlock, int min_write_lock_level,
2392 int *write_lock_level)
2393 {
2394 int i;
2395 int skip_level = level;
2396 int no_skips = 0;
2397 struct extent_buffer *t;
2398
2399 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2400 if (!path->nodes[i])
2401 break;
2402 if (!path->locks[i])
2403 break;
2404 if (!no_skips && path->slots[i] == 0) {
2405 skip_level = i + 1;
2406 continue;
2407 }
2408 if (!no_skips && path->keep_locks) {
2409 u32 nritems;
2410 t = path->nodes[i];
2411 nritems = btrfs_header_nritems(t);
2412 if (nritems < 1 || path->slots[i] >= nritems - 1) {
2413 skip_level = i + 1;
2414 continue;
2415 }
2416 }
2417 if (skip_level < i && i >= lowest_unlock)
2418 no_skips = 1;
2419
2420 t = path->nodes[i];
2421 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2422 btrfs_tree_unlock_rw(t, path->locks[i]);
2423 path->locks[i] = 0;
2424 if (write_lock_level &&
2425 i > min_write_lock_level &&
2426 i <= *write_lock_level) {
2427 *write_lock_level = i - 1;
2428 }
2429 }
2430 }
2431 }
2432
2433 /*
2434 * This releases any locks held in the path starting at level and
2435 * going all the way up to the root.
2436 *
2437 * btrfs_search_slot will keep the lock held on higher nodes in a few
2438 * corner cases, such as COW of the block at slot zero in the node. This
2439 * ignores those rules, and it should only be called when there are no
2440 * more updates to be done higher up in the tree.
2441 */
2442 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2443 {
2444 int i;
2445
2446 if (path->keep_locks)
2447 return;
2448
2449 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2450 if (!path->nodes[i])
2451 continue;
2452 if (!path->locks[i])
2453 continue;
2454 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2455 path->locks[i] = 0;
2456 }
2457 }
2458
2459 /*
2460 * helper function for btrfs_search_slot. The goal is to find a block
2461 * in cache without setting the path to blocking. If we find the block
2462 * we return zero and the path is unchanged.
2463 *
2464 * If we can't find the block, we set the path blocking and do some
2465 * reada. -EAGAIN is returned and the search must be repeated.
2466 */
2467 static int
2468 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2469 struct extent_buffer **eb_ret, int level, int slot,
2470 const struct btrfs_key *key)
2471 {
2472 struct btrfs_fs_info *fs_info = root->fs_info;
2473 u64 blocknr;
2474 u64 gen;
2475 struct extent_buffer *b = *eb_ret;
2476 struct extent_buffer *tmp;
2477 int ret;
2478
2479 blocknr = btrfs_node_blockptr(b, slot);
2480 gen = btrfs_node_ptr_generation(b, slot);
2481
2482 tmp = find_extent_buffer(fs_info, blocknr);
2483 if (tmp) {
2484 /* first we do an atomic uptodate check */
2485 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2486 *eb_ret = tmp;
2487 return 0;
2488 }
2489
2490 /* the pages were up to date, but we failed
2491 * the generation number check. Re-read the
2492 * block so its contents match the correct generation.
2493 * We must do this without dropping locks so
2494 * we can trust our generation number
2495 */
2496 btrfs_set_path_blocking(p);
2497
2498 /* now we're allowed to do a blocking uptodate check */
2499 ret = btrfs_read_buffer(tmp, gen);
2500 if (!ret) {
2501 *eb_ret = tmp;
2502 return 0;
2503 }
2504 free_extent_buffer(tmp);
2505 btrfs_release_path(p);
2506 return -EIO;
2507 }
2508
2509 /*
2510 * reduce lock contention at high levels
2511 * of the btree by dropping locks before
2512 * we read. Don't release the lock on the current
2513 * level because we need to walk this node to figure
2514 * out which blocks to read.
2515 */
2516 btrfs_unlock_up_safe(p, level + 1);
2517 btrfs_set_path_blocking(p);
2518
2519 free_extent_buffer(tmp);
2520 if (p->reada != READA_NONE)
2521 reada_for_search(fs_info, p, level, slot, key->objectid);
2522
2523 ret = -EAGAIN;
2524 tmp = read_tree_block(fs_info, blocknr, gen);
2525 if (!IS_ERR(tmp)) {
2526 /*
2527 * If the read above didn't mark this buffer up to date,
2528 * it will never end up being up to date. Set ret to EIO now
2529 * and give up so that our caller doesn't loop forever
2530 * on our EAGAINs.
2531 */
2532 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2533 ret = -EIO;
2534 free_extent_buffer(tmp);
2535 } else {
2536 ret = PTR_ERR(tmp);
2537 }
2538
2539 btrfs_release_path(p);
2540 return ret;
2541 }
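
/*
 * Editorial sketch: the -EAGAIN protocol above from a caller's point of
 * view. When the helper has to drop the path to read a block, the whole
 * descent restarts from the root. lookup_step_model() is a hypothetical
 * stand-in that misses its cache once; nothing here is a kernel API.
 */
#include <errno.h>

static int lookup_step_model(int *attempts)
{
	/* pretend the first attempt must release the path and re-read */
	return (*attempts)++ == 0 ? -EAGAIN : 0;
}

static int search_model(void)
{
	int attempts = 0;
	int err;

again:
	err = lookup_step_model(&attempts);
	if (err == -EAGAIN)
		goto again;	/* path was dropped; restart the descent */
	return err;
}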
2542
2543 /*
2544 * helper function for btrfs_search_slot. This does all of the checks
2545 * for node-level blocks and does any balancing required based on
2546 * the ins_len.
2547 *
2548 * If no extra work was required, zero is returned. If we had to
2549 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2550 * start over
2551 */
2552 static int
2553 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2554 struct btrfs_root *root, struct btrfs_path *p,
2555 struct extent_buffer *b, int level, int ins_len,
2556 int *write_lock_level)
2557 {
2558 struct btrfs_fs_info *fs_info = root->fs_info;
2559 int ret;
2560
2561 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2562 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2563 int sret;
2564
2565 if (*write_lock_level < level + 1) {
2566 *write_lock_level = level + 1;
2567 btrfs_release_path(p);
2568 goto again;
2569 }
2570
2571 btrfs_set_path_blocking(p);
2572 reada_for_balance(fs_info, p, level);
2573 sret = split_node(trans, root, p, level);
2574 btrfs_clear_path_blocking(p, NULL, 0);
2575
2576 BUG_ON(sret > 0);
2577 if (sret) {
2578 ret = sret;
2579 goto done;
2580 }
2581 b = p->nodes[level];
2582 } else if (ins_len < 0 && btrfs_header_nritems(b) <
2583 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2584 int sret;
2585
2586 if (*write_lock_level < level + 1) {
2587 *write_lock_level = level + 1;
2588 btrfs_release_path(p);
2589 goto again;
2590 }
2591
2592 btrfs_set_path_blocking(p);
2593 reada_for_balance(fs_info, p, level);
2594 sret = balance_level(trans, root, p, level);
2595 btrfs_clear_path_blocking(p, NULL, 0);
2596
2597 if (sret) {
2598 ret = sret;
2599 goto done;
2600 }
2601 b = p->nodes[level];
2602 if (!b) {
2603 btrfs_release_path(p);
2604 goto again;
2605 }
2606 BUG_ON(btrfs_header_nritems(b) == 1);
2607 }
2608 return 0;
2609
2610 again:
2611 ret = -EAGAIN;
2612 done:
2613 return ret;
2614 }
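
/*
 * Editorial sketch: the two thresholds setup_nodes_for_search applies
 * above. Inserts split a node while descending once it is nearly full;
 * deletes rebalance it once it drops below half full. Plain-integer model
 * (the real code also splits when path->search_for_split is set).
 */
enum node_fixup { SPLIT_NODE, BALANCE_NODE, NO_FIXUP };

static enum node_fixup node_fixup_needed(int ins_len, int nritems,
					 int nodeptrs_per_block)
{
	if (ins_len > 0 && nritems >= nodeptrs_per_block - 3)
		return SPLIT_NODE;	/* keep room for the insert below */
	if (ins_len < 0 && nritems < nodeptrs_per_block / 2)
		return BALANCE_NODE;	/* merge/push before deleting */
	return NO_FIXUP;
}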
2615
2616 static void key_search_validate(struct extent_buffer *b,
2617 const struct btrfs_key *key,
2618 int level)
2619 {
2620 #ifdef CONFIG_BTRFS_ASSERT
2621 struct btrfs_disk_key disk_key;
2622
2623 btrfs_cpu_key_to_disk(&disk_key, key);
2624
2625 if (level == 0)
2626 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2627 offsetof(struct btrfs_leaf, items[0].key),
2628 sizeof(disk_key)));
2629 else
2630 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2631 offsetof(struct btrfs_node, ptrs[0].key),
2632 sizeof(disk_key)));
2633 #endif
2634 }
2635
2636 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2637 int level, int *prev_cmp, int *slot)
2638 {
2639 if (*prev_cmp != 0) {
2640 *prev_cmp = bin_search(b, key, level, slot);
2641 return *prev_cmp;
2642 }
2643
2644 key_search_validate(b, key, level);
2645 *slot = 0;
2646
2647 return 0;
2648 }
2649
2650 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2651 u64 iobjectid, u64 ioff, u8 key_type,
2652 struct btrfs_key *found_key)
2653 {
2654 int ret;
2655 struct btrfs_key key;
2656 struct extent_buffer *eb;
2657
2658 ASSERT(path);
2659 ASSERT(found_key);
2660
2661 key.type = key_type;
2662 key.objectid = iobjectid;
2663 key.offset = ioff;
2664
2665 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2666 if (ret < 0)
2667 return ret;
2668
2669 eb = path->nodes[0];
2670 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2671 ret = btrfs_next_leaf(fs_root, path);
2672 if (ret)
2673 return ret;
2674 eb = path->nodes[0];
2675 }
2676
2677 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2678 if (found_key->type != key.type ||
2679 found_key->objectid != key.objectid)
2680 return 1;
2681
2682 return 0;
2683 }
2684
2685 /*
2686 * look for key in the tree. path is filled in with nodes along the way.
2687 * If key is found, we return zero and you can find the item in the leaf
2688 * level of the path (level 0)
2689 *
2690 * If the key isn't found, the path points to the slot where it should
2691 * be inserted, and 1 is returned. If there are other errors during the
2692 * search a negative error number is returned.
2693 *
2694 * if ins_len > 0, nodes and leaves will be split as we walk down the
2695 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
2696 * possible)
2697 */
2698 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2699 const struct btrfs_key *key, struct btrfs_path *p,
2700 int ins_len, int cow)
2701 {
2702 struct btrfs_fs_info *fs_info = root->fs_info;
2703 struct extent_buffer *b;
2704 int slot;
2705 int ret;
2706 int err;
2707 int level;
2708 int lowest_unlock = 1;
2709 int root_lock;
2710 /* everything at write_lock_level or lower must be write locked */
2711 int write_lock_level = 0;
2712 u8 lowest_level = 0;
2713 int min_write_lock_level;
2714 int prev_cmp;
2715
2716 lowest_level = p->lowest_level;
2717 WARN_ON(lowest_level && ins_len > 0);
2718 WARN_ON(p->nodes[0] != NULL);
2719 BUG_ON(!cow && ins_len);
2720
2721 if (ins_len < 0) {
2722 lowest_unlock = 2;
2723
2724 /* when we are removing items, we might have to go up to level
2725 * two as we update tree pointers. Make sure we keep write
2726 * locks on those levels as well
2727 */
2728 write_lock_level = 2;
2729 } else if (ins_len > 0) {
2730 /*
2731 * for inserting items, make sure we have a write lock on
2732 * level 1 so we can update keys
2733 */
2734 write_lock_level = 1;
2735 }
2736
2737 if (!cow)
2738 write_lock_level = -1;
2739
2740 if (cow && (p->keep_locks || p->lowest_level))
2741 write_lock_level = BTRFS_MAX_LEVEL;
2742
2743 min_write_lock_level = write_lock_level;
2744
2745 again:
2746 prev_cmp = -1;
2747 /*
2748 * we try very hard to do read locks on the root
2749 */
2750 root_lock = BTRFS_READ_LOCK;
2751 level = 0;
2752 if (p->search_commit_root) {
2753 /*
2754 * the commit roots are read only
2755 * so we always do read locks
2756 */
2757 if (p->need_commit_sem)
2758 down_read(&fs_info->commit_root_sem);
2759 b = root->commit_root;
2760 extent_buffer_get(b);
2761 level = btrfs_header_level(b);
2762 if (p->need_commit_sem)
2763 up_read(&fs_info->commit_root_sem);
2764 if (!p->skip_locking)
2765 btrfs_tree_read_lock(b);
2766 } else {
2767 if (p->skip_locking) {
2768 b = btrfs_root_node(root);
2769 level = btrfs_header_level(b);
2770 } else {
2771 /* we don't know the level of the root node
2772 * until we actually have it read locked
2773 */
2774 b = btrfs_read_lock_root_node(root);
2775 level = btrfs_header_level(b);
2776 if (level <= write_lock_level) {
2777 /* whoops, must trade for write lock */
2778 btrfs_tree_read_unlock(b);
2779 free_extent_buffer(b);
2780 b = btrfs_lock_root_node(root);
2781 root_lock = BTRFS_WRITE_LOCK;
2782
2783 /* the level might have changed, check again */
2784 level = btrfs_header_level(b);
2785 }
2786 }
2787 }
2788 p->nodes[level] = b;
2789 if (!p->skip_locking)
2790 p->locks[level] = root_lock;
2791
2792 while (b) {
2793 level = btrfs_header_level(b);
2794
2795 /*
2796 * setup the path here so we can release it under lock
2797 * contention with the cow code
2798 */
2799 if (cow) {
2800 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2801
2802 /*
2803 * if we don't really need to cow this block
2804 * then we don't want to set the path blocking,
2805 * so we test it here
2806 */
2807 if (!should_cow_block(trans, root, b)) {
2808 trans->dirty = true;
2809 goto cow_done;
2810 }
2811
2812 /*
2813 * must have write locks on this node and the
2814 * parent
2815 */
2816 if (level > write_lock_level ||
2817 (level + 1 > write_lock_level &&
2818 level + 1 < BTRFS_MAX_LEVEL &&
2819 p->nodes[level + 1])) {
2820 write_lock_level = level + 1;
2821 btrfs_release_path(p);
2822 goto again;
2823 }
2824
2825 btrfs_set_path_blocking(p);
2826 if (last_level)
2827 err = btrfs_cow_block(trans, root, b, NULL, 0,
2828 &b);
2829 else
2830 err = btrfs_cow_block(trans, root, b,
2831 p->nodes[level + 1],
2832 p->slots[level + 1], &b);
2833 if (err) {
2834 ret = err;
2835 goto done;
2836 }
2837 }
2838 cow_done:
2839 p->nodes[level] = b;
2840 btrfs_clear_path_blocking(p, NULL, 0);
2841
2842 /*
2843 * we have a lock on b and as long as we aren't changing
2844 * the tree, there is no way for the items in b to change.
2845 * It is safe to drop the lock on our parent before we
2846 * go through the expensive btree search on b.
2847 *
2848 * If we're inserting or deleting (ins_len != 0), then we might
2849 * be changing slot zero, which may require changing the parent.
2850 * So, we can't drop the lock until after we know which slot
2851 * we're operating on.
2852 */
2853 if (!ins_len && !p->keep_locks) {
2854 int u = level + 1;
2855
2856 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2857 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2858 p->locks[u] = 0;
2859 }
2860 }
2861
2862 ret = key_search(b, key, level, &prev_cmp, &slot);
2863 if (ret < 0)
2864 goto done;
2865
2866 if (level != 0) {
2867 int dec = 0;
2868 if (ret && slot > 0) {
2869 dec = 1;
2870 slot -= 1;
2871 }
2872 p->slots[level] = slot;
2873 err = setup_nodes_for_search(trans, root, p, b, level,
2874 ins_len, &write_lock_level);
2875 if (err == -EAGAIN)
2876 goto again;
2877 if (err) {
2878 ret = err;
2879 goto done;
2880 }
2881 b = p->nodes[level];
2882 slot = p->slots[level];
2883
2884 /*
2885 * slot 0 is special, if we change the key
2886 * we have to update the parent pointer
2887 * which means we must have a write lock
2888 * on the parent
2889 */
2890 if (slot == 0 && ins_len &&
2891 write_lock_level < level + 1) {
2892 write_lock_level = level + 1;
2893 btrfs_release_path(p);
2894 goto again;
2895 }
2896
2897 unlock_up(p, level, lowest_unlock,
2898 min_write_lock_level, &write_lock_level);
2899
2900 if (level == lowest_level) {
2901 if (dec)
2902 p->slots[level]++;
2903 goto done;
2904 }
2905
2906 err = read_block_for_search(root, p, &b, level,
2907 slot, key);
2908 if (err == -EAGAIN)
2909 goto again;
2910 if (err) {
2911 ret = err;
2912 goto done;
2913 }
2914
2915 if (!p->skip_locking) {
2916 level = btrfs_header_level(b);
2917 if (level <= write_lock_level) {
2918 err = btrfs_try_tree_write_lock(b);
2919 if (!err) {
2920 btrfs_set_path_blocking(p);
2921 btrfs_tree_lock(b);
2922 btrfs_clear_path_blocking(p, b,
2923 BTRFS_WRITE_LOCK);
2924 }
2925 p->locks[level] = BTRFS_WRITE_LOCK;
2926 } else {
2927 err = btrfs_tree_read_lock_atomic(b);
2928 if (!err) {
2929 btrfs_set_path_blocking(p);
2930 btrfs_tree_read_lock(b);
2931 btrfs_clear_path_blocking(p, b,
2932 BTRFS_READ_LOCK);
2933 }
2934 p->locks[level] = BTRFS_READ_LOCK;
2935 }
2936 p->nodes[level] = b;
2937 }
2938 } else {
2939 p->slots[level] = slot;
2940 if (ins_len > 0 &&
2941 btrfs_leaf_free_space(fs_info, b) < ins_len) {
2942 if (write_lock_level < 1) {
2943 write_lock_level = 1;
2944 btrfs_release_path(p);
2945 goto again;
2946 }
2947
2948 btrfs_set_path_blocking(p);
2949 err = split_leaf(trans, root, key,
2950 p, ins_len, ret == 0);
2951 btrfs_clear_path_blocking(p, NULL, 0);
2952
2953 BUG_ON(err > 0);
2954 if (err) {
2955 ret = err;
2956 goto done;
2957 }
2958 }
2959 if (!p->search_for_split)
2960 unlock_up(p, level, lowest_unlock,
2961 min_write_lock_level, &write_lock_level);
2962 goto done;
2963 }
2964 }
2965 ret = 1;
2966 done:
2967 /*
2968 * we don't really know what they plan on doing with the path
2969 * from here on, so for now just mark it as blocking
2970 */
2971 if (!p->leave_spinning)
2972 btrfs_set_path_blocking(p);
2973 if (ret < 0 && !p->skip_release_on_error)
2974 btrfs_release_path(p);
2975 return ret;
2976 }
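
/*
 * Editorial sketch: the usual read-only calling pattern for the function
 * above (kernel context assumed, error handling abbreviated). A return of
 * 0 means the key sits at path->slots[0] of the leaf; 1 means it is
 * missing and the path points at its would-be slot.
 */
#if 0	/* illustrative fragment, not compiled */
	struct btrfs_path *path;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;	/* hard error during the search */
	if (ret == 0)		/* exact match: read the key back out */
		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
out:
	btrfs_free_path(path);
	return ret;
#endif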
2977
2978 /*
2979 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2980 * current state of the tree together with the operations recorded in the tree
2981 * modification log to search for the key in a previous version of this tree, as
2982 * denoted by the time_seq parameter.
2983 *
2984 * Naturally, there is no support for insert, delete or cow operations.
2985 *
2986 * The resulting path and return value will be set up as if we called
2987 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2988 */
2989 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2990 struct btrfs_path *p, u64 time_seq)
2991 {
2992 struct btrfs_fs_info *fs_info = root->fs_info;
2993 struct extent_buffer *b;
2994 int slot;
2995 int ret;
2996 int err;
2997 int level;
2998 int lowest_unlock = 1;
2999 u8 lowest_level = 0;
3000 int prev_cmp = -1;
3001
3002 lowest_level = p->lowest_level;
3003 WARN_ON(p->nodes[0] != NULL);
3004
3005 if (p->search_commit_root) {
3006 BUG_ON(time_seq);
3007 return btrfs_search_slot(NULL, root, key, p, 0, 0);
3008 }
3009
3010 again:
3011 b = get_old_root(root, time_seq);
3012 if (!b) {
3013 ret = -EIO;
3014 goto done;
3015 }
3016 level = btrfs_header_level(b);
3017 p->locks[level] = BTRFS_READ_LOCK;
3018
3019 while (b) {
3020 level = btrfs_header_level(b);
3021 p->nodes[level] = b;
3022 btrfs_clear_path_blocking(p, NULL, 0);
3023
3024 /*
3025 * we have a lock on b and as long as we aren't changing
3026 * the tree, there is no way for the items in b to change.
3027 * It is safe to drop the lock on our parent before we
3028 * go through the expensive btree search on b.
3029 */
3030 btrfs_unlock_up_safe(p, level + 1);
3031
3032 /*
3033 * Since we can unwind extent buffers we want to do a real search every
3034 * time.
3035 */
3036 prev_cmp = -1;
3037 ret = key_search(b, key, level, &prev_cmp, &slot);
3038
3039 if (level != 0) {
3040 int dec = 0;
3041 if (ret && slot > 0) {
3042 dec = 1;
3043 slot -= 1;
3044 }
3045 p->slots[level] = slot;
3046 unlock_up(p, level, lowest_unlock, 0, NULL);
3047
3048 if (level == lowest_level) {
3049 if (dec)
3050 p->slots[level]++;
3051 goto done;
3052 }
3053
3054 err = read_block_for_search(root, p, &b, level,
3055 slot, key);
3056 if (err == -EAGAIN)
3057 goto again;
3058 if (err) {
3059 ret = err;
3060 goto done;
3061 }
3062
3063 level = btrfs_header_level(b);
3064 err = btrfs_tree_read_lock_atomic(b);
3065 if (!err) {
3066 btrfs_set_path_blocking(p);
3067 btrfs_tree_read_lock(b);
3068 btrfs_clear_path_blocking(p, b,
3069 BTRFS_READ_LOCK);
3070 }
3071 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3072 if (!b) {
3073 ret = -ENOMEM;
3074 goto done;
3075 }
3076 p->locks[level] = BTRFS_READ_LOCK;
3077 p->nodes[level] = b;
3078 } else {
3079 p->slots[level] = slot;
3080 unlock_up(p, level, lowest_unlock, 0, NULL);
3081 goto done;
3082 }
3083 }
3084 ret = 1;
3085 done:
3086 if (!p->leave_spinning)
3087 btrfs_set_path_blocking(p);
3088 if (ret < 0)
3089 btrfs_release_path(p);
3090
3091 return ret;
3092 }
3093
3094 /*
3095 * helper to use instead of search slot if no exact match is needed but
3096 * instead the next or previous item should be returned.
3097 * When find_higher is true, the next higher item is returned, the next lower
3098 * otherwise.
3099 * When return_any and find_higher are both true, and no higher item is found,
3100 * return the next lower instead.
3101 * When return_any is true and find_higher is false, and no lower item is found,
3102 * return the next higher instead.
3103 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3104 * < 0 on error
3105 */
3106 int btrfs_search_slot_for_read(struct btrfs_root *root,
3107 const struct btrfs_key *key,
3108 struct btrfs_path *p, int find_higher,
3109 int return_any)
3110 {
3111 int ret;
3112 struct extent_buffer *leaf;
3113
3114 again:
3115 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3116 if (ret <= 0)
3117 return ret;
3118 /*
3119 * a return value of 1 means the path is at the position where the
3120 * item should be inserted. Normally this is the next bigger item,
3121 * but in case the previous item is the last in a leaf, path points
3122 * to the first free slot in the previous leaf, i.e. at an invalid
3123 * item.
3124 */
3125 leaf = p->nodes[0];
3126
3127 if (find_higher) {
3128 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3129 ret = btrfs_next_leaf(root, p);
3130 if (ret <= 0)
3131 return ret;
3132 if (!return_any)
3133 return 1;
3134 /*
3135 * no higher item found, return the next
3136 * lower instead
3137 */
3138 return_any = 0;
3139 find_higher = 0;
3140 btrfs_release_path(p);
3141 goto again;
3142 }
3143 } else {
3144 if (p->slots[0] == 0) {
3145 ret = btrfs_prev_leaf(root, p);
3146 if (ret < 0)
3147 return ret;
3148 if (!ret) {
3149 leaf = p->nodes[0];
3150 if (p->slots[0] == btrfs_header_nritems(leaf))
3151 p->slots[0]--;
3152 return 0;
3153 }
3154 if (!return_any)
3155 return 1;
3156 /*
3157 * no lower item found, return the next
3158 * higher instead
3159 */
3160 return_any = 0;
3161 find_higher = 1;
3162 btrfs_release_path(p);
3163 goto again;
3164 } else {
3165 --p->slots[0];
3166 }
3167 }
3168 return 0;
3169 }
3170
3171 /*
3172 * adjust the pointers going up the tree, starting at level
3173 * making sure the right key of each node points to 'key'.
3174 * This is used after shifting pointers to the left, so it stops
3175 * fixing up pointers when a given leaf/node is not in slot 0 of the
3176 * higher levels
3177 *
3178 */
3179 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3180 struct btrfs_path *path,
3181 struct btrfs_disk_key *key, int level)
3182 {
3183 int i;
3184 struct extent_buffer *t;
3185
3186 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3187 int tslot = path->slots[i];
3188 if (!path->nodes[i])
3189 break;
3190 t = path->nodes[i];
3191 tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3192 btrfs_set_node_key(t, key, tslot);
3193 btrfs_mark_buffer_dirty(path->nodes[i]);
3194 if (tslot != 0)
3195 break;
3196 }
3197 }
3198
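/*
 * Editorial sketch: the propagation rule in fixup_low_keys above. The new
 * key only needs to bubble upward while the child occupies slot 0 of its
 * parent; the first non-zero slot ends the walk, because parent keys above
 * that point are unaffected. Plain-array model.
 */
static void fixup_low_keys_model(const int *slots, int *levels_touched,
				 int level, int max_level)
{
	int i;

	*levels_touched = 0;
	for (i = level; i < max_level; i++) {
		(*levels_touched)++;	/* this level's key gets rewritten */
		if (slots[i] != 0)
			break;		/* not in slot 0: stop propagating */
	}
}
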
3199 /*
3200 * update item key.
3201 *
3202 * This function isn't completely safe. It's the caller's responsibility
3203 * that the new key won't break the order
3204 */
3205 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3206 struct btrfs_path *path,
3207 const struct btrfs_key *new_key)
3208 {
3209 struct btrfs_disk_key disk_key;
3210 struct extent_buffer *eb;
3211 int slot;
3212
3213 eb = path->nodes[0];
3214 slot = path->slots[0];
3215 if (slot > 0) {
3216 btrfs_item_key(eb, &disk_key, slot - 1);
3217 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3218 }
3219 if (slot < btrfs_header_nritems(eb) - 1) {
3220 btrfs_item_key(eb, &disk_key, slot + 1);
3221 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3222 }
3223
3224 btrfs_cpu_key_to_disk(&disk_key, new_key);
3225 btrfs_set_item_key(eb, &disk_key, slot);
3226 btrfs_mark_buffer_dirty(eb);
3227 if (slot == 0)
3228 fixup_low_keys(fs_info, path, &disk_key, 1);
3229 }
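
/*
 * Editorial sketch: the ordering invariant enforced by the BUG_ONs above.
 * Replacing a key in place is only safe if the new key still sorts
 * strictly between its neighbours. Reuses the model_key helpers from the
 * earlier bin-search sketch; NULL stands for a missing neighbour.
 */
#include <stdbool.h>

static bool key_replace_is_safe(const struct model_key *prev,
				const struct model_key *next,
				const struct model_key *new_key)
{
	return (!prev || model_comp_keys(prev, new_key) < 0) &&
	       (!next || model_comp_keys(next, new_key) > 0);
}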
3230
3231 /*
3232 * try to push data from one node into the next node left in the
3233 * tree.
3234 *
3235 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3236 * error, and > 0 if there was no room in the left hand block.
3237 */
3238 static int push_node_left(struct btrfs_trans_handle *trans,
3239 struct btrfs_fs_info *fs_info,
3240 struct extent_buffer *dst,
3241 struct extent_buffer *src, int empty)
3242 {
3243 int push_items = 0;
3244 int src_nritems;
3245 int dst_nritems;
3246 int ret = 0;
3247
3248 src_nritems = btrfs_header_nritems(src);
3249 dst_nritems = btrfs_header_nritems(dst);
3250 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3251 WARN_ON(btrfs_header_generation(src) != trans->transid);
3252 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3253
3254 if (!empty && src_nritems <= 8)
3255 return 1;
3256
3257 if (push_items <= 0)
3258 return 1;
3259
3260 if (empty) {
3261 push_items = min(src_nritems, push_items);
3262 if (push_items < src_nritems) {
3263 /* leave at least 8 pointers in the node if
3264 * we aren't going to empty it
3265 */
3266 if (src_nritems - push_items < 8) {
3267 if (push_items <= 8)
3268 return 1;
3269 push_items -= 8;
3270 }
3271 }
3272 } else
3273 push_items = min(src_nritems - 8, push_items);
3274
3275 ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3276 push_items);
3277 if (ret) {
3278 btrfs_abort_transaction(trans, ret);
3279 return ret;
3280 }
3281 copy_extent_buffer(dst, src,
3282 btrfs_node_key_ptr_offset(dst_nritems),
3283 btrfs_node_key_ptr_offset(0),
3284 push_items * sizeof(struct btrfs_key_ptr));
3285
3286 if (push_items < src_nritems) {
3287 /*
3288 * don't call tree_mod_log_eb_move here, key removal was already
3289 * fully logged by tree_mod_log_eb_copy above.
3290 */
3291 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3292 btrfs_node_key_ptr_offset(push_items),
3293 (src_nritems - push_items) *
3294 sizeof(struct btrfs_key_ptr));
3295 }
3296 btrfs_set_header_nritems(src, src_nritems - push_items);
3297 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3298 btrfs_mark_buffer_dirty(src);
3299 btrfs_mark_buffer_dirty(dst);
3300
3301 return ret;
3302 }
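
/*
 * Editorial sketch: how many pointers push_node_left above will actually
 * move, given the destination's free slots. Unless the source is being
 * emptied on purpose, it is always left with at least 8 pointers so that
 * later deletions don't immediately rebalance it. Plain-integer model;
 * a count of 0 corresponds to the real function returning 1 (no push).
 */
static int push_left_count_model(int dst_free, int src_nritems, int empty)
{
	int push_items = dst_free;

	if (push_items <= 0)
		return 0;			/* destination is full */
	if (!empty && src_nritems <= 8)
		return 0;			/* source too small to tap */
	if (empty) {
		if (push_items > src_nritems)
			push_items = src_nritems;
		if (push_items < src_nritems && src_nritems - push_items < 8) {
			if (push_items <= 8)
				return 0;	/* would leave src too thin */
			push_items -= 8;
		}
	} else {
		if (push_items > src_nritems - 8)
			push_items = src_nritems - 8;
	}
	return push_items;
}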
3303
3304 /*
3305 * try to push data from one node into the next node right in the
3306 * tree.
3307 *
3308 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3309 * error, and > 0 if there was no room in the right hand block.
3310 *
3311 * this will only push up to 1/2 the contents of the left node over
3312 */
3313 static int balance_node_right(struct btrfs_trans_handle *trans,
3314 struct btrfs_fs_info *fs_info,
3315 struct extent_buffer *dst,
3316 struct extent_buffer *src)
3317 {
3318 int push_items = 0;
3319 int max_push;
3320 int src_nritems;
3321 int dst_nritems;
3322 int ret = 0;
3323
3324 WARN_ON(btrfs_header_generation(src) != trans->transid);
3325 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3326
3327 src_nritems = btrfs_header_nritems(src);
3328 dst_nritems = btrfs_header_nritems(dst);
3329 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3330 if (push_items <= 0)
3331 return 1;
3332
3333 if (src_nritems < 4)
3334 return 1;
3335
3336 max_push = src_nritems / 2 + 1;
3337 /* don't try to empty the node */
3338 if (max_push >= src_nritems)
3339 return 1;
3340
3341 if (max_push < push_items)
3342 push_items = max_push;
3343
3344 tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
3345 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3346 btrfs_node_key_ptr_offset(0),
3347 (dst_nritems) *
3348 sizeof(struct btrfs_key_ptr));
3349
3350 ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3351 src_nritems - push_items, push_items);
3352 if (ret) {
3353 btrfs_abort_transaction(trans, ret);
3354 return ret;
3355 }
3356 copy_extent_buffer(dst, src,
3357 btrfs_node_key_ptr_offset(0),
3358 btrfs_node_key_ptr_offset(src_nritems - push_items),
3359 push_items * sizeof(struct btrfs_key_ptr));
3360
3361 btrfs_set_header_nritems(src, src_nritems - push_items);
3362 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3363
3364 btrfs_mark_buffer_dirty(src);
3365 btrfs_mark_buffer_dirty(dst);
3366
3367 return ret;
3368 }
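
/*
 * Editorial sketch: the mirror-image bound used by balance_node_right
 * above. At most about half of the source node may move right, and the
 * source is never emptied. Plain-integer model, same conventions as the
 * push-left sketch.
 */
static int push_right_count_model(int dst_free, int src_nritems)
{
	int max_push;

	if (dst_free <= 0 || src_nritems < 4)
		return 0;		/* no room, or source too small */
	max_push = src_nritems / 2 + 1;
	if (max_push >= src_nritems)
		return 0;		/* don't try to empty the node */
	return dst_free < max_push ? dst_free : max_push;
}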
3369
3370 /*
3371 * helper function to insert a new root level in the tree.
3372 * A new node is allocated, and a single item is inserted to
3373 * point to the existing root
3374 *
3375 * returns zero on success or < 0 on failure.
3376 */
3377 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3378 struct btrfs_root *root,
3379 struct btrfs_path *path, int level)
3380 {
3381 struct btrfs_fs_info *fs_info = root->fs_info;
3382 u64 lower_gen;
3383 struct extent_buffer *lower;
3384 struct extent_buffer *c;
3385 struct extent_buffer *old;
3386 struct btrfs_disk_key lower_key;
3387
3388 BUG_ON(path->nodes[level]);
3389 BUG_ON(path->nodes[level-1] != root->node);
3390
3391 lower = path->nodes[level-1];
3392 if (level == 1)
3393 btrfs_item_key(lower, &lower_key, 0);
3394 else
3395 btrfs_node_key(lower, &lower_key, 0);
3396
3397 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3398 root->node->start, 0);
3399 if (IS_ERR(c))
3400 return PTR_ERR(c);
3401
3402 root_add_used(root, fs_info->nodesize);
3403
3404 memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
3405 btrfs_set_header_nritems(c, 1);
3406 btrfs_set_header_level(c, level);
3407 btrfs_set_header_bytenr(c, c->start);
3408 btrfs_set_header_generation(c, trans->transid);
3409 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3410 btrfs_set_header_owner(c, root->root_key.objectid);
3411
3412 write_extent_buffer_fsid(c, fs_info->fsid);
3413 write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);
3414
3415 btrfs_set_node_key(c, &lower_key, 0);
3416 btrfs_set_node_blockptr(c, 0, lower->start);
3417 lower_gen = btrfs_header_generation(lower);
3418 WARN_ON(lower_gen != trans->transid);
3419
3420 btrfs_set_node_ptr_generation(c, 0, lower_gen);
3421
3422 btrfs_mark_buffer_dirty(c);
3423
3424 old = root->node;
3425 tree_mod_log_set_root_pointer(root, c, 0);
3426 rcu_assign_pointer(root->node, c);
3427
3428 /* the super has an extra ref to root->node */
3429 free_extent_buffer(old);
3430
3431 add_root_to_dirty_list(root);
3432 extent_buffer_get(c);
3433 path->nodes[level] = c;
3434 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3435 path->slots[level] = 0;
3436 return 0;
3437 }
3438
3439 /*
3440 * worker function to insert a single pointer in a node.
3441 * the node should have enough room for the pointer already
3442 *
3443 * slot and level indicate where you want the key to go, and
3444 * blocknr is the block the key points to.
3445 */
3446 static void insert_ptr(struct btrfs_trans_handle *trans,
3447 struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3448 struct btrfs_disk_key *key, u64 bytenr,
3449 int slot, int level)
3450 {
3451 struct extent_buffer *lower;
3452 int nritems;
3453 int ret;
3454
3455 BUG_ON(!path->nodes[level]);
3456 btrfs_assert_tree_locked(path->nodes[level]);
3457 lower = path->nodes[level];
3458 nritems = btrfs_header_nritems(lower);
3459 BUG_ON(slot > nritems);
3460 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3461 if (slot != nritems) {
3462 if (level)
3463 tree_mod_log_eb_move(fs_info, lower, slot + 1,
3464 slot, nritems - slot);
3465 memmove_extent_buffer(lower,
3466 btrfs_node_key_ptr_offset(slot + 1),
3467 btrfs_node_key_ptr_offset(slot),
3468 (nritems - slot) * sizeof(struct btrfs_key_ptr));
3469 }
3470 if (level) {
3471 ret = tree_mod_log_insert_key(fs_info, lower, slot,
3472 MOD_LOG_KEY_ADD, GFP_NOFS);
3473 BUG_ON(ret < 0);
3474 }
3475 btrfs_set_node_key(lower, key, slot);
3476 btrfs_set_node_blockptr(lower, slot, bytenr);
3477 WARN_ON(trans->transid == 0);
3478 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3479 btrfs_set_header_nritems(lower, nritems + 1);
3480 btrfs_mark_buffer_dirty(lower);
3481 }
3482
3483 /*
3484 * split the node at the specified level in path in two.
3485 * The path is corrected to point to the appropriate node after the split
3486 *
3487 * Before splitting this tries to make some room in the node by pushing
3488 * left and right, if either one works, it returns right away.
3489 *
3490 * returns 0 on success and < 0 on failure
3491 */
3492 static noinline int split_node(struct btrfs_trans_handle *trans,
3493 struct btrfs_root *root,
3494 struct btrfs_path *path, int level)
3495 {
3496 struct btrfs_fs_info *fs_info = root->fs_info;
3497 struct extent_buffer *c;
3498 struct extent_buffer *split;
3499 struct btrfs_disk_key disk_key;
3500 int mid;
3501 int ret;
3502 u32 c_nritems;
3503
3504 c = path->nodes[level];
3505 WARN_ON(btrfs_header_generation(c) != trans->transid);
3506 if (c == root->node) {
3507 /*
3508 * trying to split the root, let's make a new one
3509 *
3510 * tree mod log: We don't log removal of the old root in
3511 * insert_new_root, because that root buffer will be kept as a
3512 * normal node. We are going to log removal of half of the
3513 * elements below with tree_mod_log_eb_copy. We're holding a
3514 * tree lock on the buffer, which is why we cannot race with
3515 * other tree_mod_log users.
3516 */
3517 ret = insert_new_root(trans, root, path, level + 1);
3518 if (ret)
3519 return ret;
3520 } else {
3521 ret = push_nodes_for_insert(trans, root, path, level);
3522 c = path->nodes[level];
3523 if (!ret && btrfs_header_nritems(c) <
3524 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3525 return 0;
3526 if (ret < 0)
3527 return ret;
3528 }
3529
3530 c_nritems = btrfs_header_nritems(c);
3531 mid = (c_nritems + 1) / 2;
3532 btrfs_node_key(c, &disk_key, mid);
3533
3534 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3535 c->start, 0);
3536 if (IS_ERR(split))
3537 return PTR_ERR(split);
3538
3539 root_add_used(root, fs_info->nodesize);
3540
3541 memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
3542 btrfs_set_header_level(split, btrfs_header_level(c));
3543 btrfs_set_header_bytenr(split, split->start);
3544 btrfs_set_header_generation(split, trans->transid);
3545 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3546 btrfs_set_header_owner(split, root->root_key.objectid);
3547 write_extent_buffer_fsid(split, fs_info->fsid);
3548 write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);
3549
3550 ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3551 if (ret) {
3552 btrfs_abort_transaction(trans, ret);
3553 return ret;
3554 }
3555 copy_extent_buffer(split, c,
3556 btrfs_node_key_ptr_offset(0),
3557 btrfs_node_key_ptr_offset(mid),
3558 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3559 btrfs_set_header_nritems(split, c_nritems - mid);
3560 btrfs_set_header_nritems(c, mid);
3561 ret = 0;
3562
3563 btrfs_mark_buffer_dirty(c);
3564 btrfs_mark_buffer_dirty(split);
3565
3566 insert_ptr(trans, fs_info, path, &disk_key, split->start,
3567 path->slots[level + 1] + 1, level + 1);
3568
3569 if (path->slots[level] >= mid) {
3570 path->slots[level] -= mid;
3571 btrfs_tree_unlock(c);
3572 free_extent_buffer(c);
3573 path->nodes[level] = split;
3574 path->slots[level + 1] += 1;
3575 } else {
3576 btrfs_tree_unlock(split);
3577 free_extent_buffer(split);
3578 }
3579 return ret;
3580 }
3581
3582 /*
3583 * how many bytes are required to store the items in a leaf. start
3584 * and nr indicate which items in the leaf to check. This totals up the
3585 * space used both by the item structs and the item data
3586 */
3587 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3588 {
3589 struct btrfs_item *start_item;
3590 struct btrfs_item *end_item;
3591 struct btrfs_map_token token;
3592 int data_len;
3593 int nritems = btrfs_header_nritems(l);
3594 int end = min(nritems, start + nr) - 1;
3595
3596 if (!nr)
3597 return 0;
3598 btrfs_init_map_token(&token);
3599 start_item = btrfs_item_nr(start);
3600 end_item = btrfs_item_nr(end);
3601 data_len = btrfs_token_item_offset(l, start_item, &token) +
3602 btrfs_token_item_size(l, start_item, &token);
3603 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3604 data_len += sizeof(struct btrfs_item) * nr;
3605 WARN_ON(data_len < 0);
3606 return data_len;
3607 }
3608
3609 /*
3610 * The space between the end of the leaf items and
3611 * the start of the leaf data. IOW, how much room
3612 * the leaf has left for both items and data
3613 */
3614 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3615 struct extent_buffer *leaf)
3616 {
3617 int nritems = btrfs_header_nritems(leaf);
3618 int ret;
3619
3620 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3621 if (ret < 0) {
3622 btrfs_crit(fs_info,
3623 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3624 ret,
3625 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3626 leaf_space_used(leaf, 0, nritems), nritems);
3627 }
3628 return ret;
3629 }
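
/*
 * Editorial sketch: the leaf layout the two functions above account for.
 * Item headers grow forward from the start of the leaf data area while
 * item payloads grow backward from its end; free space is the gap in the
 * middle. The model assumes a compact leaf whose first item's payload
 * ends exactly at the end of the data area.
 */
#include <stdint.h>

struct model_leaf_item { uint32_t offset; uint32_t size; };

static int model_leaf_free_space(uint32_t leaf_data_size,
				 const struct model_leaf_item *items,
				 int nritems)
{
	uint32_t used = nritems * (uint32_t)sizeof(struct model_leaf_item);

	if (nritems)	/* payloads span [items[n-1].offset, data size) */
		used += leaf_data_size - items[nritems - 1].offset;
	return (int)(leaf_data_size - used);
}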
3630
3631 /*
3632 * min slot controls the lowest index we're willing to push to the
3633 * right. We'll push up to and including min_slot, but no lower
3634 */
3635 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3636 struct btrfs_path *path,
3637 int data_size, int empty,
3638 struct extent_buffer *right,
3639 int free_space, u32 left_nritems,
3640 u32 min_slot)
3641 {
3642 struct extent_buffer *left = path->nodes[0];
3643 struct extent_buffer *upper = path->nodes[1];
3644 struct btrfs_map_token token;
3645 struct btrfs_disk_key disk_key;
3646 int slot;
3647 u32 i;
3648 int push_space = 0;
3649 int push_items = 0;
3650 struct btrfs_item *item;
3651 u32 nr;
3652 u32 right_nritems;
3653 u32 data_end;
3654 u32 this_item_size;
3655
3656 btrfs_init_map_token(&token);
3657
3658 if (empty)
3659 nr = 0;
3660 else
3661 nr = max_t(u32, 1, min_slot);
3662
3663 if (path->slots[0] >= left_nritems)
3664 push_space += data_size;
3665
3666 slot = path->slots[1];
3667 i = left_nritems - 1;
3668 while (i >= nr) {
3669 item = btrfs_item_nr(i);
3670
3671 if (!empty && push_items > 0) {
3672 if (path->slots[0] > i)
3673 break;
3674 if (path->slots[0] == i) {
3675 int space = btrfs_leaf_free_space(fs_info, left);
3676 if (space + push_space * 2 > free_space)
3677 break;
3678 }
3679 }
3680
3681 if (path->slots[0] == i)
3682 push_space += data_size;
3683
3684 this_item_size = btrfs_item_size(left, item);
3685 if (this_item_size + sizeof(*item) + push_space > free_space)
3686 break;
3687
3688 push_items++;
3689 push_space += this_item_size + sizeof(*item);
3690 if (i == 0)
3691 break;
3692 i--;
3693 }
3694
3695 if (push_items == 0)
3696 goto out_unlock;
3697
3698 WARN_ON(!empty && push_items == left_nritems);
3699
3700 /* push left to right */
3701 right_nritems = btrfs_header_nritems(right);
3702
3703 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3704 push_space -= leaf_data_end(fs_info, left);
3705
3706 /* make room in the right data area */
3707 data_end = leaf_data_end(fs_info, right);
3708 memmove_extent_buffer(right,
3709 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3710 BTRFS_LEAF_DATA_OFFSET + data_end,
3711 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3712
3713 /* copy from the left data area */
3714 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3715 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3716 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3717 push_space);
3718
3719 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3720 btrfs_item_nr_offset(0),
3721 right_nritems * sizeof(struct btrfs_item));
3722
3723 /* copy the items from left to right */
3724 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3725 btrfs_item_nr_offset(left_nritems - push_items),
3726 push_items * sizeof(struct btrfs_item));
3727
3728 /* update the item pointers */
3729 right_nritems += push_items;
3730 btrfs_set_header_nritems(right, right_nritems);
3731 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3732 for (i = 0; i < right_nritems; i++) {
3733 item = btrfs_item_nr(i);
3734 push_space -= btrfs_token_item_size(right, item, &token);
3735 btrfs_set_token_item_offset(right, item, push_space, &token);
3736 }
3737
3738 left_nritems -= push_items;
3739 btrfs_set_header_nritems(left, left_nritems);
3740
3741 if (left_nritems)
3742 btrfs_mark_buffer_dirty(left);
3743 else
3744 clean_tree_block(fs_info, left);
3745
3746 btrfs_mark_buffer_dirty(right);
3747
3748 btrfs_item_key(right, &disk_key, 0);
3749 btrfs_set_node_key(upper, &disk_key, slot + 1);
3750 btrfs_mark_buffer_dirty(upper);
3751
3752 /* then fixup the leaf pointer in the path */
3753 if (path->slots[0] >= left_nritems) {
3754 path->slots[0] -= left_nritems;
3755 if (btrfs_header_nritems(path->nodes[0]) == 0)
3756 clean_tree_block(fs_info, path->nodes[0]);
3757 btrfs_tree_unlock(path->nodes[0]);
3758 free_extent_buffer(path->nodes[0]);
3759 path->nodes[0] = right;
3760 path->slots[1] += 1;
3761 } else {
3762 btrfs_tree_unlock(right);
3763 free_extent_buffer(right);
3764 }
3765 return 0;
3766
3767 out_unlock:
3768 btrfs_tree_unlock(right);
3769 free_extent_buffer(right);
3770 return 1;
3771 }
3772
3773 /*
3774 * push some data in the path leaf to the right, trying to free up at
3775 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3776 *
3777 * returns 1 if the push failed because the other node didn't have enough
3778 * room, 0 if everything worked out and < 0 if there were major errors.
3779 *
3780 * this will push starting from min_slot to the end of the leaf. It won't
3781 * push any slot lower than min_slot
3782 */
3783 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3784 *root, struct btrfs_path *path,
3785 int min_data_size, int data_size,
3786 int empty, u32 min_slot)
3787 {
3788 struct btrfs_fs_info *fs_info = root->fs_info;
3789 struct extent_buffer *left = path->nodes[0];
3790 struct extent_buffer *right;
3791 struct extent_buffer *upper;
3792 int slot;
3793 int free_space;
3794 u32 left_nritems;
3795 int ret;
3796
3797 if (!path->nodes[1])
3798 return 1;
3799
3800 slot = path->slots[1];
3801 upper = path->nodes[1];
3802 if (slot >= btrfs_header_nritems(upper) - 1)
3803 return 1;
3804
3805 btrfs_assert_tree_locked(path->nodes[1]);
3806
3807 right = read_node_slot(fs_info, upper, slot + 1);
3808 /*
3809 * If slot + 1 is not valid or we failed to read the right node,
3810 * it's no big deal; just return.
3811 */
3812 if (IS_ERR(right))
3813 return 1;
3814
3815 btrfs_tree_lock(right);
3816 btrfs_set_lock_blocking(right);
3817
3818 free_space = btrfs_leaf_free_space(fs_info, right);
3819 if (free_space < data_size)
3820 goto out_unlock;
3821
3822 /* cow and double check */
3823 ret = btrfs_cow_block(trans, root, right, upper,
3824 slot + 1, &right);
3825 if (ret)
3826 goto out_unlock;
3827
3828 free_space = btrfs_leaf_free_space(fs_info, right);
3829 if (free_space < data_size)
3830 goto out_unlock;
3831
3832 left_nritems = btrfs_header_nritems(left);
3833 if (left_nritems == 0)
3834 goto out_unlock;
3835
3836 if (path->slots[0] == left_nritems && !empty) {
3837 /* Key greater than all keys in the leaf, right neighbor has
3838 * enough room for it and we're not emptying our leaf to delete
3839 * it, therefore use right neighbor to insert the new item and
3840 * no need to touch/dirty our left leaf. */
3841 btrfs_tree_unlock(left);
3842 free_extent_buffer(left);
3843 path->nodes[0] = right;
3844 path->slots[0] = 0;
3845 path->slots[1]++;
3846 return 0;
3847 }
3848
3849 return __push_leaf_right(fs_info, path, min_data_size, empty,
3850 right, free_space, left_nritems, min_slot);
3851 out_unlock:
3852 btrfs_tree_unlock(right);
3853 free_extent_buffer(right);
3854 return 1;
3855 }
3856
3857 /*
3858 * push some data in the path leaf to the left, trying to free up at
3859 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3860 *
3861 * max_slot can put a limit on how far into the leaf we'll push items. The
3862 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3863 * items
3864 */
3865 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3866 struct btrfs_path *path, int data_size,
3867 int empty, struct extent_buffer *left,
3868 int free_space, u32 right_nritems,
3869 u32 max_slot)
3870 {
3871 struct btrfs_disk_key disk_key;
3872 struct extent_buffer *right = path->nodes[0];
3873 int i;
3874 int push_space = 0;
3875 int push_items = 0;
3876 struct btrfs_item *item;
3877 u32 old_left_nritems;
3878 u32 nr;
3879 int ret = 0;
3880 u32 this_item_size;
3881 u32 old_left_item_size;
3882 struct btrfs_map_token token;
3883
3884 btrfs_init_map_token(&token);
3885
3886 if (empty)
3887 nr = min(right_nritems, max_slot);
3888 else
3889 nr = min(right_nritems - 1, max_slot);
3890
3891 for (i = 0; i < nr; i++) {
3892 item = btrfs_item_nr(i);
3893
3894 if (!empty && push_items > 0) {
3895 if (path->slots[0] < i)
3896 break;
3897 if (path->slots[0] == i) {
3898 int space = btrfs_leaf_free_space(fs_info, right);
3899 if (space + push_space * 2 > free_space)
3900 break;
3901 }
3902 }
3903
3904 if (path->slots[0] == i)
3905 push_space += data_size;
3906
3907 this_item_size = btrfs_item_size(right, item);
3908 if (this_item_size + sizeof(*item) + push_space > free_space)
3909 break;
3910
3911 push_items++;
3912 push_space += this_item_size + sizeof(*item);
3913 }
3914
3915 if (push_items == 0) {
3916 ret = 1;
3917 goto out;
3918 }
3919 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3920
3921 /* push data from right to left */
3922 copy_extent_buffer(left, right,
3923 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3924 btrfs_item_nr_offset(0),
3925 push_items * sizeof(struct btrfs_item));
3926
3927 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3928 btrfs_item_offset_nr(right, push_items - 1);
3929
3930 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3931 leaf_data_end(fs_info, left) - push_space,
3932 BTRFS_LEAF_DATA_OFFSET +
3933 btrfs_item_offset_nr(right, push_items - 1),
3934 push_space);
3935 old_left_nritems = btrfs_header_nritems(left);
3936 BUG_ON(old_left_nritems <= 0);
3937
3938 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3939 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3940 u32 ioff;
3941
3942 item = btrfs_item_nr(i);
3943
3944 ioff = btrfs_token_item_offset(left, item, &token);
3945 btrfs_set_token_item_offset(left, item,
3946 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3947 &token);
3948 }
3949 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3950
3951 /* fixup right node */
3952 if (push_items > right_nritems)
3953 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3954 right_nritems);
3955
3956 if (push_items < right_nritems) {
3957 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3958 leaf_data_end(fs_info, right);
3959 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3960 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3961 BTRFS_LEAF_DATA_OFFSET +
3962 leaf_data_end(fs_info, right), push_space);
3963
3964 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3965 btrfs_item_nr_offset(push_items),
3966 (btrfs_header_nritems(right) - push_items) *
3967 sizeof(struct btrfs_item));
3968 }
3969 right_nritems -= push_items;
3970 btrfs_set_header_nritems(right, right_nritems);
3971 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3972 for (i = 0; i < right_nritems; i++) {
3973 item = btrfs_item_nr(i);
3974
3975 push_space = push_space - btrfs_token_item_size(right,
3976 item, &token);
3977 btrfs_set_token_item_offset(right, item, push_space, &token);
3978 }
3979
3980 btrfs_mark_buffer_dirty(left);
3981 if (right_nritems)
3982 btrfs_mark_buffer_dirty(right);
3983 else
3984 clean_tree_block(fs_info, right);
3985
3986 btrfs_item_key(right, &disk_key, 0);
3987 fixup_low_keys(fs_info, path, &disk_key, 1);
3988
3989 /* then fixup the leaf pointer in the path */
3990 if (path->slots[0] < push_items) {
3991 path->slots[0] += old_left_nritems;
3992 btrfs_tree_unlock(path->nodes[0]);
3993 free_extent_buffer(path->nodes[0]);
3994 path->nodes[0] = left;
3995 path->slots[1] -= 1;
3996 } else {
3997 btrfs_tree_unlock(left);
3998 free_extent_buffer(left);
3999 path->slots[0] -= push_items;
4000 }
4001 BUG_ON(path->slots[0] < 0);
4002 return ret;
4003 out:
4004 btrfs_tree_unlock(left);
4005 free_extent_buffer(left);
4006 return ret;
4007 }
4008
4009 /*
4010 * push some data in the path leaf to the left, trying to free up at
4011 * least data_size bytes. returns zero if the push worked, nonzero otherwise
4012 *
4013 * max_slot can put a limit on how far into the leaf we'll push items. The
4014 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
4015 * items
4016 */
4017 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4018 *root, struct btrfs_path *path, int min_data_size,
4019 int data_size, int empty, u32 max_slot)
4020 {
4021 struct btrfs_fs_info *fs_info = root->fs_info;
4022 struct extent_buffer *right = path->nodes[0];
4023 struct extent_buffer *left;
4024 int slot;
4025 int free_space;
4026 u32 right_nritems;
4027 int ret = 0;
4028
4029 slot = path->slots[1];
4030 if (slot == 0)
4031 return 1;
4032 if (!path->nodes[1])
4033 return 1;
4034
4035 right_nritems = btrfs_header_nritems(right);
4036 if (right_nritems == 0)
4037 return 1;
4038
4039 btrfs_assert_tree_locked(path->nodes[1]);
4040
4041 left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4042 /*
4043 * If slot - 1 is not valid or we failed to read the left node,
4044 * no big deal, just return.
4045 */
4046 if (IS_ERR(left))
4047 return 1;
4048
4049 btrfs_tree_lock(left);
4050 btrfs_set_lock_blocking(left);
4051
4052 free_space = btrfs_leaf_free_space(fs_info, left);
4053 if (free_space < data_size) {
4054 ret = 1;
4055 goto out;
4056 }
4057
4058 /* cow and double check */
4059 ret = btrfs_cow_block(trans, root, left,
4060 path->nodes[1], slot - 1, &left);
4061 if (ret) {
4062 /* we hit -ENOSPC, but it isn't fatal here */
4063 if (ret == -ENOSPC)
4064 ret = 1;
4065 goto out;
4066 }
4067
4068 free_space = btrfs_leaf_free_space(fs_info, left);
4069 if (free_space < data_size) {
4070 ret = 1;
4071 goto out;
4072 }
4073
4074 return __push_leaf_left(fs_info, path, min_data_size,
4075 empty, left, free_space, right_nritems,
4076 max_slot);
4077 out:
4078 btrfs_tree_unlock(left);
4079 free_extent_buffer(left);
4080 return ret;
4081 }
4082
4083 /*
4084 * split the path's leaf in two, making sure there is at least data_size
4085 * available for the resulting leaf level of the path.
4086 */
4087 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4088 struct btrfs_fs_info *fs_info,
4089 struct btrfs_path *path,
4090 struct extent_buffer *l,
4091 struct extent_buffer *right,
4092 int slot, int mid, int nritems)
4093 {
4094 int data_copy_size;
4095 int rt_data_off;
4096 int i;
4097 struct btrfs_disk_key disk_key;
4098 struct btrfs_map_token token;
4099
4100 btrfs_init_map_token(&token);
4101
4102 nritems = nritems - mid;
4103 btrfs_set_header_nritems(right, nritems);
4104 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4105
4106 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4107 btrfs_item_nr_offset(mid),
4108 nritems * sizeof(struct btrfs_item));
4109
4110 copy_extent_buffer(right, l,
4111 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4112 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4113 leaf_data_end(fs_info, l), data_copy_size);
4114
4115 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4116
4117 for (i = 0; i < nritems; i++) {
4118 struct btrfs_item *item = btrfs_item_nr(i);
4119 u32 ioff;
4120
4121 ioff = btrfs_token_item_offset(right, item, &token);
4122 btrfs_set_token_item_offset(right, item,
4123 ioff + rt_data_off, &token);
4124 }
4125
4126 btrfs_set_header_nritems(l, mid);
4127 btrfs_item_key(right, &disk_key, 0);
4128 insert_ptr(trans, fs_info, path, &disk_key, right->start,
4129 path->slots[1] + 1, 1);
4130
4131 btrfs_mark_buffer_dirty(right);
4132 btrfs_mark_buffer_dirty(l);
4133 BUG_ON(path->slots[0] != slot);
4134
4135 if (mid <= slot) {
4136 btrfs_tree_unlock(path->nodes[0]);
4137 free_extent_buffer(path->nodes[0]);
4138 path->nodes[0] = right;
4139 path->slots[0] -= mid;
4140 path->slots[1] += 1;
4141 } else {
4142 btrfs_tree_unlock(right);
4143 free_extent_buffer(right);
4144 }
4145
4146 BUG_ON(path->slots[0] < 0);
4147 }
4148
4149 /*
4150 * double splits happen when we need to insert a big item in the middle
4151 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4152 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4153 * A B C
4154 *
4155 * We avoid this by trying to push the items on either side of our target
4156 * into the adjacent leaves. If all goes well we can avoid the double split
4157 * completely.
4158 */
4159 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4160 struct btrfs_root *root,
4161 struct btrfs_path *path,
4162 int data_size)
4163 {
4164 struct btrfs_fs_info *fs_info = root->fs_info;
4165 int ret;
4166 int progress = 0;
4167 int slot;
4168 u32 nritems;
4169 int space_needed = data_size;
4170
4171 slot = path->slots[0];
4172 if (slot < btrfs_header_nritems(path->nodes[0]))
4173 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4174
4175 /*
4176 * try to push all the items after our slot into the
4177 * right leaf
4178 */
4179 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4180 if (ret < 0)
4181 return ret;
4182
4183 if (ret == 0)
4184 progress++;
4185
4186 nritems = btrfs_header_nritems(path->nodes[0]);
4187 /*
4188 * our goal is to get our slot at the start or end of a leaf. If
4189 * we've done so we're done
4190 */
4191 if (path->slots[0] == 0 || path->slots[0] == nritems)
4192 return 0;
4193
4194 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4195 return 0;
4196
4197 /* try to push all the items before our slot into the left leaf */
4198 slot = path->slots[0];
4199 space_needed = data_size;
4200 if (slot > 0)
4201 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4202 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4203 if (ret < 0)
4204 return ret;
4205
4206 if (ret == 0)
4207 progress++;
4208
4209 if (progress)
4210 return 0;
4211 return 1;
4212 }
4213
4214 /*
4215 * split the path's leaf in two, making sure there is at least data_size
4216 * available for the resulting leaf level of the path.
4217 *
4218 * returns 0 if all went well and < 0 on failure.
4219 */
4220 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4221 struct btrfs_root *root,
4222 const struct btrfs_key *ins_key,
4223 struct btrfs_path *path, int data_size,
4224 int extend)
4225 {
4226 struct btrfs_disk_key disk_key;
4227 struct extent_buffer *l;
4228 u32 nritems;
4229 int mid;
4230 int slot;
4231 struct extent_buffer *right;
4232 struct btrfs_fs_info *fs_info = root->fs_info;
4233 int ret = 0;
4234 int wret;
4235 int split;
4236 int num_doubles = 0;
4237 int tried_avoid_double = 0;
4238
4239 l = path->nodes[0];
4240 slot = path->slots[0];
4241 if (extend && data_size + btrfs_item_size_nr(l, slot) +
4242 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4243 return -EOVERFLOW;
4244
4245 /* first try to make some room by pushing left and right */
4246 if (data_size && path->nodes[1]) {
4247 int space_needed = data_size;
4248
4249 if (slot < btrfs_header_nritems(l))
4250 space_needed -= btrfs_leaf_free_space(fs_info, l);
4251
4252 wret = push_leaf_right(trans, root, path, space_needed,
4253 space_needed, 0, 0);
4254 if (wret < 0)
4255 return wret;
4256 if (wret) {
4257 space_needed = data_size;
4258 if (slot > 0)
4259 space_needed -= btrfs_leaf_free_space(fs_info,
4260 l);
4261 wret = push_leaf_left(trans, root, path, space_needed,
4262 space_needed, 0, (u32)-1);
4263 if (wret < 0)
4264 return wret;
4265 }
4266 l = path->nodes[0];
4267
4268 /* did the pushes work? */
4269 if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4270 return 0;
4271 }
4272
4273 if (!path->nodes[1]) {
4274 ret = insert_new_root(trans, root, path, 1);
4275 if (ret)
4276 return ret;
4277 }
4278 again:
4279 split = 1;
4280 l = path->nodes[0];
4281 slot = path->slots[0];
4282 nritems = btrfs_header_nritems(l);
4283 mid = (nritems + 1) / 2;
4284
4285 if (mid <= slot) {
4286 if (nritems == 1 ||
4287 leaf_space_used(l, mid, nritems - mid) + data_size >
4288 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4289 if (slot >= nritems) {
4290 split = 0;
4291 } else {
4292 mid = slot;
4293 if (mid != nritems &&
4294 leaf_space_used(l, mid, nritems - mid) +
4295 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4296 if (data_size && !tried_avoid_double)
4297 goto push_for_double;
4298 split = 2;
4299 }
4300 }
4301 }
4302 } else {
4303 if (leaf_space_used(l, 0, mid) + data_size >
4304 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4305 if (!extend && data_size && slot == 0) {
4306 split = 0;
4307 } else if ((extend || !data_size) && slot == 0) {
4308 mid = 1;
4309 } else {
4310 mid = slot;
4311 if (mid != nritems &&
4312 leaf_space_used(l, mid, nritems - mid) +
4313 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4314 if (data_size && !tried_avoid_double)
4315 goto push_for_double;
4316 split = 2;
4317 }
4318 }
4319 }
4320 }
4321
4322 if (split == 0)
4323 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4324 else
4325 btrfs_item_key(l, &disk_key, mid);
4326
4327 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4328 l->start, 0);
4329 if (IS_ERR(right))
4330 return PTR_ERR(right);
4331
4332 root_add_used(root, fs_info->nodesize);
4333
4334 memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
4335 btrfs_set_header_bytenr(right, right->start);
4336 btrfs_set_header_generation(right, trans->transid);
4337 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4338 btrfs_set_header_owner(right, root->root_key.objectid);
4339 btrfs_set_header_level(right, 0);
4340 write_extent_buffer_fsid(right, fs_info->fsid);
4341 write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);
4342
4343 if (split == 0) {
4344 if (mid <= slot) {
4345 btrfs_set_header_nritems(right, 0);
4346 insert_ptr(trans, fs_info, path, &disk_key,
4347 right->start, path->slots[1] + 1, 1);
4348 btrfs_tree_unlock(path->nodes[0]);
4349 free_extent_buffer(path->nodes[0]);
4350 path->nodes[0] = right;
4351 path->slots[0] = 0;
4352 path->slots[1] += 1;
4353 } else {
4354 btrfs_set_header_nritems(right, 0);
4355 insert_ptr(trans, fs_info, path, &disk_key,
4356 right->start, path->slots[1], 1);
4357 btrfs_tree_unlock(path->nodes[0]);
4358 free_extent_buffer(path->nodes[0]);
4359 path->nodes[0] = right;
4360 path->slots[0] = 0;
4361 if (path->slots[1] == 0)
4362 fixup_low_keys(fs_info, path, &disk_key, 1);
4363 }
4364 /*
4365 * We create a new leaf 'right' for the required ins_len and
4366 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4367 * the new item's content into 'right'.
4368 */
4369 return ret;
4370 }
4371
4372 copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4373
4374 if (split == 2) {
4375 BUG_ON(num_doubles != 0);
4376 num_doubles++;
4377 goto again;
4378 }
4379
4380 return 0;
4381
4382 push_for_double:
4383 push_for_double_split(trans, root, path, data_size);
4384 tried_avoid_double = 1;
4385 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4386 return 0;
4387 goto again;
4388 }
4389
4390 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4391 struct btrfs_root *root,
4392 struct btrfs_path *path, int ins_len)
4393 {
4394 struct btrfs_fs_info *fs_info = root->fs_info;
4395 struct btrfs_key key;
4396 struct extent_buffer *leaf;
4397 struct btrfs_file_extent_item *fi;
4398 u64 extent_len = 0;
4399 u32 item_size;
4400 int ret;
4401
4402 leaf = path->nodes[0];
4403 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4404
4405 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4406 key.type != BTRFS_EXTENT_CSUM_KEY);
4407
4408 if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4409 return 0;
4410
4411 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4412 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4413 fi = btrfs_item_ptr(leaf, path->slots[0],
4414 struct btrfs_file_extent_item);
4415 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4416 }
4417 btrfs_release_path(path);
4418
4419 path->keep_locks = 1;
4420 path->search_for_split = 1;
4421 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4422 path->search_for_split = 0;
4423 if (ret > 0)
4424 ret = -EAGAIN;
4425 if (ret < 0)
4426 goto err;
4427
4428 ret = -EAGAIN;
4429 leaf = path->nodes[0];
4430 /* if our item isn't there, return now */
4431 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4432 goto err;
4433
4434 /* the leaf has changed, it now has room. return now */
4435 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4436 goto err;
4437
4438 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4439 fi = btrfs_item_ptr(leaf, path->slots[0],
4440 struct btrfs_file_extent_item);
4441 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4442 goto err;
4443 }
4444
4445 btrfs_set_path_blocking(path);
4446 ret = split_leaf(trans, root, &key, path, ins_len, 1);
4447 if (ret)
4448 goto err;
4449
4450 path->keep_locks = 0;
4451 btrfs_unlock_up_safe(path, 1);
4452 return 0;
4453 err:
4454 path->keep_locks = 0;
4455 return ret;
4456 }
4457
4458 static noinline int split_item(struct btrfs_fs_info *fs_info,
4459 struct btrfs_path *path,
4460 const struct btrfs_key *new_key,
4461 unsigned long split_offset)
4462 {
4463 struct extent_buffer *leaf;
4464 struct btrfs_item *item;
4465 struct btrfs_item *new_item;
4466 int slot;
4467 char *buf;
4468 u32 nritems;
4469 u32 item_size;
4470 u32 orig_offset;
4471 struct btrfs_disk_key disk_key;
4472
4473 leaf = path->nodes[0];
4474 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4475
4476 btrfs_set_path_blocking(path);
4477
4478 item = btrfs_item_nr(path->slots[0]);
4479 orig_offset = btrfs_item_offset(leaf, item);
4480 item_size = btrfs_item_size(leaf, item);
4481
4482 buf = kmalloc(item_size, GFP_NOFS);
4483 if (!buf)
4484 return -ENOMEM;
4485
4486 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4487 path->slots[0]), item_size);
4488
4489 slot = path->slots[0] + 1;
4490 nritems = btrfs_header_nritems(leaf);
4491 if (slot != nritems) {
4492 /* shift the items */
4493 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4494 btrfs_item_nr_offset(slot),
4495 (nritems - slot) * sizeof(struct btrfs_item));
4496 }
4497
4498 btrfs_cpu_key_to_disk(&disk_key, new_key);
4499 btrfs_set_item_key(leaf, &disk_key, slot);
4500
4501 new_item = btrfs_item_nr(slot);
4502
4503 btrfs_set_item_offset(leaf, new_item, orig_offset);
4504 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4505
4506 btrfs_set_item_offset(leaf, item,
4507 orig_offset + item_size - split_offset);
4508 btrfs_set_item_size(leaf, item, split_offset);
4509
4510 btrfs_set_header_nritems(leaf, nritems + 1);
4511
4512 /* write the data for the start of the original item */
4513 write_extent_buffer(leaf, buf,
4514 btrfs_item_ptr_offset(leaf, path->slots[0]),
4515 split_offset);
4516
4517 /* write the data for the new item */
4518 write_extent_buffer(leaf, buf + split_offset,
4519 btrfs_item_ptr_offset(leaf, slot),
4520 item_size - split_offset);
4521 btrfs_mark_buffer_dirty(leaf);
4522
4523 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4524 kfree(buf);
4525 return 0;
4526 }
4527
4528 /*
4529 * This function splits a single item into two items,
4530 * giving 'new_key' to the new item and splitting the
4531 * old one at split_offset (from the start of the item).
4532 *
4533 * The path may be released by this operation. After
4534 * the split, the path is pointing to the old item. The
4535 * new item is going to be in the same node as the old one.
4536 *
4537 * Note, the item being split must be small enough to live alone in
4538 * a tree block, with room for one extra struct btrfs_item
4539 *
4540 * This allows us to split the item in place, keeping a lock on the
4541 * leaf the entire time.
4542 */
4543 int btrfs_split_item(struct btrfs_trans_handle *trans,
4544 struct btrfs_root *root,
4545 struct btrfs_path *path,
4546 const struct btrfs_key *new_key,
4547 unsigned long split_offset)
4548 {
4549 int ret;
4550 ret = setup_leaf_for_split(trans, root, path,
4551 sizeof(struct btrfs_item));
4552 if (ret)
4553 return ret;
4554
4555 ret = split_item(root->fs_info, path, new_key, split_offset);
4556 return ret;
4557 }
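/*
 * Editor's note: an illustrative sketch of calling btrfs_split_item(); it
 * is not part of the original file. The tail key chosen here is
 * hypothetical; a real caller must already have 'path' pointing at an
 * EXTENT_DATA or EXTENT_CSUM item (see setup_leaf_for_split() above).
 */
#if 0	/* illustrative only */
static int example_split(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct btrfs_path *path,
			 const struct btrfs_key *orig_key, u32 split_offset)
{
	struct btrfs_key new_key;

	/*
	 * The tail of the item gets a key that starts 'split_offset' bytes
	 * into the original; the head keeps 'orig_key'.
	 */
	new_key.objectid = orig_key->objectid;
	new_key.type = orig_key->type;
	new_key.offset = orig_key->offset + split_offset;

	/* on success the path still points at the (shortened) old item */
	return btrfs_split_item(trans, root, path, &new_key, split_offset);
}
#endif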
4558
4559 /*
4560 * This function duplicates an item, giving 'new_key' to the new item.
4561 * It guarantees both items live in the same tree leaf and the new item
4562 * is contiguous with the original item.
4563 *
4564 * This allows us to split a file extent in place, keeping a lock on the
4565 * leaf the entire time.
4566 */
4567 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4568 struct btrfs_root *root,
4569 struct btrfs_path *path,
4570 const struct btrfs_key *new_key)
4571 {
4572 struct extent_buffer *leaf;
4573 int ret;
4574 u32 item_size;
4575
4576 leaf = path->nodes[0];
4577 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4578 ret = setup_leaf_for_split(trans, root, path,
4579 item_size + sizeof(struct btrfs_item));
4580 if (ret)
4581 return ret;
4582
4583 path->slots[0]++;
4584 setup_items_for_insert(root, path, new_key, &item_size,
4585 item_size, item_size +
4586 sizeof(struct btrfs_item), 1);
4587 leaf = path->nodes[0];
4588 memcpy_extent_buffer(leaf,
4589 btrfs_item_ptr_offset(leaf, path->slots[0]),
4590 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4591 item_size);
4592 return 0;
4593 }
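/*
 * Editor's note: illustrative sketch, not part of the original file. On
 * success path->slots[0] points at the new copy, which a caller then
 * rewrites, e.g. when splitting a file extent. Key values are
 * hypothetical.
 */
#if 0	/* illustrative only */
static int example_duplicate(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 ino,
			     u64 new_offset)
{
	struct btrfs_key new_key;
	int ret;

	new_key.objectid = ino;
	new_key.type = BTRFS_EXTENT_DATA_KEY;
	new_key.offset = new_offset;

	ret = btrfs_duplicate_item(trans, root, path, &new_key);
	if (ret)
		return ret;	/* may be -EAGAIN if the leaf changed */

	/* path->slots[0] now points at the new, byte-identical item */
	return 0;
}
#endif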
4594
4595 /*
4596 * make the item pointed to by the path smaller. new_size indicates
4597 * how small to make it, and from_end tells us if we just chop bytes
4598 * off the end of the item or if we shift the item to chop bytes off
4599 * the front.
4600 */
4601 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4602 struct btrfs_path *path, u32 new_size, int from_end)
4603 {
4604 int slot;
4605 struct extent_buffer *leaf;
4606 struct btrfs_item *item;
4607 u32 nritems;
4608 unsigned int data_end;
4609 unsigned int old_data_start;
4610 unsigned int old_size;
4611 unsigned int size_diff;
4612 int i;
4613 struct btrfs_map_token token;
4614
4615 btrfs_init_map_token(&token);
4616
4617 leaf = path->nodes[0];
4618 slot = path->slots[0];
4619
4620 old_size = btrfs_item_size_nr(leaf, slot);
4621 if (old_size == new_size)
4622 return;
4623
4624 nritems = btrfs_header_nritems(leaf);
4625 data_end = leaf_data_end(fs_info, leaf);
4626
4627 old_data_start = btrfs_item_offset_nr(leaf, slot);
4628
4629 size_diff = old_size - new_size;
4630
4631 BUG_ON(slot < 0);
4632 BUG_ON(slot >= nritems);
4633
4634 /*
4635 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4636 */
4637 /* first correct the data pointers */
4638 for (i = slot; i < nritems; i++) {
4639 u32 ioff;
4640 item = btrfs_item_nr(i);
4641
4642 ioff = btrfs_token_item_offset(leaf, item, &token);
4643 btrfs_set_token_item_offset(leaf, item,
4644 ioff + size_diff, &token);
4645 }
4646
4647 /* shift the data */
4648 if (from_end) {
4649 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4650 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4651 data_end, old_data_start + new_size - data_end);
4652 } else {
4653 struct btrfs_disk_key disk_key;
4654 u64 offset;
4655
4656 btrfs_item_key(leaf, &disk_key, slot);
4657
4658 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4659 unsigned long ptr;
4660 struct btrfs_file_extent_item *fi;
4661
4662 fi = btrfs_item_ptr(leaf, slot,
4663 struct btrfs_file_extent_item);
4664 fi = (struct btrfs_file_extent_item *)(
4665 (unsigned long)fi - size_diff);
4666
4667 if (btrfs_file_extent_type(leaf, fi) ==
4668 BTRFS_FILE_EXTENT_INLINE) {
4669 ptr = btrfs_item_ptr_offset(leaf, slot);
4670 memmove_extent_buffer(leaf, ptr,
4671 (unsigned long)fi,
4672 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4673 }
4674 }
4675
4676 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4677 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4678 data_end, old_data_start - data_end);
4679
4680 offset = btrfs_disk_key_offset(&disk_key);
4681 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4682 btrfs_set_item_key(leaf, &disk_key, slot);
4683 if (slot == 0)
4684 fixup_low_keys(fs_info, path, &disk_key, 1);
4685 }
4686
4687 item = btrfs_item_nr(slot);
4688 btrfs_set_item_size(leaf, item, new_size);
4689 btrfs_mark_buffer_dirty(leaf);
4690
4691 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4692 btrfs_print_leaf(leaf);
4693 BUG();
4694 }
4695 }
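/*
 * Editor's note: illustrative sketch, not part of the original file,
 * showing the common "chop bytes off the end" case; the amount trimmed
 * is hypothetical and the caller is assumed to hold the leaf locked.
 */
#if 0	/* illustrative only */
static void example_truncate_tail(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path, u32 trim)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (size > trim)
		btrfs_truncate_item(fs_info, path, size - trim, 1);
}
#endif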
4696
4697 /*
4698 * make the item pointed to by the path bigger, data_size is the added size.
4699 */
4700 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4701 u32 data_size)
4702 {
4703 int slot;
4704 struct extent_buffer *leaf;
4705 struct btrfs_item *item;
4706 u32 nritems;
4707 unsigned int data_end;
4708 unsigned int old_data;
4709 unsigned int old_size;
4710 int i;
4711 struct btrfs_map_token token;
4712
4713 btrfs_init_map_token(&token);
4714
4715 leaf = path->nodes[0];
4716
4717 nritems = btrfs_header_nritems(leaf);
4718 data_end = leaf_data_end(fs_info, leaf);
4719
4720 if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4721 btrfs_print_leaf(leaf);
4722 BUG();
4723 }
4724 slot = path->slots[0];
4725 old_data = btrfs_item_end_nr(leaf, slot);
4726
4727 BUG_ON(slot < 0);
4728 if (slot >= nritems) {
4729 btrfs_print_leaf(leaf);
4730 btrfs_crit(fs_info, "slot %d too large, nritems %d",
4731 slot, nritems);
4732 BUG_ON(1);
4733 }
4734
4735 /*
4736 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4737 */
4738 /* first correct the data pointers */
4739 for (i = slot; i < nritems; i++) {
4740 u32 ioff;
4741 item = btrfs_item_nr(i);
4742
4743 ioff = btrfs_token_item_offset(leaf, item, &token);
4744 btrfs_set_token_item_offset(leaf, item,
4745 ioff - data_size, &token);
4746 }
4747
4748 /* shift the data */
4749 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4750 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4751 data_end, old_data - data_end);
4752
4753 data_end = old_data;
4754 old_size = btrfs_item_size_nr(leaf, slot);
4755 item = btrfs_item_nr(slot);
4756 btrfs_set_item_size(leaf, item, old_size + data_size);
4757 btrfs_mark_buffer_dirty(leaf);
4758
4759 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4760 btrfs_print_leaf(leaf);
4761 BUG();
4762 }
4763 }
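/*
 * Editor's note: illustrative sketch, not part of the original file. It
 * grows the current item and appends a payload; the caller is assumed to
 * have searched with enough ins_len reserved so the leaf has room.
 */
#if 0	/* illustrative only */
static void example_append(struct btrfs_fs_info *fs_info,
			   struct btrfs_path *path,
			   const void *data, u32 len)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
	unsigned long ptr;

	btrfs_extend_item(fs_info, path, len);

	/* write the new bytes just past the old payload */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, data, ptr + old_size, len);
	btrfs_mark_buffer_dirty(leaf);
}
#endif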
4764
4765 /*
4766 * this is a helper for btrfs_insert_empty_items, the main goal here is
4767 * to save stack depth by doing the bulk of the work in a function
4768 * that doesn't call btrfs_search_slot
4769 */
4770 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4771 const struct btrfs_key *cpu_key, u32 *data_size,
4772 u32 total_data, u32 total_size, int nr)
4773 {
4774 struct btrfs_fs_info *fs_info = root->fs_info;
4775 struct btrfs_item *item;
4776 int i;
4777 u32 nritems;
4778 unsigned int data_end;
4779 struct btrfs_disk_key disk_key;
4780 struct extent_buffer *leaf;
4781 int slot;
4782 struct btrfs_map_token token;
4783
4784 if (path->slots[0] == 0) {
4785 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4786 fixup_low_keys(fs_info, path, &disk_key, 1);
4787 }
4788 btrfs_unlock_up_safe(path, 1);
4789
4790 btrfs_init_map_token(&token);
4791
4792 leaf = path->nodes[0];
4793 slot = path->slots[0];
4794
4795 nritems = btrfs_header_nritems(leaf);
4796 data_end = leaf_data_end(fs_info, leaf);
4797
4798 if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4799 btrfs_print_leaf(leaf);
4800 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4801 total_size, btrfs_leaf_free_space(fs_info, leaf));
4802 BUG();
4803 }
4804
4805 if (slot != nritems) {
4806 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4807
4808 if (old_data < data_end) {
4809 btrfs_print_leaf(leaf);
4810 btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4811 slot, old_data, data_end);
4812 BUG_ON(1);
4813 }
4814 /*
4815 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4816 */
4817 /* first correct the data pointers */
4818 for (i = slot; i < nritems; i++) {
4819 u32 ioff;
4820
4821 item = btrfs_item_nr(i);
4822 ioff = btrfs_token_item_offset(leaf, item, &token);
4823 btrfs_set_token_item_offset(leaf, item,
4824 ioff - total_data, &token);
4825 }
4826 /* shift the items */
4827 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4828 btrfs_item_nr_offset(slot),
4829 (nritems - slot) * sizeof(struct btrfs_item));
4830
4831 /* shift the data */
4832 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4833 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4834 data_end, old_data - data_end);
4835 data_end = old_data;
4836 }
4837
4838 /* setup the item for the new data */
4839 for (i = 0; i < nr; i++) {
4840 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4841 btrfs_set_item_key(leaf, &disk_key, slot + i);
4842 item = btrfs_item_nr(slot + i);
4843 btrfs_set_token_item_offset(leaf, item,
4844 data_end - data_size[i], &token);
4845 data_end -= data_size[i];
4846 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4847 }
4848
4849 btrfs_set_header_nritems(leaf, nritems + nr);
4850 btrfs_mark_buffer_dirty(leaf);
4851
4852 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4853 btrfs_print_leaf(leaf);
4854 BUG();
4855 }
4856 }
4857
4858 /*
4859 * Given a key and some data, insert items into the tree.
4860 * This does all the path init required, making room in the tree if needed.
4861 */
4862 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4863 struct btrfs_root *root,
4864 struct btrfs_path *path,
4865 const struct btrfs_key *cpu_key, u32 *data_size,
4866 int nr)
4867 {
4868 int ret = 0;
4869 int slot;
4870 int i;
4871 u32 total_size = 0;
4872 u32 total_data = 0;
4873
4874 for (i = 0; i < nr; i++)
4875 total_data += data_size[i];
4876
4877 total_size = total_data + (nr * sizeof(struct btrfs_item));
4878 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4879 if (ret == 0)
4880 return -EEXIST;
4881 if (ret < 0)
4882 return ret;
4883
4884 slot = path->slots[0];
4885 BUG_ON(slot < 0);
4886
4887 setup_items_for_insert(root, path, cpu_key, data_size,
4888 total_data, total_size, nr);
4889 return 0;
4890 }
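/*
 * Editor's note: illustrative sketch, not part of the original file. It
 * inserts two adjacent empty items with a single search; the keys (which
 * must sort together into the same slot) and sizes are hypothetical.
 */
#if 0	/* illustrative only */
static int example_insert_two(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_path *path,
			      const struct btrfs_key *keys /* array of 2 */)
{
	u32 sizes[2] = { 16, 32 };	/* payload bytes of each item */
	int ret;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret)
		return ret;	/* -EEXIST if keys[0] already exists */

	/*
	 * path->slots[0] points at the first new item; fill both payloads
	 * here before releasing the path.
	 */
	btrfs_release_path(path);
	return 0;
}
#endif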
4891
4892 /*
4893 * Given a key and some data, insert an item into the tree.
4894 * This does all the path init required, making room in the tree if needed.
4895 */
4896 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4897 const struct btrfs_key *cpu_key, void *data,
4898 u32 data_size)
4899 {
4900 int ret = 0;
4901 struct btrfs_path *path;
4902 struct extent_buffer *leaf;
4903 unsigned long ptr;
4904
4905 path = btrfs_alloc_path();
4906 if (!path)
4907 return -ENOMEM;
4908 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4909 if (!ret) {
4910 leaf = path->nodes[0];
4911 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4912 write_extent_buffer(leaf, data, ptr, data_size);
4913 btrfs_mark_buffer_dirty(leaf);
4914 }
4915 btrfs_free_path(path);
4916 return ret;
4917 }
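/*
 * Editor's note: illustrative sketch, not part of the original file.
 * btrfs_insert_item() allocates and frees its own path, so this is the
 * simplest way to add one small item; the key layout is hypothetical.
 */
#if 0	/* illustrative only */
static int example_insert_blob(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, u64 objectid,
			       u8 type, const void *blob, u32 blob_len)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = type;
	key.offset = 0;

	return btrfs_insert_item(trans, root, &key, (void *)blob, blob_len);
}
#endif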
4918
4919 /*
4920 * delete the pointer from a given node.
4921 *
4922 * the tree should have been previously balanced so the deletion does not
4923 * empty a node.
4924 */
4925 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4926 int level, int slot)
4927 {
4928 struct btrfs_fs_info *fs_info = root->fs_info;
4929 struct extent_buffer *parent = path->nodes[level];
4930 u32 nritems;
4931 int ret;
4932
4933 nritems = btrfs_header_nritems(parent);
4934 if (slot != nritems - 1) {
4935 if (level)
4936 tree_mod_log_eb_move(fs_info, parent, slot,
4937 slot + 1, nritems - slot - 1);
4938 memmove_extent_buffer(parent,
4939 btrfs_node_key_ptr_offset(slot),
4940 btrfs_node_key_ptr_offset(slot + 1),
4941 sizeof(struct btrfs_key_ptr) *
4942 (nritems - slot - 1));
4943 } else if (level) {
4944 ret = tree_mod_log_insert_key(fs_info, parent, slot,
4945 MOD_LOG_KEY_REMOVE, GFP_NOFS);
4946 BUG_ON(ret < 0);
4947 }
4948
4949 nritems--;
4950 btrfs_set_header_nritems(parent, nritems);
4951 if (nritems == 0 && parent == root->node) {
4952 BUG_ON(btrfs_header_level(root->node) != 1);
4953 /* just turn the root into a leaf and break */
4954 btrfs_set_header_level(root->node, 0);
4955 } else if (slot == 0) {
4956 struct btrfs_disk_key disk_key;
4957
4958 btrfs_node_key(parent, &disk_key, 0);
4959 fixup_low_keys(fs_info, path, &disk_key, level + 1);
4960 }
4961 btrfs_mark_buffer_dirty(parent);
4962 }
4963
4964 /*
4965 * a helper function to delete the leaf pointed to by path->slots[1] and
4966 * path->nodes[1].
4967 *
4968 * This deletes the pointer in path->nodes[1] and frees the leaf
4969 * block extent (the function itself returns nothing).
4970 *
4971 * The path must have already been setup for deleting the leaf, including
4972 * all the proper balancing. path->nodes[1] must be locked.
4973 */
4974 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4975 struct btrfs_root *root,
4976 struct btrfs_path *path,
4977 struct extent_buffer *leaf)
4978 {
4979 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4980 del_ptr(root, path, 1, path->slots[1]);
4981
4982 /*
4983 * btrfs_free_extent is expensive, we want to make sure we
4984 * aren't holding any locks when we call it
4985 */
4986 btrfs_unlock_up_safe(path, 0);
4987
4988 root_sub_used(root, leaf->len);
4989
4990 extent_buffer_get(leaf);
4991 btrfs_free_tree_block(trans, root, leaf, 0, 1);
4992 free_extent_buffer_stale(leaf);
4993 }
4994 /*
4995 * delete the item at the leaf level in path. If that empties
4996 * the leaf, remove it from the tree
4997 */
4998 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4999 struct btrfs_path *path, int slot, int nr)
5000 {
5001 struct btrfs_fs_info *fs_info = root->fs_info;
5002 struct extent_buffer *leaf;
5003 struct btrfs_item *item;
5004 u32 last_off;
5005 u32 dsize = 0;
5006 int ret = 0;
5007 int wret;
5008 int i;
5009 u32 nritems;
5010 struct btrfs_map_token token;
5011
5012 btrfs_init_map_token(&token);
5013
5014 leaf = path->nodes[0];
5015 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5016
5017 for (i = 0; i < nr; i++)
5018 dsize += btrfs_item_size_nr(leaf, slot + i);
5019
5020 nritems = btrfs_header_nritems(leaf);
5021
5022 if (slot + nr != nritems) {
5023 int data_end = leaf_data_end(fs_info, leaf);
5024
5025 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5026 data_end + dsize,
5027 BTRFS_LEAF_DATA_OFFSET + data_end,
5028 last_off - data_end);
5029
5030 for (i = slot + nr; i < nritems; i++) {
5031 u32 ioff;
5032
5033 item = btrfs_item_nr(i);
5034 ioff = btrfs_token_item_offset(leaf, item, &token);
5035 btrfs_set_token_item_offset(leaf, item,
5036 ioff + dsize, &token);
5037 }
5038
5039 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5040 btrfs_item_nr_offset(slot + nr),
5041 sizeof(struct btrfs_item) *
5042 (nritems - slot - nr));
5043 }
5044 btrfs_set_header_nritems(leaf, nritems - nr);
5045 nritems -= nr;
5046
5047 /* delete the leaf if we've emptied it */
5048 if (nritems == 0) {
5049 if (leaf == root->node) {
5050 btrfs_set_header_level(leaf, 0);
5051 } else {
5052 btrfs_set_path_blocking(path);
5053 clean_tree_block(fs_info, leaf);
5054 btrfs_del_leaf(trans, root, path, leaf);
5055 }
5056 } else {
5057 int used = leaf_space_used(leaf, 0, nritems);
5058 if (slot == 0) {
5059 struct btrfs_disk_key disk_key;
5060
5061 btrfs_item_key(leaf, &disk_key, 0);
5062 fixup_low_keys(fs_info, path, &disk_key, 1);
5063 }
5064
5065 /* delete the leaf if it is mostly empty */
5066 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5067 /* push_leaf_left fixes the path.
5068 * make sure the path still points to our leaf
5069 * for possible call to del_ptr below
5070 */
5071 slot = path->slots[1];
5072 extent_buffer_get(leaf);
5073
5074 btrfs_set_path_blocking(path);
5075 wret = push_leaf_left(trans, root, path, 1, 1,
5076 1, (u32)-1);
5077 if (wret < 0 && wret != -ENOSPC)
5078 ret = wret;
5079
5080 if (path->nodes[0] == leaf &&
5081 btrfs_header_nritems(leaf)) {
5082 wret = push_leaf_right(trans, root, path, 1,
5083 1, 1, 0);
5084 if (wret < 0 && wret != -ENOSPC)
5085 ret = wret;
5086 }
5087
5088 if (btrfs_header_nritems(leaf) == 0) {
5089 path->slots[1] = slot;
5090 btrfs_del_leaf(trans, root, path, leaf);
5091 free_extent_buffer(leaf);
5092 ret = 0;
5093 } else {
5094 /* if we're still in the path, make sure
5095 * we're dirty. Otherwise, one of the
5096 * push_leaf functions must have already
5097 * dirtied this buffer
5098 */
5099 if (path->nodes[0] == leaf)
5100 btrfs_mark_buffer_dirty(leaf);
5101 free_extent_buffer(leaf);
5102 }
5103 } else {
5104 btrfs_mark_buffer_dirty(leaf);
5105 }
5106 }
5107 return ret;
5108 }
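/*
 * Editor's note: illustrative sketch, not part of the original file. It
 * deletes the single item the path points at; ctree.h wraps this exact
 * pattern as btrfs_del_item(). Callers removing a run of items pass
 * nr > 1 with the slot of the first victim.
 */
#if 0	/* illustrative only */
static int example_del_current(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path)
{
	return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
#endif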
5109
5110 /*
5111 * search the tree again to find a leaf with lesser keys
5112 * returns 0 if it found something or 1 if there are no lesser leaves.
5113 * returns < 0 on io errors.
5114 *
5115 * This may release the path, and so you may lose any locks held at the
5116 * time you call it.
5117 */
5118 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5119 {
5120 struct btrfs_key key;
5121 struct btrfs_disk_key found_key;
5122 int ret;
5123
5124 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5125
5126 if (key.offset > 0) {
5127 key.offset--;
5128 } else if (key.type > 0) {
5129 key.type--;
5130 key.offset = (u64)-1;
5131 } else if (key.objectid > 0) {
5132 key.objectid--;
5133 key.type = (u8)-1;
5134 key.offset = (u64)-1;
5135 } else {
5136 return 1;
5137 }
5138
5139 btrfs_release_path(path);
5140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5141 if (ret < 0)
5142 return ret;
5143 btrfs_item_key(path->nodes[0], &found_key, 0);
5144 ret = comp_keys(&found_key, &key);
5145 /*
5146 * We might have had an item with the previous key in the tree right
5147 * before we released our path. And after we released our path, that
5148 * item might have been pushed to the first slot (0) of the leaf we
5149 * were holding due to a tree balance. Alternatively, an item with the
5150 * previous key can exist as the only element of a leaf (big fat item).
5151 * Therefore account for these 2 cases, so that our callers (like
5152 * btrfs_previous_item) don't miss an existing item with a key matching
5153 * the previous key we computed above.
5154 */
5155 if (ret <= 0)
5156 return 0;
5157 return 1;
5158 }
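/*
 * Editor's note: illustrative sketch, not part of the original file,
 * showing the usual "step one item back, crossing leaves if needed"
 * pattern that btrfs_previous_item() below runs in a loop.
 */
#if 0	/* illustrative only */
static int example_step_back(struct btrfs_root *root,
			     struct btrfs_path *path)
{
	if (path->slots[0] == 0) {
		int ret = btrfs_prev_leaf(root, path);

		if (ret)	/* 1: no lesser leaf, < 0: error */
			return ret;
	} else {
		path->slots[0]--;
	}
	/* the search may leave us one past the last item of the leaf */
	if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
		path->slots[0]--;
	return 0;
}
#endif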
5159
5160 /*
5161 * A helper function to walk down the tree starting at min_key, and looking
5162 * for nodes or leaves that have a minimum transaction id.
5163 * This is used by the btree defrag code and tree logging.
5164 *
5165 * This does not cow, but it does stuff the starting key it finds back
5166 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5167 * key and get a writable path.
5168 *
5169 * This does lock as it descends, and path->keep_locks should be set
5170 * to 1 by the caller.
5171 *
5172 * This honors path->lowest_level to prevent descent past a given level
5173 * of the tree.
5174 *
5175 * min_trans indicates the oldest transaction that you are interested
5176 * in walking through. Any nodes or leaves older than min_trans are
5177 * skipped over (without reading them).
5178 *
5179 * returns zero if something useful was found, < 0 on error and 1 if there
5180 * was nothing in the tree that matched the search criteria.
5181 */
5182 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5183 struct btrfs_path *path,
5184 u64 min_trans)
5185 {
5186 struct btrfs_fs_info *fs_info = root->fs_info;
5187 struct extent_buffer *cur;
5188 struct btrfs_key found_key;
5189 int slot;
5190 int sret;
5191 u32 nritems;
5192 int level;
5193 int ret = 1;
5194 int keep_locks = path->keep_locks;
5195
5196 path->keep_locks = 1;
5197 again:
5198 cur = btrfs_read_lock_root_node(root);
5199 level = btrfs_header_level(cur);
5200 WARN_ON(path->nodes[level]);
5201 path->nodes[level] = cur;
5202 path->locks[level] = BTRFS_READ_LOCK;
5203
5204 if (btrfs_header_generation(cur) < min_trans) {
5205 ret = 1;
5206 goto out;
5207 }
5208 while (1) {
5209 nritems = btrfs_header_nritems(cur);
5210 level = btrfs_header_level(cur);
5211 sret = bin_search(cur, min_key, level, &slot);
5212
5213 /* at the lowest level, we're done, setup the path and exit */
5214 if (level == path->lowest_level) {
5215 if (slot >= nritems)
5216 goto find_next_key;
5217 ret = 0;
5218 path->slots[level] = slot;
5219 btrfs_item_key_to_cpu(cur, &found_key, slot);
5220 goto out;
5221 }
5222 if (sret && slot > 0)
5223 slot--;
5224 /*
5225 * check this node pointer against the min_trans parameter.
5226 * If it is too old, skip to the next one.
5227 */
5228 while (slot < nritems) {
5229 u64 gen;
5230
5231 gen = btrfs_node_ptr_generation(cur, slot);
5232 if (gen < min_trans) {
5233 slot++;
5234 continue;
5235 }
5236 break;
5237 }
5238 find_next_key:
5239 /*
5240 * we didn't find a candidate key in this node, walk forward
5241 * and find another one
5242 */
5243 if (slot >= nritems) {
5244 path->slots[level] = slot;
5245 btrfs_set_path_blocking(path);
5246 sret = btrfs_find_next_key(root, path, min_key, level,
5247 min_trans);
5248 if (sret == 0) {
5249 btrfs_release_path(path);
5250 goto again;
5251 } else {
5252 goto out;
5253 }
5254 }
5255 /* save our key for returning back */
5256 btrfs_node_key_to_cpu(cur, &found_key, slot);
5257 path->slots[level] = slot;
5258 if (level == path->lowest_level) {
5259 ret = 0;
5260 goto out;
5261 }
5262 btrfs_set_path_blocking(path);
5263 cur = read_node_slot(fs_info, cur, slot);
5264 if (IS_ERR(cur)) {
5265 ret = PTR_ERR(cur);
5266 goto out;
5267 }
5268
5269 btrfs_tree_read_lock(cur);
5270
5271 path->locks[level - 1] = BTRFS_READ_LOCK;
5272 path->nodes[level - 1] = cur;
5273 unlock_up(path, level, 1, 0, NULL);
5274 btrfs_clear_path_blocking(path, NULL, 0);
5275 }
5276 out:
5277 path->keep_locks = keep_locks;
5278 if (ret == 0) {
5279 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5280 btrfs_set_path_blocking(path);
5281 memcpy(min_key, &found_key, sizeof(found_key));
5282 }
5283 return ret;
5284 }
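/*
 * Editor's note: illustrative sketch, not part of the original file. It
 * shows the calling convention described above: min_key is advanced
 * between iterations (the simple offset bump below ignores key rollover,
 * which a real caller must handle).
 */
#if 0	/* illustrative only */
static int example_scan_newer_than(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret) {	/* 1: nothing left, < 0: error */
			if (ret > 0)
				ret = 0;
			break;
		}
		/* process the item at path->slots[0] here */
		btrfs_release_path(path);

		if (min_key.offset == (u64)-1)
			break;
		min_key.offset++;
	}
	btrfs_free_path(path);
	return ret;
}
#endif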
5285
5286 static int tree_move_down(struct btrfs_fs_info *fs_info,
5287 struct btrfs_path *path,
5288 int *level)
5289 {
5290 struct extent_buffer *eb;
5291
5292 BUG_ON(*level == 0);
5293 eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5294 if (IS_ERR(eb))
5295 return PTR_ERR(eb);
5296
5297 path->nodes[*level - 1] = eb;
5298 path->slots[*level - 1] = 0;
5299 (*level)--;
5300 return 0;
5301 }
5302
5303 static int tree_move_next_or_upnext(struct btrfs_path *path,
5304 int *level, int root_level)
5305 {
5306 int ret = 0;
5307 int nritems;
5308 nritems = btrfs_header_nritems(path->nodes[*level]);
5309
5310 path->slots[*level]++;
5311
5312 while (path->slots[*level] >= nritems) {
5313 if (*level == root_level)
5314 return -1;
5315
5316 /* move upnext */
5317 path->slots[*level] = 0;
5318 free_extent_buffer(path->nodes[*level]);
5319 path->nodes[*level] = NULL;
5320 (*level)++;
5321 path->slots[*level]++;
5322
5323 nritems = btrfs_header_nritems(path->nodes[*level]);
5324 ret = 1;
5325 }
5326 return ret;
5327 }
5328
5329 /*
5330 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5331 * or down.
5332 */
5333 static int tree_advance(struct btrfs_fs_info *fs_info,
5334 struct btrfs_path *path,
5335 int *level, int root_level,
5336 int allow_down,
5337 struct btrfs_key *key)
5338 {
5339 int ret;
5340
5341 if (*level == 0 || !allow_down) {
5342 ret = tree_move_next_or_upnext(path, level, root_level);
5343 } else {
5344 ret = tree_move_down(fs_info, path, level);
5345 }
5346 if (ret >= 0) {
5347 if (*level == 0)
5348 btrfs_item_key_to_cpu(path->nodes[*level], key,
5349 path->slots[*level]);
5350 else
5351 btrfs_node_key_to_cpu(path->nodes[*level], key,
5352 path->slots[*level]);
5353 }
5354 return ret;
5355 }
5356
5357 static int tree_compare_item(struct btrfs_path *left_path,
5358 struct btrfs_path *right_path,
5359 char *tmp_buf)
5360 {
5361 int cmp;
5362 int len1, len2;
5363 unsigned long off1, off2;
5364
5365 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5366 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5367 if (len1 != len2)
5368 return 1;
5369
5370 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5371 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5372 right_path->slots[0]);
5373
5374 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5375
5376 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5377 if (cmp)
5378 return 1;
5379 return 0;
5380 }
5381
5382 #define ADVANCE 1
5383 #define ADVANCE_ONLY_NEXT -1
5384
5385 /*
5386 * This function compares two trees and calls the provided callback for
5387 * every changed/new/deleted item it finds.
5388 * If shared tree blocks are encountered, whole subtrees are skipped, making
5389 * the compare pretty fast on snapshotted subvolumes.
5390 *
5391 * This currently works on commit roots only. As commit roots are read only,
5392 * we don't do any locking. The commit roots are protected with transactions.
5393 * Transactions are ended and rejoined when a commit is tried in between.
5394 *
5395 * This function checks for modifications done to the trees while comparing.
5396 * If it detects a change, it aborts immediately.
5397 */
5398 int btrfs_compare_trees(struct btrfs_root *left_root,
5399 struct btrfs_root *right_root,
5400 btrfs_changed_cb_t changed_cb, void *ctx)
5401 {
5402 struct btrfs_fs_info *fs_info = left_root->fs_info;
5403 int ret;
5404 int cmp;
5405 struct btrfs_path *left_path = NULL;
5406 struct btrfs_path *right_path = NULL;
5407 struct btrfs_key left_key;
5408 struct btrfs_key right_key;
5409 char *tmp_buf = NULL;
5410 int left_root_level;
5411 int right_root_level;
5412 int left_level;
5413 int right_level;
5414 int left_end_reached;
5415 int right_end_reached;
5416 int advance_left;
5417 int advance_right;
5418 u64 left_blockptr;
5419 u64 right_blockptr;
5420 u64 left_gen;
5421 u64 right_gen;
5422
5423 left_path = btrfs_alloc_path();
5424 if (!left_path) {
5425 ret = -ENOMEM;
5426 goto out;
5427 }
5428 right_path = btrfs_alloc_path();
5429 if (!right_path) {
5430 ret = -ENOMEM;
5431 goto out;
5432 }
5433
5434 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5435 if (!tmp_buf) {
5436 ret = -ENOMEM;
5437 goto out;
5438 }
5439
5440 left_path->search_commit_root = 1;
5441 left_path->skip_locking = 1;
5442 right_path->search_commit_root = 1;
5443 right_path->skip_locking = 1;
5444
5445 /*
5446 * Strategy: Go to the first items of both trees. Then do
5447 *
5448 * If both trees are at level 0
5449 * Compare keys of current items
5450 * If left < right treat left item as new, advance left tree
5451 * and repeat
5452 * If left > right treat right item as deleted, advance right tree
5453 * and repeat
5454 * If left == right do deep compare of items, treat as changed if
5455 * needed, advance both trees and repeat
5456 * If both trees are at the same level but not at level 0
5457 * Compare keys of current nodes/leaves
5458 * If left < right advance left tree and repeat
5459 * If left > right advance right tree and repeat
5460 * If left == right compare blockptrs of the next nodes/leaves
5461 * If they match advance both trees but stay at the same level
5462 * and repeat
5463 * If they don't match advance both trees while allowing to go
5464 * deeper and repeat
5465 * If tree levels are different
5466 * Advance the tree that needs it and repeat
5467 *
5468 * Advancing a tree means:
5469 * If we are at level 0, try to go to the next slot. If that's not
5470 * possible, go one level up and repeat. Stop when we find a level
5471 * where we can go to the next slot. We may at this point be on a
5472 * node or a leaf.
5473 *
5474 * If we are not at level 0 and not on shared tree blocks, go one
5475 * level deeper.
5476 *
5477 * If we are not at level 0 and on shared tree blocks, go one slot to
5478 * the right if possible or go up and right.
5479 */
5480
5481 down_read(&fs_info->commit_root_sem);
5482 left_level = btrfs_header_level(left_root->commit_root);
5483 left_root_level = left_level;
5484 left_path->nodes[left_level] =
5485 btrfs_clone_extent_buffer(left_root->commit_root);
5486 if (!left_path->nodes[left_level]) {
5487 up_read(&fs_info->commit_root_sem);
5488 ret = -ENOMEM;
5489 goto out;
5490 }
5491 extent_buffer_get(left_path->nodes[left_level]);
5492
5493 right_level = btrfs_header_level(right_root->commit_root);
5494 right_root_level = right_level;
5495 right_path->nodes[right_level] =
5496 btrfs_clone_extent_buffer(right_root->commit_root);
5497 if (!right_path->nodes[right_level]) {
5498 up_read(&fs_info->commit_root_sem);
5499 ret = -ENOMEM;
5500 goto out;
5501 }
5502 extent_buffer_get(right_path->nodes[right_level]);
5503 up_read(&fs_info->commit_root_sem);
5504
5505 if (left_level == 0)
5506 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5507 &left_key, left_path->slots[left_level]);
5508 else
5509 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5510 &left_key, left_path->slots[left_level]);
5511 if (right_level == 0)
5512 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5513 &right_key, right_path->slots[right_level]);
5514 else
5515 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5516 &right_key, right_path->slots[right_level]);
5517
5518 left_end_reached = right_end_reached = 0;
5519 advance_left = advance_right = 0;
5520
5521 while (1) {
5522 cond_resched();
5523 if (advance_left && !left_end_reached) {
5524 ret = tree_advance(fs_info, left_path, &left_level,
5525 left_root_level,
5526 advance_left != ADVANCE_ONLY_NEXT,
5527 &left_key);
5528 if (ret == -1)
5529 left_end_reached = ADVANCE;
5530 else if (ret < 0)
5531 goto out;
5532 advance_left = 0;
5533 }
5534 if (advance_right && !right_end_reached) {
5535 ret = tree_advance(fs_info, right_path, &right_level,
5536 right_root_level,
5537 advance_right != ADVANCE_ONLY_NEXT,
5538 &right_key);
5539 if (ret == -1)
5540 right_end_reached = ADVANCE;
5541 else if (ret < 0)
5542 goto out;
5543 advance_right = 0;
5544 }
5545
5546 if (left_end_reached && right_end_reached) {
5547 ret = 0;
5548 goto out;
5549 } else if (left_end_reached) {
5550 if (right_level == 0) {
5551 ret = changed_cb(left_path, right_path,
5552 &right_key,
5553 BTRFS_COMPARE_TREE_DELETED,
5554 ctx);
5555 if (ret < 0)
5556 goto out;
5557 }
5558 advance_right = ADVANCE;
5559 continue;
5560 } else if (right_end_reached) {
5561 if (left_level == 0) {
5562 ret = changed_cb(left_path, right_path,
5563 &left_key,
5564 BTRFS_COMPARE_TREE_NEW,
5565 ctx);
5566 if (ret < 0)
5567 goto out;
5568 }
5569 advance_left = ADVANCE;
5570 continue;
5571 }
5572
5573 if (left_level == 0 && right_level == 0) {
5574 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5575 if (cmp < 0) {
5576 ret = changed_cb(left_path, right_path,
5577 &left_key,
5578 BTRFS_COMPARE_TREE_NEW,
5579 ctx);
5580 if (ret < 0)
5581 goto out;
5582 advance_left = ADVANCE;
5583 } else if (cmp > 0) {
5584 ret = changed_cb(left_path, right_path,
5585 &right_key,
5586 BTRFS_COMPARE_TREE_DELETED,
5587 ctx);
5588 if (ret < 0)
5589 goto out;
5590 advance_right = ADVANCE;
5591 } else {
5592 enum btrfs_compare_tree_result result;
5593
5594 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5595 ret = tree_compare_item(left_path, right_path,
5596 tmp_buf);
5597 if (ret)
5598 result = BTRFS_COMPARE_TREE_CHANGED;
5599 else
5600 result = BTRFS_COMPARE_TREE_SAME;
5601 ret = changed_cb(left_path, right_path,
5602 &left_key, result, ctx);
5603 if (ret < 0)
5604 goto out;
5605 advance_left = ADVANCE;
5606 advance_right = ADVANCE;
5607 }
5608 } else if (left_level == right_level) {
5609 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5610 if (cmp < 0) {
5611 advance_left = ADVANCE;
5612 } else if (cmp > 0) {
5613 advance_right = ADVANCE;
5614 } else {
5615 left_blockptr = btrfs_node_blockptr(
5616 left_path->nodes[left_level],
5617 left_path->slots[left_level]);
5618 right_blockptr = btrfs_node_blockptr(
5619 right_path->nodes[right_level],
5620 right_path->slots[right_level]);
5621 left_gen = btrfs_node_ptr_generation(
5622 left_path->nodes[left_level],
5623 left_path->slots[left_level]);
5624 right_gen = btrfs_node_ptr_generation(
5625 right_path->nodes[right_level],
5626 right_path->slots[right_level]);
5627 if (left_blockptr == right_blockptr &&
5628 left_gen == right_gen) {
5629 /*
5630 * As we're on a shared block, don't
5631 * allow to go deeper.
5632 */
5633 advance_left = ADVANCE_ONLY_NEXT;
5634 advance_right = ADVANCE_ONLY_NEXT;
5635 } else {
5636 advance_left = ADVANCE;
5637 advance_right = ADVANCE;
5638 }
5639 }
5640 } else if (left_level < right_level) {
5641 advance_right = ADVANCE;
5642 } else {
5643 advance_left = ADVANCE;
5644 }
5645 }
5646
5647 out:
5648 btrfs_free_path(left_path);
5649 btrfs_free_path(right_path);
5650 kvfree(tmp_buf);
5651 return ret;
5652 }
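/*
 * Editor's note: illustrative sketch, not part of the original file. The
 * callback signature mirrors the changed_cb() invocations above; the
 * roots passed in the usage line are hypothetical snapshot roots.
 */
#if 0	/* illustrative only */
static int example_changed_cb(struct btrfs_path *left_path,
			      struct btrfs_path *right_path,
			      struct btrfs_key *key,
			      enum btrfs_compare_tree_result result,
			      void *ctx)
{
	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:	 /* only in the left (newer) tree */
	case BTRFS_COMPARE_TREE_DELETED: /* only in the right (older) tree */
	case BTRFS_COMPARE_TREE_CHANGED: /* same key, different payload */
	case BTRFS_COMPARE_TREE_SAME:	 /* byte-identical items */
		break;
	}
	return 0;	/* a negative return aborts the compare */
}

/*
 * e.g.:
 *	ret = btrfs_compare_trees(snap_root, parent_root,
 *				  example_changed_cb, NULL);
 */
#endif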
5653
5654 /*
5655 * this is similar to btrfs_next_leaf, but does not try to preserve
5656 * and fixup the path. It looks for and returns the next key in the
5657 * tree based on the current path and the min_trans parameters.
5658 *
5659 * 0 is returned if another key is found, < 0 if there are any errors
5660 * and 1 is returned if there are no higher keys in the tree
5661 *
5662 * path->keep_locks should be set to 1 on the search made before
5663 * calling this function.
5664 */
5665 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5666 struct btrfs_key *key, int level, u64 min_trans)
5667 {
5668 int slot;
5669 struct extent_buffer *c;
5670
5671 WARN_ON(!path->keep_locks);
5672 while (level < BTRFS_MAX_LEVEL) {
5673 if (!path->nodes[level])
5674 return 1;
5675
5676 slot = path->slots[level] + 1;
5677 c = path->nodes[level];
5678 next:
5679 if (slot >= btrfs_header_nritems(c)) {
5680 int ret;
5681 int orig_lowest;
5682 struct btrfs_key cur_key;
5683 if (level + 1 >= BTRFS_MAX_LEVEL ||
5684 !path->nodes[level + 1])
5685 return 1;
5686
5687 if (path->locks[level + 1]) {
5688 level++;
5689 continue;
5690 }
5691
5692 slot = btrfs_header_nritems(c) - 1;
5693 if (level == 0)
5694 btrfs_item_key_to_cpu(c, &cur_key, slot);
5695 else
5696 btrfs_node_key_to_cpu(c, &cur_key, slot);
5697
5698 orig_lowest = path->lowest_level;
5699 btrfs_release_path(path);
5700 path->lowest_level = level;
5701 ret = btrfs_search_slot(NULL, root, &cur_key, path,
5702 0, 0);
5703 path->lowest_level = orig_lowest;
5704 if (ret < 0)
5705 return ret;
5706
5707 c = path->nodes[level];
5708 slot = path->slots[level];
5709 if (ret == 0)
5710 slot++;
5711 goto next;
5712 }
5713
5714 if (level == 0)
5715 btrfs_item_key_to_cpu(c, key, slot);
5716 else {
5717 u64 gen = btrfs_node_ptr_generation(c, slot);
5718
5719 if (gen < min_trans) {
5720 slot++;
5721 goto next;
5722 }
5723 btrfs_node_key_to_cpu(c, key, slot);
5724 }
5725 return 0;
5726 }
5727 return 1;
5728 }
5729
5730 /*
5731 * search the tree again to find a leaf with greater keys
5732 * returns 0 if it found something or 1 if there are no greater leaves.
5733 * returns < 0 on io errors.
5734 */
5735 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5736 {
5737 return btrfs_next_old_leaf(root, path, 0);
5738 }
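/*
 * Editor's note: illustrative sketch, not part of the original file,
 * showing the standard leaf-walking loop built on btrfs_search_slot()
 * and btrfs_next_leaf(); the starting key is hypothetical.
 */
#if 0	/* illustrative only */
static int example_walk_items(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != objectid)
			break;
		/* process the item here */
		path->slots[0]++;
	}
	if (ret > 0)
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
#endif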
5739
5740 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5741 u64 time_seq)
5742 {
5743 int slot;
5744 int level;
5745 struct extent_buffer *c;
5746 struct extent_buffer *next;
5747 struct btrfs_key key;
5748 u32 nritems;
5749 int ret;
5750 int old_spinning = path->leave_spinning;
5751 int next_rw_lock = 0;
5752
5753 nritems = btrfs_header_nritems(path->nodes[0]);
5754 if (nritems == 0)
5755 return 1;
5756
5757 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5758 again:
5759 level = 1;
5760 next = NULL;
5761 next_rw_lock = 0;
5762 btrfs_release_path(path);
5763
5764 path->keep_locks = 1;
5765 path->leave_spinning = 1;
5766
5767 if (time_seq)
5768 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5769 else
5770 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5771 path->keep_locks = 0;
5772
5773 if (ret < 0)
5774 return ret;
5775
5776 nritems = btrfs_header_nritems(path->nodes[0]);
5777 /*
5778 * by releasing the path above we dropped all our locks. A balance
5779 * could have added more items next to the key that used to be
5780 * at the very end of the block. So, check again here and
5781 * advance the path if there are now more items available.
5782 */
5783 if (nritems > 0 && path->slots[0] < nritems - 1) {
5784 if (ret == 0)
5785 path->slots[0]++;
5786 ret = 0;
5787 goto done;
5788 }
5789 /*
5790 * So the above check misses one case:
5791 * - after releasing the path above, someone has removed the item that
5792 * used to be at the very end of the block, and balance between leaves
5793 * gets another one with bigger key.offset to replace it.
5794 *
5795 * This one should be returned as well, or we can get leaf corruption
5796 * later (esp. in __btrfs_drop_extents()).
5797 *
5798 * A bit more explanation about this check:
5799 * with ret > 0, the key wasn't found, so the path points to the slot
5800 * where it would be inserted, and the path->slots[0] item must be the
5801 * bigger one.
5802 */
5803 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5804 ret = 0;
5805 goto done;
5806 }
5807
5808 while (level < BTRFS_MAX_LEVEL) {
5809 if (!path->nodes[level]) {
5810 ret = 1;
5811 goto done;
5812 }
5813
5814 slot = path->slots[level] + 1;
5815 c = path->nodes[level];
5816 if (slot >= btrfs_header_nritems(c)) {
5817 level++;
5818 if (level == BTRFS_MAX_LEVEL) {
5819 ret = 1;
5820 goto done;
5821 }
5822 continue;
5823 }
5824
5825 if (next) {
5826 btrfs_tree_unlock_rw(next, next_rw_lock);
5827 free_extent_buffer(next);
5828 }
5829
5830 next = c;
5831 next_rw_lock = path->locks[level];
5832 ret = read_block_for_search(root, path, &next, level,
5833 slot, &key);
5834 if (ret == -EAGAIN)
5835 goto again;
5836
5837 if (ret < 0) {
5838 btrfs_release_path(path);
5839 goto done;
5840 }
5841
5842 if (!path->skip_locking) {
5843 ret = btrfs_try_tree_read_lock(next);
5844 if (!ret && time_seq) {
5845 /*
5846 * If we can't get the lock, we may be racing
5847 * with push_leaf_left, which holds that lock
5848 * while waiting for the leaf we currently have
5849 * locked. To break the deadlock, we give up
5850 * our lock and retry from the top.
5851 */
5852 free_extent_buffer(next);
5853 btrfs_release_path(path);
5854 cond_resched();
5855 goto again;
5856 }
5857 if (!ret) {
5858 btrfs_set_path_blocking(path);
5859 btrfs_tree_read_lock(next);
5860 btrfs_clear_path_blocking(path, next,
5861 BTRFS_READ_LOCK);
5862 }
5863 next_rw_lock = BTRFS_READ_LOCK;
5864 }
5865 break;
5866 }
5867 path->slots[level] = slot;
5868 while (1) {
5869 level--;
5870 c = path->nodes[level];
5871 if (path->locks[level])
5872 btrfs_tree_unlock_rw(c, path->locks[level]);
5873
5874 free_extent_buffer(c);
5875 path->nodes[level] = next;
5876 path->slots[level] = 0;
5877 if (!path->skip_locking)
5878 path->locks[level] = next_rw_lock;
5879 if (!level)
5880 break;
5881
5882 ret = read_block_for_search(root, path, &next, level,
5883 0, &key);
5884 if (ret == -EAGAIN)
5885 goto again;
5886
5887 if (ret < 0) {
5888 btrfs_release_path(path);
5889 goto done;
5890 }
5891
5892 if (!path->skip_locking) {
5893 ret = btrfs_try_tree_read_lock(next);
5894 if (!ret) {
5895 btrfs_set_path_blocking(path);
5896 btrfs_tree_read_lock(next);
5897 btrfs_clear_path_blocking(path, next,
5898 BTRFS_READ_LOCK);
5899 }
5900 next_rw_lock = BTRFS_READ_LOCK;
5901 }
5902 }
5903 ret = 0;
5904 done:
5905 unlock_up(path, 0, 1, 0, NULL);
5906 path->leave_spinning = old_spinning;
5907 if (!old_spinning)
5908 btrfs_set_path_blocking(path);
5909
5910 return ret;
5911 }
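
/*
 * Illustrative sketch, not part of the original file: how a time_seq walk
 * might be pinned and iterated, loosely modeled on the backref code.
 * 'fs_info', 'root', 'key', 'path' and 'ret' are assumed to exist in the
 * surrounding (hypothetical) function.
 */
#if 0
        struct seq_list elem = SEQ_LIST_INIT(elem);
        u64 time_seq;

        /* pin the tree mod log so the old tree version stays replayable */
        time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);

        ret = btrfs_search_old_slot(root, &key, path, time_seq);
        if (ret >= 0) {
                do {
                        /* ... consume the items in path->nodes[0] ... */
                        ret = btrfs_next_old_leaf(root, path, time_seq);
                } while (ret == 0);     /* 1: no more leaves, < 0: error */
        }

        btrfs_put_tree_mod_seq(fs_info, &elem);
#endif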
5912
5913 /*
5914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5915 * searching until it gets past min_objectid or finds an item of 'type'
5916 *
5917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5918 */
5919 int btrfs_previous_item(struct btrfs_root *root,
5920 struct btrfs_path *path, u64 min_objectid,
5921 int type)
5922 {
5923 struct btrfs_key found_key;
5924 struct extent_buffer *leaf;
5925 u32 nritems;
5926 int ret;
5927
5928 while (1) {
5929 if (path->slots[0] == 0) {
5930 btrfs_set_path_blocking(path);
5931 ret = btrfs_prev_leaf(root, path);
5932 if (ret != 0)
5933 return ret;
5934 } else {
5935 path->slots[0]--;
5936 }
5937 leaf = path->nodes[0];
5938 nritems = btrfs_header_nritems(leaf);
5939 if (nritems == 0)
5940 return 1;
5941 if (path->slots[0] == nritems)
5942 path->slots[0]--;
5943
5944 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5945 if (found_key.objectid < min_objectid)
5946 break;
5947 if (found_key.type == type)
5948 return 0;
5949 if (found_key.objectid == min_objectid &&
5950 found_key.type < type)
5951 break;
5952 }
5953 return 1;
5954 }
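
/*
 * Illustrative sketch, not part of the original file: positioning a path
 * on the last chunk item, the way the chunk code uses
 * btrfs_previous_item(). 'chunk_root', 'key', 'path', 'ret' and the
 * 'error' label are assumed to exist in the surrounding function.
 */
#if 0
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.type = BTRFS_CHUNK_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        /* key.offset of (u64)-1 can never be an exact match, so ret > 0 */

        ret = btrfs_previous_item(chunk_root, path,
                                  BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                  BTRFS_CHUNK_ITEM_KEY);
        if (ret < 0)
                goto error;
        /* ret == 1: no chunk items at all, ret == 0: path points at one */
#endif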
5955
5956 /*
5957 * search the extent tree to find a previous Metadata/Data extent item
5958 * with a minimum objectid.
5959 *
5960 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5961 */
5962 int btrfs_previous_extent_item(struct btrfs_root *root,
5963 struct btrfs_path *path, u64 min_objectid)
5964 {
5965 struct btrfs_key found_key;
5966 struct extent_buffer *leaf;
5967 u32 nritems;
5968 int ret;
5969
5970 while (1) {
5971 if (path->slots[0] == 0) {
5972 btrfs_set_path_blocking(path);
5973 ret = btrfs_prev_leaf(root, path);
5974 if (ret != 0)
5975 return ret;
5976 } else {
5977 path->slots[0]--;
5978 }
5979 leaf = path->nodes[0];
5980 nritems = btrfs_header_nritems(leaf);
5981 if (nritems == 0)
5982 return 1;
5983 if (path->slots[0] == nritems)
5984 path->slots[0]--;
5985
5986 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5987 if (found_key.objectid < min_objectid)
5988 break;
5989 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5990 found_key.type == BTRFS_METADATA_ITEM_KEY)
5991 return 0;
5992 if (found_key.objectid == min_objectid &&
5993 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5994 break;
5995 }
5996 return 1;
5997 }
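
/*
 * Illustrative sketch, not part of the original file: backing up to the
 * closest extent item at or before a logical address, similar to how
 * scrub walks the extent tree. 'extent_root', 'logical', 'key', 'path',
 * 'ret' and the 'out' label are assumed to exist in the surrounding
 * function.
 */
#if 0
        key.objectid = logical;
        key.type = BTRFS_METADATA_ITEM_KEY;     /* sorts after EXTENT_ITEM */
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_extent_item(extent_root, path, 0);
                if (ret < 0)
                        goto out;
                /* ret == 1: no extent item at or before 'logical' */
        }
#endif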