]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - fs/btrfs/tree-log.c
btrfs: fix race that causes unnecessary logging of ancestor inodes
[mirror_ubuntu-hirsute-kernel.git] / fs / btrfs / tree-log.c
CommitLineData
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
e02119d5
CM
2/*
3 * Copyright (C) 2008 Oracle. All rights reserved.
e02119d5
CM
4 */
5
6#include <linux/sched.h>
5a0e3ad6 7#include <linux/slab.h>
c6adc9cc 8#include <linux/blkdev.h>
5dc562c5 9#include <linux/list_sort.h>
c7f88c4e 10#include <linux/iversion.h>
602cbe91 11#include "misc.h"
9678c543 12#include "ctree.h"
995946dd 13#include "tree-log.h"
e02119d5
CM
14#include "disk-io.h"
15#include "locking.h"
16#include "print-tree.h"
f186373f 17#include "backref.h"
ebb8765b 18#include "compression.h"
df2c95f3 19#include "qgroup.h"
6787bb9f
NB
20#include "block-group.h"
21#include "space-info.h"
e02119d5
CM
22
23/* magic values for the inode_only field in btrfs_log_inode:
24 *
25 * LOG_INODE_ALL means to log everything
26 * LOG_INODE_EXISTS means to log just enough to recreate the inode
27 * during log replay
28 */
e13976cf
DS
/*
 * Values for the inode_only argument of btrfs_log_inode(), selecting how
 * much of the inode is copied into the log tree.
 */
enum {
	LOG_INODE_ALL,		/* log everything for this inode */
	LOG_INODE_EXISTS,	/* log just enough to recreate the inode at replay */
	LOG_OTHER_INODE,	/* NOTE(review): used for inodes other than the fsync
				 * target hit during logging — confirm exact semantics
				 * against btrfs_log_inode() */
	LOG_OTHER_INODE_ALL,	/* NOTE(review): like LOG_OTHER_INODE but logging
				 * everything — confirm against callers */
};
e02119d5 35
12fcfd22
CM
36/*
37 * directory trouble cases
38 *
39 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
40 * log, we must force a full commit before doing an fsync of the directory
41 * where the unlink was done.
42 * ---> record transid of last unlink/rename per directory
43 *
44 * mkdir foo/some_dir
45 * normal commit
46 * rename foo/some_dir foo2/some_dir
47 * mkdir foo/some_dir
48 * fsync foo/some_dir/some_file
49 *
50 * The fsync above will unlink the original some_dir without recording
51 * it in its new location (foo2). After a crash, some_dir will be gone
52 * unless the fsync of some_file forces a full commit
53 *
54 * 2) we must log any new names for any file or dir that is in the fsync
55 * log. ---> check inode while renaming/linking.
56 *
57 * 2a) we must log any new names for any file or dir during rename
58 * when the directory they are being removed from was logged.
59 * ---> check inode and old parent dir during rename
60 *
61 * 2a is actually the more important variant. With the extra logging
62 * a crash might unlink the old name without recreating the new one
63 *
64 * 3) after a crash, we must go through any directories with a link count
65 * of zero and redo the rm -rf
66 *
67 * mkdir f1/foo
68 * normal commit
69 * rm -rf f1/foo
70 * fsync(f1)
71 *
72 * The directory f1 was fully removed from the FS, but fsync was never
73 * called on f1, only its parent dir. After a crash the rm -rf must
74 * be replayed. This must be able to recurse down the entire
75 * directory tree. The inode link count fixup code takes care of the
76 * ugly details.
77 */
78
e02119d5
CM
79/*
80 * stages for the tree walking. The first
81 * stage (0) is to only pin down the blocks we find
82 * the second stage (1) is to make sure that all the inodes
83 * we find in the log are created in the subvolume.
84 *
85 * The last stage is to deal with directories and links and extents
86 * and all the other fun semantics
87 */
e13976cf
DS
/*
 * Stages of the log-tree walk during replay (see the comment above):
 * stage 0 only pins down the blocks the log uses, stage 1 makes sure all
 * inodes found in the log exist in the subvolume, and the last stage
 * handles directories, links, extents and everything else.
 */
enum {
	LOG_WALK_PIN_ONLY,	/* stage 0: pin log extents, no replay */
	LOG_WALK_REPLAY_INODES,	/* stage 1: create all logged inodes */
	LOG_WALK_REPLAY_DIR_INDEX,	/* replay directory index items */
	LOG_WALK_REPLAY_ALL,	/* final stage: replay everything else */
};
e02119d5 94
12fcfd22 95static int btrfs_log_inode(struct btrfs_trans_handle *trans,
a59108a7 96 struct btrfs_root *root, struct btrfs_inode *inode,
49dae1bc 97 int inode_only,
8407f553 98 struct btrfs_log_ctx *ctx);
ec051c0f
YZ
99static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_path *path, u64 objectid);
12fcfd22
CM
102static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root,
104 struct btrfs_root *log,
105 struct btrfs_path *path,
106 u64 dirid, int del_all);
e02119d5
CM
107
108/*
109 * tree logging is a special write ahead log used to make sure that
110 * fsyncs and O_SYNCs can happen without doing full tree commits.
111 *
112 * Full tree commits are expensive because they require commonly
113 * modified blocks to be recowed, creating many dirty pages in the
114 * extent tree an 4x-6x higher write load than ext3.
115 *
116 * Instead of doing a tree commit on every fsync, we use the
117 * key ranges and transaction ids to find items for a given file or directory
118 * that have changed in this transaction. Those items are copied into
119 * a special tree (one per subvolume root), that tree is written to disk
120 * and then the fsync is considered complete.
121 *
122 * After a crash, items are copied out of the log-tree back into the
123 * subvolume tree. Any file data extents found are recorded in the extent
124 * allocation tree, and the log-tree freed.
125 *
126 * The log tree is read three times, once to pin down all the extents it is
127 * using in ram and once, once to create all the inodes logged in the tree
128 * and once to do all the other items.
129 */
130
e02119d5
CM
131/*
132 * start a sub transaction and setup the log tree
133 * this increments the log tree writer count to make the people
134 * syncing the tree wait for us to finish
135 */
/*
 * Start a sub transaction and setup the log tree: increments the log tree
 * writer count so tasks syncing the tree wait for us to finish.
 *
 * Returns 0 on success, -EAGAIN when a full transaction commit is required
 * instead of a log commit, or a negative errno from log tree creation.
 * Must be called with a running transaction; takes root->log_mutex.
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		/* An existing log can't be used if a full commit is pending. */
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * Track whether more than one task is using this log in the
		 * current log transaction, via the MULTI_LOG_TASKS bit.
		 */
		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * First logger for this root: create the global log root
		 * tree if needed (under tree_log_mutex), then this root's
		 * own log tree.
		 */
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		/* Queue the ctx so the committer can wait on / notify it. */
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}
185
186/*
187 * returns 0 if there was a log transaction running and we were able
188 * to join, or returns -ENOENT if there were not transactions
189 * in progress
190 */
191static int join_running_log_trans(struct btrfs_root *root)
192{
193 int ret = -ENOENT;
194
e7a79811
FM
195 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
196 return ret;
197
7237f183 198 mutex_lock(&root->log_mutex);
e02119d5
CM
199 if (root->log_root) {
200 ret = 0;
7237f183 201 atomic_inc(&root->log_writers);
e02119d5 202 }
7237f183 203 mutex_unlock(&root->log_mutex);
e02119d5
CM
204 return ret;
205}
206
12fcfd22
CM
207/*
208 * This either makes the current running log transaction wait
209 * until you call btrfs_end_log_trans() or it makes any future
210 * log transactions wait until you call btrfs_end_log_trans()
211 */
/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans().
 *
 * Simply takes a log writer reference; the log commit path waits for
 * log_writers to drop to zero before writing the log tree out.
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}
216
e02119d5
CM
217/*
218 * indicate we're done making changes to the log tree
219 * and wake up anyone waiting to do a sync
220 */
/*
 * Indicate we're done making changes to the log tree and wake up anyone
 * waiting to do a sync. Pairs with start_log_trans()/btrfs_pin_log_trans().
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/*
		 * atomic_dec_and_test implies a full barrier, so the waiter
		 * side can be woken without an extra barrier.
		 */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
228
247462a5
DS
229static int btrfs_write_tree_block(struct extent_buffer *buf)
230{
231 return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
232 buf->start + buf->len - 1);
233}
234
235static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
236{
237 filemap_fdatawait_range(buf->pages[0]->mapping,
238 buf->start, buf->start + buf->len - 1);
239}
e02119d5
CM
240
241/*
242 * the walk control struct is used to pass state down the chain when
243 * processing the log tree. The stage field tells us which part
244 * of the log tree processing we are currently doing. The others
245 * are state fields used for that specific part
246 */
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part of the
 * log tree processing we are currently doing; the others are state
 * fields used for that specific part.
 */
struct walk_control {
	/*
	 * should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/*
	 * should we write out the extent buffer? This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/*
	 * should we wait for the extent buffer io to finish? Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/*
	 * pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in (LOG_WALK_*) */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/*
	 * the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
292
293/*
294 * process_func used to pin down extents, write them or wait on them
295 */
/*
 * process_func used to pin down extents, write them or wait on them,
 * depending on which wc->pin/write/wait flags are set.
 *
 * @eb may not be up to date on entry (see walk_control::process_func);
 * it is only acted on once btrfs_buffer_uptodate() confirms it.
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	/* Pin the tree block itself so it isn't reallocated during replay. */
	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		/* For leaves, also exclude the data extents they reference. */
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
327
328/*
329 * Item overwrite used by replay and tree logging. eb, slot and key all refer
330 * to the src data we are copying out.
331 *
332 * root is the tree we are copying into, and path is a scratch
333 * path for use in this function (it should be released on entry and
334 * will be released on exit).
335 *
336 * If the key is already in the destination tree the existing item is
337 * overwritten. If the existing item isn't big enough, it is extended.
338 * If it is too large, it is truncated.
339 *
340 * If the key isn't in the destination yet, a new item is inserted.
341 */
/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	/* Copying into a fs/subvol tree (replay) rather than a log tree? */
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/*
	 * don't overwrite an existing inode if the generation number
	 * was logged as zero. This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	/* restore the directory i_size the copy above clobbered */
	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}
543
544/*
545 * simple helper to read an inode off the disk from a given root
546 * This can only be called for subvolume roots and not for the log
547 */
548static noinline struct inode *read_one_inode(struct btrfs_root *root,
549 u64 objectid)
550{
551 struct inode *inode;
e02119d5 552
0202e83f 553 inode = btrfs_iget(root->fs_info->sb, objectid, root);
2e19f1f9 554 if (IS_ERR(inode))
5d4f98a2 555 inode = NULL;
e02119d5
CM
556 return inode;
557}
558
559/* replays a single extent in 'eb' at 'slot' with 'key' into the
560 * subvolume 'root'. path is released on entry and should be released
561 * on exit.
562 *
563 * extents in the log tree have not been allocated out of the extent
564 * tree yet. So, this completes the allocation, taking a reference
565 * as required if the extent already exists or creating a new extent
566 * if it isn't in the extent allocation tree yet.
567 *
568 * The extent is inserted into the file, dropping any existing extents
569 * from the file that overlap the new one.
570 */
/*
 * Replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'. path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet. So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		/* unknown extent type: nothing to replay */
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file. This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		/* A hole on a NO_HOLES fs needs no file extent item at all. */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree? If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			/*
			 * Compressed extents are checksummed over the whole
			 * on-disk extent; uncompressed ones only over the
			 * part this file extent item references.
			 */
			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing cums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *	extent data disk byte 12845056 nr 102400
			 *	extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to lookup up for the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							      fs_info->csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}
841
842/*
843 * when cleaning up conflicts between the directory names in the
844 * subvolume, directory names in the log and directory names in the
845 * inode back references, we may have to unlink inodes from directories.
846 *
847 * This is a helper function to do the unlink of a specific directory
848 * item
849 */
/*
 * When cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item: it links the victim inode into the fixup dir first (so a zero
 * link count is handled later by the fixup code), then unlinks it
 * from @dir.
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	/* copy the name out before the path (and di) is released */
	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		/* iput(NULL) below is a no-op */
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}
895
896/*
897 * helper function to see if a given name and sequence number found
898 * in an inode back reference are already in a directory and correctly
899 * point to this inode
900 */
901static noinline int inode_in_dir(struct btrfs_root *root,
902 struct btrfs_path *path,
903 u64 dirid, u64 objectid, u64 index,
904 const char *name, int name_len)
905{
906 struct btrfs_dir_item *di;
907 struct btrfs_key location;
908 int match = 0;
909
910 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
911 index, name, name_len, 0);
912 if (di && !IS_ERR(di)) {
913 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
914 if (location.objectid != objectid)
915 goto out;
916 } else
917 goto out;
b3b4aa74 918 btrfs_release_path(path);
e02119d5
CM
919
920 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
921 if (di && !IS_ERR(di)) {
922 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
923 if (location.objectid != objectid)
924 goto out;
925 } else
926 goto out;
927 match = 1;
928out:
b3b4aa74 929 btrfs_release_path(path);
e02119d5
CM
930 return match;
931}
932
933/*
934 * helper function to check a log tree for a named back reference in
935 * an inode. This is used to decide if a back reference that is
936 * found in the subvolume conflicts with what we find in the log.
937 *
938 * inode backreferences may have multiple refs in a single item,
939 * during replay we process one reference at a time, and we don't
940 * want to delete valid links to a file from the subvolume if that
941 * link is also in the log.
942 */
943static noinline int backref_in_log(struct btrfs_root *log,
944 struct btrfs_key *key,
f186373f 945 u64 ref_objectid,
df8d116f 946 const char *name, int namelen)
e02119d5
CM
947{
948 struct btrfs_path *path;
e02119d5 949 int ret;
e02119d5
CM
950
951 path = btrfs_alloc_path();
2a29edc6 952 if (!path)
953 return -ENOMEM;
954
e02119d5 955 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
d3316c82
NB
956 if (ret < 0) {
957 goto out;
958 } else if (ret == 1) {
89cbf5f6 959 ret = 0;
f186373f
MF
960 goto out;
961 }
962
89cbf5f6
NB
963 if (key->type == BTRFS_INODE_EXTREF_KEY)
964 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
965 path->slots[0],
966 ref_objectid,
967 name, namelen);
968 else
969 ret = !!btrfs_find_name_in_backref(path->nodes[0],
970 path->slots[0],
971 name, namelen);
e02119d5
CM
972out:
973 btrfs_free_path(path);
89cbf5f6 974 return ret;
e02119d5
CM
975}
976
5a1d7843 977static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
e02119d5 978 struct btrfs_root *root,
e02119d5 979 struct btrfs_path *path,
5a1d7843 980 struct btrfs_root *log_root,
94c91a1f
NB
981 struct btrfs_inode *dir,
982 struct btrfs_inode *inode,
f186373f
MF
983 u64 inode_objectid, u64 parent_objectid,
984 u64 ref_index, char *name, int namelen,
985 int *search_done)
e02119d5 986{
34f3e4f2 987 int ret;
f186373f
MF
988 char *victim_name;
989 int victim_name_len;
990 struct extent_buffer *leaf;
5a1d7843 991 struct btrfs_dir_item *di;
f186373f
MF
992 struct btrfs_key search_key;
993 struct btrfs_inode_extref *extref;
c622ae60 994
f186373f
MF
995again:
996 /* Search old style refs */
997 search_key.objectid = inode_objectid;
998 search_key.type = BTRFS_INODE_REF_KEY;
999 search_key.offset = parent_objectid;
1000 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
e02119d5 1001 if (ret == 0) {
e02119d5
CM
1002 struct btrfs_inode_ref *victim_ref;
1003 unsigned long ptr;
1004 unsigned long ptr_end;
f186373f
MF
1005
1006 leaf = path->nodes[0];
e02119d5
CM
1007
1008 /* are we trying to overwrite a back ref for the root directory
1009 * if so, just jump out, we're done
1010 */
f186373f 1011 if (search_key.objectid == search_key.offset)
5a1d7843 1012 return 1;
e02119d5
CM
1013
1014 /* check all the names in this back reference to see
1015 * if they are in the log. if so, we allow them to stay
1016 * otherwise they must be unlinked as a conflict
1017 */
1018 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1019 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
d397712b 1020 while (ptr < ptr_end) {
e02119d5
CM
1021 victim_ref = (struct btrfs_inode_ref *)ptr;
1022 victim_name_len = btrfs_inode_ref_name_len(leaf,
1023 victim_ref);
1024 victim_name = kmalloc(victim_name_len, GFP_NOFS);
3650860b
JB
1025 if (!victim_name)
1026 return -ENOMEM;
e02119d5
CM
1027
1028 read_extent_buffer(leaf, victim_name,
1029 (unsigned long)(victim_ref + 1),
1030 victim_name_len);
1031
d3316c82
NB
1032 ret = backref_in_log(log_root, &search_key,
1033 parent_objectid, victim_name,
1034 victim_name_len);
1035 if (ret < 0) {
1036 kfree(victim_name);
1037 return ret;
1038 } else if (!ret) {
94c91a1f 1039 inc_nlink(&inode->vfs_inode);
b3b4aa74 1040 btrfs_release_path(path);
12fcfd22 1041
94c91a1f 1042 ret = btrfs_unlink_inode(trans, root, dir, inode,
4ec5934e 1043 victim_name, victim_name_len);
f186373f 1044 kfree(victim_name);
3650860b
JB
1045 if (ret)
1046 return ret;
e5c304e6 1047 ret = btrfs_run_delayed_items(trans);
ada9af21
FDBM
1048 if (ret)
1049 return ret;
f186373f
MF
1050 *search_done = 1;
1051 goto again;
e02119d5
CM
1052 }
1053 kfree(victim_name);
f186373f 1054
e02119d5
CM
1055 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1056 }
e02119d5 1057
c622ae60 1058 /*
1059 * NOTE: we have searched root tree and checked the
bb7ab3b9 1060 * corresponding ref, it does not need to check again.
c622ae60 1061 */
5a1d7843 1062 *search_done = 1;
e02119d5 1063 }
b3b4aa74 1064 btrfs_release_path(path);
e02119d5 1065
f186373f
MF
1066 /* Same search but for extended refs */
1067 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1068 inode_objectid, parent_objectid, 0,
1069 0);
1070 if (!IS_ERR_OR_NULL(extref)) {
1071 u32 item_size;
1072 u32 cur_offset = 0;
1073 unsigned long base;
1074 struct inode *victim_parent;
1075
1076 leaf = path->nodes[0];
1077
1078 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1079 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1080
1081 while (cur_offset < item_size) {
dd9ef135 1082 extref = (struct btrfs_inode_extref *)(base + cur_offset);
f186373f
MF
1083
1084 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1085
1086 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1087 goto next;
1088
1089 victim_name = kmalloc(victim_name_len, GFP_NOFS);
3650860b
JB
1090 if (!victim_name)
1091 return -ENOMEM;
f186373f
MF
1092 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1093 victim_name_len);
1094
1095 search_key.objectid = inode_objectid;
1096 search_key.type = BTRFS_INODE_EXTREF_KEY;
1097 search_key.offset = btrfs_extref_hash(parent_objectid,
1098 victim_name,
1099 victim_name_len);
d3316c82
NB
1100 ret = backref_in_log(log_root, &search_key,
1101 parent_objectid, victim_name,
1102 victim_name_len);
1103 if (ret < 0) {
1104 return ret;
1105 } else if (!ret) {
f186373f
MF
1106 ret = -ENOENT;
1107 victim_parent = read_one_inode(root,
94c91a1f 1108 parent_objectid);
f186373f 1109 if (victim_parent) {
94c91a1f 1110 inc_nlink(&inode->vfs_inode);
f186373f
MF
1111 btrfs_release_path(path);
1112
1113 ret = btrfs_unlink_inode(trans, root,
4ec5934e 1114 BTRFS_I(victim_parent),
94c91a1f 1115 inode,
4ec5934e
NB
1116 victim_name,
1117 victim_name_len);
ada9af21
FDBM
1118 if (!ret)
1119 ret = btrfs_run_delayed_items(
e5c304e6 1120 trans);
f186373f 1121 }
f186373f
MF
1122 iput(victim_parent);
1123 kfree(victim_name);
3650860b
JB
1124 if (ret)
1125 return ret;
f186373f
MF
1126 *search_done = 1;
1127 goto again;
1128 }
1129 kfree(victim_name);
f186373f
MF
1130next:
1131 cur_offset += victim_name_len + sizeof(*extref);
1132 }
1133 *search_done = 1;
1134 }
1135 btrfs_release_path(path);
1136
34f3e4f2 1137 /* look for a conflicting sequence number */
94c91a1f 1138 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
f186373f 1139 ref_index, name, namelen, 0);
34f3e4f2 1140 if (di && !IS_ERR(di)) {
94c91a1f 1141 ret = drop_one_dir_item(trans, root, path, dir, di);
3650860b
JB
1142 if (ret)
1143 return ret;
34f3e4f2 1144 }
1145 btrfs_release_path(path);
1146
52042d8e 1147 /* look for a conflicting name */
94c91a1f 1148 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
34f3e4f2 1149 name, namelen, 0);
1150 if (di && !IS_ERR(di)) {
94c91a1f 1151 ret = drop_one_dir_item(trans, root, path, dir, di);
3650860b
JB
1152 if (ret)
1153 return ret;
34f3e4f2 1154 }
1155 btrfs_release_path(path);
1156
5a1d7843
JS
1157 return 0;
1158}
e02119d5 1159
bae15d95
QW
1160static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1161 u32 *namelen, char **name, u64 *index,
1162 u64 *parent_objectid)
f186373f
MF
1163{
1164 struct btrfs_inode_extref *extref;
1165
1166 extref = (struct btrfs_inode_extref *)ref_ptr;
1167
1168 *namelen = btrfs_inode_extref_name_len(eb, extref);
1169 *name = kmalloc(*namelen, GFP_NOFS);
1170 if (*name == NULL)
1171 return -ENOMEM;
1172
1173 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1174 *namelen);
1175
1f250e92
FM
1176 if (index)
1177 *index = btrfs_inode_extref_index(eb, extref);
f186373f
MF
1178 if (parent_objectid)
1179 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1180
1181 return 0;
1182}
1183
bae15d95
QW
1184static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1185 u32 *namelen, char **name, u64 *index)
f186373f
MF
1186{
1187 struct btrfs_inode_ref *ref;
1188
1189 ref = (struct btrfs_inode_ref *)ref_ptr;
1190
1191 *namelen = btrfs_inode_ref_name_len(eb, ref);
1192 *name = kmalloc(*namelen, GFP_NOFS);
1193 if (*name == NULL)
1194 return -ENOMEM;
1195
1196 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1197
1f250e92
FM
1198 if (index)
1199 *index = btrfs_inode_ref_index(eb, ref);
f186373f
MF
1200
1201 return 0;
1202}
1203
/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 *
 * Every unlink restarts the scan from the top ("goto again") because the
 * unlink mutates the subvolume ref item we are iterating over.
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		/* No matching ref item in the subvolume: nothing to unlink. */
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	/* Walk every (name, parent) packed into the subvolume's ref item. */
	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			/* extrefs carry their own parent id per entry. */
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			/* For plain refs the parent is the key offset. */
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		/* Is this name also present in the log's ref item? */
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			/* Name missing from the log: unlink it and rescan. */
			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(path);
	return ret;
}
1292
0d836392
FM
1293static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1294 const u8 ref_type, const char *name,
1295 const int namelen)
1296{
1297 struct btrfs_key key;
1298 struct btrfs_path *path;
1299 const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1300 int ret;
1301
1302 path = btrfs_alloc_path();
1303 if (!path)
1304 return -ENOMEM;
1305
1306 key.objectid = btrfs_ino(BTRFS_I(inode));
1307 key.type = ref_type;
1308 if (key.type == BTRFS_INODE_REF_KEY)
1309 key.offset = parent_id;
1310 else
1311 key.offset = btrfs_extref_hash(parent_id, name, namelen);
1312
1313 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1314 if (ret < 0)
1315 goto out;
1316 if (ret > 0) {
1317 ret = 0;
1318 goto out;
1319 }
1320 if (key.type == BTRFS_INODE_EXTREF_KEY)
6ff49c6a
NB
1321 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1322 path->slots[0], parent_id, name, namelen);
0d836392 1323 else
9bb8407f
NB
1324 ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1325 name, namelen);
0d836392
FM
1326
1327out:
1328 btrfs_free_path(path);
1329 return ret;
1330}
1331
6b5fc433
FM
1332static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1333 struct inode *dir, struct inode *inode, const char *name,
1334 int namelen, u64 ref_index)
1335{
1336 struct btrfs_dir_item *dir_item;
1337 struct btrfs_key key;
1338 struct btrfs_path *path;
1339 struct inode *other_inode = NULL;
1340 int ret;
1341
1342 path = btrfs_alloc_path();
1343 if (!path)
1344 return -ENOMEM;
1345
1346 dir_item = btrfs_lookup_dir_item(NULL, root, path,
1347 btrfs_ino(BTRFS_I(dir)),
1348 name, namelen, 0);
1349 if (!dir_item) {
1350 btrfs_release_path(path);
1351 goto add_link;
1352 } else if (IS_ERR(dir_item)) {
1353 ret = PTR_ERR(dir_item);
1354 goto out;
1355 }
1356
1357 /*
1358 * Our inode's dentry collides with the dentry of another inode which is
1359 * in the log but not yet processed since it has a higher inode number.
1360 * So delete that other dentry.
1361 */
1362 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1363 btrfs_release_path(path);
1364 other_inode = read_one_inode(root, key.objectid);
1365 if (!other_inode) {
1366 ret = -ENOENT;
1367 goto out;
1368 }
1369 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1370 name, namelen);
1371 if (ret)
1372 goto out;
1373 /*
1374 * If we dropped the link count to 0, bump it so that later the iput()
1375 * on the inode will not free it. We will fixup the link count later.
1376 */
1377 if (other_inode->i_nlink == 0)
1378 inc_nlink(other_inode);
1379
1380 ret = btrfs_run_delayed_items(trans);
1381 if (ret)
1382 goto out;
1383add_link:
1384 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1385 name, namelen, 0, ref_index);
1386out:
1387 iput(other_inode);
1388 btrfs_free_path(path);
1389
1390 return ret;
1391}
1392
5a1d7843
JS
1393/*
1394 * replay one inode back reference item found in the log tree.
1395 * eb, slot and key refer to the buffer and key found in the log tree.
1396 * root is the destination we are replaying into, and path is for temp
1397 * use by this function. (it should be released on return).
1398 */
1399static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1400 struct btrfs_root *root,
1401 struct btrfs_root *log,
1402 struct btrfs_path *path,
1403 struct extent_buffer *eb, int slot,
1404 struct btrfs_key *key)
1405{
03b2f08b
GB
1406 struct inode *dir = NULL;
1407 struct inode *inode = NULL;
5a1d7843
JS
1408 unsigned long ref_ptr;
1409 unsigned long ref_end;
03b2f08b 1410 char *name = NULL;
5a1d7843
JS
1411 int namelen;
1412 int ret;
1413 int search_done = 0;
f186373f
MF
1414 int log_ref_ver = 0;
1415 u64 parent_objectid;
1416 u64 inode_objectid;
f46dbe3d 1417 u64 ref_index = 0;
f186373f
MF
1418 int ref_struct_size;
1419
1420 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1421 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1422
1423 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1424 struct btrfs_inode_extref *r;
1425
1426 ref_struct_size = sizeof(struct btrfs_inode_extref);
1427 log_ref_ver = 1;
1428 r = (struct btrfs_inode_extref *)ref_ptr;
1429 parent_objectid = btrfs_inode_extref_parent(eb, r);
1430 } else {
1431 ref_struct_size = sizeof(struct btrfs_inode_ref);
1432 parent_objectid = key->offset;
1433 }
1434 inode_objectid = key->objectid;
e02119d5 1435
5a1d7843
JS
1436 /*
1437 * it is possible that we didn't log all the parent directories
1438 * for a given inode. If we don't find the dir, just don't
1439 * copy the back ref in. The link count fixup code will take
1440 * care of the rest
1441 */
f186373f 1442 dir = read_one_inode(root, parent_objectid);
03b2f08b
GB
1443 if (!dir) {
1444 ret = -ENOENT;
1445 goto out;
1446 }
5a1d7843 1447
f186373f 1448 inode = read_one_inode(root, inode_objectid);
5a1d7843 1449 if (!inode) {
03b2f08b
GB
1450 ret = -EIO;
1451 goto out;
5a1d7843
JS
1452 }
1453
5a1d7843 1454 while (ref_ptr < ref_end) {
f186373f 1455 if (log_ref_ver) {
bae15d95
QW
1456 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1457 &ref_index, &parent_objectid);
f186373f
MF
1458 /*
1459 * parent object can change from one array
1460 * item to another.
1461 */
1462 if (!dir)
1463 dir = read_one_inode(root, parent_objectid);
03b2f08b
GB
1464 if (!dir) {
1465 ret = -ENOENT;
1466 goto out;
1467 }
f186373f 1468 } else {
bae15d95
QW
1469 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1470 &ref_index);
f186373f
MF
1471 }
1472 if (ret)
03b2f08b 1473 goto out;
5a1d7843
JS
1474
1475 /* if we already have a perfect match, we're done */
f85b7379
DS
1476 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1477 btrfs_ino(BTRFS_I(inode)), ref_index,
1478 name, namelen)) {
5a1d7843
JS
1479 /*
1480 * look for a conflicting back reference in the
1481 * metadata. if we find one we have to unlink that name
1482 * of the file before we add our new link. Later on, we
1483 * overwrite any existing back reference, and we don't
1484 * want to create dangling pointers in the directory.
1485 */
1486
1487 if (!search_done) {
1488 ret = __add_inode_ref(trans, root, path, log,
94c91a1f 1489 BTRFS_I(dir),
d75eefdf 1490 BTRFS_I(inode),
f186373f
MF
1491 inode_objectid,
1492 parent_objectid,
1493 ref_index, name, namelen,
5a1d7843 1494 &search_done);
03b2f08b
GB
1495 if (ret) {
1496 if (ret == 1)
1497 ret = 0;
3650860b
JB
1498 goto out;
1499 }
5a1d7843
JS
1500 }
1501
0d836392
FM
1502 /*
1503 * If a reference item already exists for this inode
1504 * with the same parent and name, but different index,
1505 * drop it and the corresponding directory index entries
1506 * from the parent before adding the new reference item
1507 * and dir index entries, otherwise we would fail with
1508 * -EEXIST returned from btrfs_add_link() below.
1509 */
1510 ret = btrfs_inode_ref_exists(inode, dir, key->type,
1511 name, namelen);
1512 if (ret > 0) {
1513 ret = btrfs_unlink_inode(trans, root,
1514 BTRFS_I(dir),
1515 BTRFS_I(inode),
1516 name, namelen);
1517 /*
1518 * If we dropped the link count to 0, bump it so
1519 * that later the iput() on the inode will not
1520 * free it. We will fixup the link count later.
1521 */
1522 if (!ret && inode->i_nlink == 0)
1523 inc_nlink(inode);
1524 }
1525 if (ret < 0)
1526 goto out;
1527
5a1d7843 1528 /* insert our name */
6b5fc433
FM
1529 ret = add_link(trans, root, dir, inode, name, namelen,
1530 ref_index);
3650860b
JB
1531 if (ret)
1532 goto out;
5a1d7843 1533
9a56fcd1 1534 btrfs_update_inode(trans, root, BTRFS_I(inode));
5a1d7843
JS
1535 }
1536
f186373f 1537 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
5a1d7843 1538 kfree(name);
03b2f08b 1539 name = NULL;
f186373f
MF
1540 if (log_ref_ver) {
1541 iput(dir);
1542 dir = NULL;
1543 }
5a1d7843 1544 }
e02119d5 1545
1f250e92
FM
1546 /*
1547 * Before we overwrite the inode reference item in the subvolume tree
1548 * with the item from the log tree, we must unlink all names from the
1549 * parent directory that are in the subvolume's tree inode reference
1550 * item, otherwise we end up with an inconsistent subvolume tree where
1551 * dir index entries exist for a name but there is no inode reference
1552 * item with the same name.
1553 */
1554 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1555 key);
1556 if (ret)
1557 goto out;
1558
e02119d5
CM
1559 /* finally write the back reference in the inode */
1560 ret = overwrite_item(trans, root, path, eb, slot, key);
5a1d7843 1561out:
b3b4aa74 1562 btrfs_release_path(path);
03b2f08b 1563 kfree(name);
e02119d5
CM
1564 iput(dir);
1565 iput(inode);
3650860b 1566 return ret;
e02119d5
CM
1567}
1568
f186373f 1569static int count_inode_extrefs(struct btrfs_root *root,
36283658 1570 struct btrfs_inode *inode, struct btrfs_path *path)
f186373f
MF
1571{
1572 int ret = 0;
1573 int name_len;
1574 unsigned int nlink = 0;
1575 u32 item_size;
1576 u32 cur_offset = 0;
36283658 1577 u64 inode_objectid = btrfs_ino(inode);
f186373f
MF
1578 u64 offset = 0;
1579 unsigned long ptr;
1580 struct btrfs_inode_extref *extref;
1581 struct extent_buffer *leaf;
1582
1583 while (1) {
1584 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1585 &extref, &offset);
1586 if (ret)
1587 break;
c71bf099 1588
f186373f
MF
1589 leaf = path->nodes[0];
1590 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1591 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
2c2c452b 1592 cur_offset = 0;
f186373f
MF
1593
1594 while (cur_offset < item_size) {
1595 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1596 name_len = btrfs_inode_extref_name_len(leaf, extref);
1597
1598 nlink++;
1599
1600 cur_offset += name_len + sizeof(*extref);
1601 }
1602
1603 offset++;
1604 btrfs_release_path(path);
1605 }
1606 btrfs_release_path(path);
1607
2c2c452b 1608 if (ret < 0 && ret != -ENOENT)
f186373f
MF
1609 return ret;
1610 return nlink;
1611}
1612
/*
 * Count the number of names stored in old-style inode ref items for @inode,
 * walking the BTRFS_INODE_REF_KEY items backwards from offset (u64)-1.
 *
 * Returns the accumulated count.
 *
 * NOTE(review): a negative return from btrfs_search_slot() breaks out of the
 * loop and is discarded - the count gathered so far is returned instead of
 * the error.  Confirm whether callers rely on that best-effort behavior.
 */
static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			/* Key not found: step back to the previous item. */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		/* Count every (name, index) entry packed into this item. */
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		/* Reuse the current leaf if it has an earlier slot. */
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}
1669
1670/*
1671 * There are a few corners where the link count of the file can't
1672 * be properly maintained during replay. So, instead of adding
1673 * lots of complexity to the log code, we just scan the backrefs
1674 * for any file that has been through replay.
1675 *
1676 * The scan will update the link count on the inode to reflect the
1677 * number of back refs found. If it goes down to zero, the iput
1678 * will free the inode.
1679 */
1680static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1681 struct btrfs_root *root,
1682 struct inode *inode)
1683{
1684 struct btrfs_path *path;
1685 int ret;
1686 u64 nlink = 0;
4a0cc7ca 1687 u64 ino = btrfs_ino(BTRFS_I(inode));
f186373f
MF
1688
1689 path = btrfs_alloc_path();
1690 if (!path)
1691 return -ENOMEM;
1692
f329e319 1693 ret = count_inode_refs(root, BTRFS_I(inode), path);
f186373f
MF
1694 if (ret < 0)
1695 goto out;
1696
1697 nlink = ret;
1698
36283658 1699 ret = count_inode_extrefs(root, BTRFS_I(inode), path);
f186373f
MF
1700 if (ret < 0)
1701 goto out;
1702
1703 nlink += ret;
1704
1705 ret = 0;
1706
e02119d5 1707 if (nlink != inode->i_nlink) {
bfe86848 1708 set_nlink(inode, nlink);
9a56fcd1 1709 btrfs_update_inode(trans, root, BTRFS_I(inode));
e02119d5 1710 }
8d5bf1cb 1711 BTRFS_I(inode)->index_cnt = (u64)-1;
e02119d5 1712
c71bf099
YZ
1713 if (inode->i_nlink == 0) {
1714 if (S_ISDIR(inode->i_mode)) {
1715 ret = replay_dir_deletes(trans, root, NULL, path,
33345d01 1716 ino, 1);
3650860b
JB
1717 if (ret)
1718 goto out;
c71bf099 1719 }
ecdcf3c2
NB
1720 ret = btrfs_insert_orphan_item(trans, root, ino);
1721 if (ret == -EEXIST)
1722 ret = 0;
12fcfd22 1723 }
12fcfd22 1724
f186373f
MF
1725out:
1726 btrfs_free_path(path);
1727 return ret;
e02119d5
CM
1728}
1729
1730static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1731 struct btrfs_root *root,
1732 struct btrfs_path *path)
1733{
1734 int ret;
1735 struct btrfs_key key;
1736 struct inode *inode;
1737
1738 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1739 key.type = BTRFS_ORPHAN_ITEM_KEY;
1740 key.offset = (u64)-1;
d397712b 1741 while (1) {
e02119d5
CM
1742 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1743 if (ret < 0)
1744 break;
1745
1746 if (ret == 1) {
1747 if (path->slots[0] == 0)
1748 break;
1749 path->slots[0]--;
1750 }
1751
1752 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1753 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1754 key.type != BTRFS_ORPHAN_ITEM_KEY)
1755 break;
1756
1757 ret = btrfs_del_item(trans, root, path);
65a246c5
TI
1758 if (ret)
1759 goto out;
e02119d5 1760
b3b4aa74 1761 btrfs_release_path(path);
e02119d5 1762 inode = read_one_inode(root, key.offset);
c00e9493
TI
1763 if (!inode)
1764 return -EIO;
e02119d5
CM
1765
1766 ret = fixup_inode_link_count(trans, root, inode);
e02119d5 1767 iput(inode);
3650860b
JB
1768 if (ret)
1769 goto out;
e02119d5 1770
12fcfd22
CM
1771 /*
1772 * fixup on a directory may create new entries,
1773 * make sure we always look for the highset possible
1774 * offset
1775 */
1776 key.offset = (u64)-1;
e02119d5 1777 }
65a246c5
TI
1778 ret = 0;
1779out:
b3b4aa74 1780 btrfs_release_path(path);
65a246c5 1781 return ret;
e02119d5
CM
1782}
1783
1784
1785/*
1786 * record a given inode in the fixup dir so we can check its link
1787 * count when replay is done. The link count is incremented here
1788 * so the inode won't go away until we check it
1789 */
1790static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1791 struct btrfs_root *root,
1792 struct btrfs_path *path,
1793 u64 objectid)
1794{
1795 struct btrfs_key key;
1796 int ret = 0;
1797 struct inode *inode;
1798
1799 inode = read_one_inode(root, objectid);
c00e9493
TI
1800 if (!inode)
1801 return -EIO;
e02119d5
CM
1802
1803 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
962a298f 1804 key.type = BTRFS_ORPHAN_ITEM_KEY;
e02119d5
CM
1805 key.offset = objectid;
1806
1807 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1808
b3b4aa74 1809 btrfs_release_path(path);
e02119d5 1810 if (ret == 0) {
9bf7a489
JB
1811 if (!inode->i_nlink)
1812 set_nlink(inode, 1);
1813 else
8b558c5f 1814 inc_nlink(inode);
9a56fcd1 1815 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
e02119d5
CM
1816 } else if (ret == -EEXIST) {
1817 ret = 0;
1818 } else {
3650860b 1819 BUG(); /* Logic Error */
e02119d5
CM
1820 }
1821 iput(inode);
1822
1823 return ret;
1824}
1825
1826/*
1827 * when replaying the log for a directory, we only insert names
1828 * for inodes that actually exist. This means an fsync on a directory
1829 * does not implicitly fsync all the new files in it
1830 */
1831static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1832 struct btrfs_root *root,
e02119d5 1833 u64 dirid, u64 index,
60d53eb3 1834 char *name, int name_len,
e02119d5
CM
1835 struct btrfs_key *location)
1836{
1837 struct inode *inode;
1838 struct inode *dir;
1839 int ret;
1840
1841 inode = read_one_inode(root, location->objectid);
1842 if (!inode)
1843 return -ENOENT;
1844
1845 dir = read_one_inode(root, dirid);
1846 if (!dir) {
1847 iput(inode);
1848 return -EIO;
1849 }
d555438b 1850
db0a669f
NB
1851 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1852 name_len, 1, index);
e02119d5
CM
1853
1854 /* FIXME, put inode into FIXUP list */
1855
1856 iput(inode);
1857 iput(dir);
1858 return ret;
1859}
1860
1861/*
1862 * take a single entry in a log directory item and replay it into
1863 * the subvolume.
1864 *
1865 * if a conflicting item exists in the subdirectory already,
1866 * the inode it points to is unlinked and put into the link count
1867 * fix up tree.
1868 *
1869 * If a name from the log points to a file or directory that does
1870 * not exist in the FS, it is skipped. fsyncs on directories
1871 * do not force down inodes inside that directory, just changes to the
1872 * names or unlinks in a directory.
bb53eda9
FM
1873 *
1874 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1875 * non-existing inode) and 1 if the name was replayed.
e02119d5
CM
1876 */
1877static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1878 struct btrfs_root *root,
1879 struct btrfs_path *path,
1880 struct extent_buffer *eb,
1881 struct btrfs_dir_item *di,
1882 struct btrfs_key *key)
1883{
1884 char *name;
1885 int name_len;
1886 struct btrfs_dir_item *dst_di;
1887 struct btrfs_key found_key;
1888 struct btrfs_key log_key;
1889 struct inode *dir;
e02119d5 1890 u8 log_type;
4bef0848 1891 int exists;
3650860b 1892 int ret = 0;
d555438b 1893 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bb53eda9 1894 bool name_added = false;
e02119d5
CM
1895
1896 dir = read_one_inode(root, key->objectid);
c00e9493
TI
1897 if (!dir)
1898 return -EIO;
e02119d5
CM
1899
1900 name_len = btrfs_dir_name_len(eb, di);
1901 name = kmalloc(name_len, GFP_NOFS);
2bac325e
FDBM
1902 if (!name) {
1903 ret = -ENOMEM;
1904 goto out;
1905 }
2a29edc6 1906
e02119d5
CM
1907 log_type = btrfs_dir_type(eb, di);
1908 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1909 name_len);
1910
1911 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
4bef0848
CM
1912 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1913 if (exists == 0)
1914 exists = 1;
1915 else
1916 exists = 0;
b3b4aa74 1917 btrfs_release_path(path);
4bef0848 1918
e02119d5
CM
1919 if (key->type == BTRFS_DIR_ITEM_KEY) {
1920 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1921 name, name_len, 1);
d397712b 1922 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
e02119d5
CM
1923 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1924 key->objectid,
1925 key->offset, name,
1926 name_len, 1);
1927 } else {
3650860b
JB
1928 /* Corruption */
1929 ret = -EINVAL;
1930 goto out;
e02119d5 1931 }
c704005d 1932 if (IS_ERR_OR_NULL(dst_di)) {
e02119d5
CM
1933 /* we need a sequence number to insert, so we only
1934 * do inserts for the BTRFS_DIR_INDEX_KEY types
1935 */
1936 if (key->type != BTRFS_DIR_INDEX_KEY)
1937 goto out;
1938 goto insert;
1939 }
1940
1941 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1942 /* the existing item matches the logged item */
1943 if (found_key.objectid == log_key.objectid &&
1944 found_key.type == log_key.type &&
1945 found_key.offset == log_key.offset &&
1946 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
a2cc11db 1947 update_size = false;
e02119d5
CM
1948 goto out;
1949 }
1950
1951 /*
1952 * don't drop the conflicting directory entry if the inode
1953 * for the new entry doesn't exist
1954 */
4bef0848 1955 if (!exists)
e02119d5
CM
1956 goto out;
1957
207e7d92 1958 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
3650860b
JB
1959 if (ret)
1960 goto out;
e02119d5
CM
1961
1962 if (key->type == BTRFS_DIR_INDEX_KEY)
1963 goto insert;
1964out:
b3b4aa74 1965 btrfs_release_path(path);
d555438b 1966 if (!ret && update_size) {
6ef06d27 1967 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
9a56fcd1 1968 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
d555438b 1969 }
e02119d5
CM
1970 kfree(name);
1971 iput(dir);
bb53eda9
FM
1972 if (!ret && name_added)
1973 ret = 1;
3650860b 1974 return ret;
e02119d5
CM
1975
1976insert:
725af92a
NB
1977 /*
1978 * Check if the inode reference exists in the log for the given name,
1979 * inode and parent inode
1980 */
1981 found_key.objectid = log_key.objectid;
1982 found_key.type = BTRFS_INODE_REF_KEY;
1983 found_key.offset = key->objectid;
1984 ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
1985 if (ret < 0) {
1986 goto out;
1987 } else if (ret) {
1988 /* The dentry will be added later. */
1989 ret = 0;
1990 update_size = false;
1991 goto out;
1992 }
1993
1994 found_key.objectid = log_key.objectid;
1995 found_key.type = BTRFS_INODE_EXTREF_KEY;
1996 found_key.offset = key->objectid;
1997 ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
1998 name_len);
1999 if (ret < 0) {
2000 goto out;
2001 } else if (ret) {
df8d116f
FM
2002 /* The dentry will be added later. */
2003 ret = 0;
2004 update_size = false;
2005 goto out;
2006 }
b3b4aa74 2007 btrfs_release_path(path);
60d53eb3
Z
2008 ret = insert_one_name(trans, root, key->objectid, key->offset,
2009 name, name_len, &log_key);
df8d116f 2010 if (ret && ret != -ENOENT && ret != -EEXIST)
3650860b 2011 goto out;
bb53eda9
FM
2012 if (!ret)
2013 name_added = true;
d555438b 2014 update_size = false;
3650860b 2015 ret = 0;
e02119d5
CM
2016 goto out;
2017}
2018
2019/*
2020 * find all the names in a directory item and reconcile them into
2021 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2022 * one name in a directory item, but the same code gets used for
2023 * both directory index types
2024 */
2025static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 2026 struct btrfs_root *root,
 2027 struct btrfs_path *path,
 2028 struct extent_buffer *eb, int slot,
 2029 struct btrfs_key *key)
 2030{
bb53eda9 2031 int ret = 0;
e02119d5
CM
 2032 u32 item_size = btrfs_item_size_nr(eb, slot);
 2033 struct btrfs_dir_item *di;
 2034 int name_len;
 2035 unsigned long ptr;
 2036 unsigned long ptr_end;
/* Allocated lazily, only if a link count fixup turns out to be needed. */
bb53eda9 2037 struct btrfs_path *fixup_path = NULL;
e02119d5
CM
 2038
 2039 ptr = btrfs_item_ptr_offset(eb, slot);
 2040 ptr_end = ptr + item_size;
/* A single DIR_ITEM can pack several names back to back; walk them all. */
d397712b 2041 while (ptr < ptr_end) {
e02119d5
CM
 2042 di = (struct btrfs_dir_item *)ptr;
 2043 name_len = btrfs_dir_name_len(eb, di);
 2044 ret = replay_one_name(trans, root, path, eb, di, key);
bb53eda9
FM
 2045 if (ret < 0)
 2046 break;
e02119d5
CM
 2047 ptr = (unsigned long)(di + 1);
 2048 ptr += name_len;
bb53eda9
FM
 2049
 2050 /*
 2051 * If this entry refers to a non-directory (directories can not
 2052 * have a link count > 1) and it was added in the transaction
 2053 * that was not committed, make sure we fixup the link count of
 2054 * the inode it the entry points to. Otherwise something like
 2055 * the following would result in a directory pointing to an
 2056 * inode with a wrong link that does not account for this dir
 2057 * entry:
 2058 *
 2059 * mkdir testdir
 2060 * touch testdir/foo
 2061 * touch testdir/bar
 2062 * sync
 2063 *
 2064 * ln testdir/bar testdir/bar_link
 2065 * ln testdir/foo testdir/foo_link
 2066 * xfs_io -c "fsync" testdir/bar
 2067 *
 2068 * <power failure>
 2069 *
 2070 * mount fs, log replay happens
 2071 *
 2072 * File foo would remain with a link count of 1 when it has two
 2073 * entries pointing to it in the directory testdir. This would
 2074 * make it impossible to ever delete the parent directory has
 2075 * it would result in stale dentries that can never be deleted.
 2076 */
/* ret == 1 means replay_one_name() inserted a brand new dentry. */
 2077 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
 2078 struct btrfs_key di_key;
 2079
 2080 if (!fixup_path) {
 2081 fixup_path = btrfs_alloc_path();
 2082 if (!fixup_path) {
 2083 ret = -ENOMEM;
 2084 break;
 2085 }
 2086 }
 2087
 2088 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 2089 ret = link_to_fixup_dir(trans, root, fixup_path,
 2090 di_key.objectid);
 2091 if (ret)
 2092 break;
 2093 }
 2094 ret = 0;
e02119d5 2095 }
bb53eda9
FM
/* fixup_path may be NULL here; btrfs_free_path() handles that. */
 2096 btrfs_free_path(fixup_path);
 2097 return ret;
e02119d5
CM
 2098}
2099
2100/*
2101 * directory replay has two parts. There are the standard directory
2102 * items in the log copied from the subvolume, and range items
2103 * created in the log while the subvolume was logged.
2104 *
2105 * The range items tell us which parts of the key space the log
2106 * is authoritative for. During replay, if a key in the subvolume
2107 * directory is in a logged range item, but not actually in the log
2108 * that means it was deleted from the directory before the fsync
2109 * and should be removed.
2110 */
/*
 * Returns 0 and sets [*start_ret, *end_ret] to the logged range that contains
 * or follows *start_ret, returns 1 when no such range exists, or a negative
 * errno on search failure. The path is always released before returning.
 */
2111static noinline int find_dir_range(struct btrfs_root *root,
 2112 struct btrfs_path *path,
 2113 u64 dirid, int key_type,
 2114 u64 *start_ret, u64 *end_ret)
 2115{
 2116 struct btrfs_key key;
 2117 u64 found_end;
 2118 struct btrfs_dir_log_item *item;
 2119 int ret;
 2120 int nritems;
 2121
/* A start of (u64)-1 means the previous range already reached the end. */
 2122 if (*start_ret == (u64)-1)
 2123 return 1;
 2124
 2125 key.objectid = dirid;
 2126 key.type = key_type;
 2127 key.offset = *start_ret;
 2128
 2129 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 2130 if (ret < 0)
 2131 goto out;
 2132 if (ret > 0) {
 2133 if (path->slots[0] == 0)
 2134 goto out;
 2135 path->slots[0]--;
 2136 }
 2137 if (ret != 0)
 2138 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 2139
 2140 if (key.type != key_type || key.objectid != dirid) {
 2141 ret = 1;
 2142 goto next;
 2143 }
 2144 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 2145 struct btrfs_dir_log_item);
 2146 found_end = btrfs_dir_log_end(path->nodes[0], item);
 2147
/* The range item at or before *start_ret covers it: report it directly. */
 2148 if (*start_ret >= key.offset && *start_ret <= found_end) {
 2149 ret = 0;
 2150 *start_ret = key.offset;
 2151 *end_ret = found_end;
 2152 goto out;
 2153 }
 2154 ret = 1;
 2155next:
 2156 /* check the next slot in the tree to see if it is a valid item */
 2157 nritems = btrfs_header_nritems(path->nodes[0]);
2a7bf53f 2158 path->slots[0]++;
e02119d5
CM
 2159 if (path->slots[0] >= nritems) {
 2160 ret = btrfs_next_leaf(root, path);
 2161 if (ret)
 2162 goto out;
e02119d5
CM
 2163 }
 2164
 2165 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 2166
 2167 if (key.type != key_type || key.objectid != dirid) {
 2168 ret = 1;
 2169 goto out;
 2170 }
 2171 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 2172 struct btrfs_dir_log_item);
 2173 found_end = btrfs_dir_log_end(path->nodes[0], item);
/* Found the next range after *start_ret; advance the caller to it. */
 2174 *start_ret = key.offset;
 2175 *end_ret = found_end;
 2176 ret = 0;
 2177out:
b3b4aa74 2178 btrfs_release_path(path);
e02119d5
CM
 2179 return ret;
 2180}
2181
2182/*
2183 * this looks for a given directory item in the log. If the directory
2184 * item is not in the log, the item is removed and the inode it points
2185 * to is unlinked
2186 */
2187static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
 2188 struct btrfs_root *root,
 2189 struct btrfs_root *log,
 2190 struct btrfs_path *path,
 2191 struct btrfs_path *log_path,
 2192 struct inode *dir,
 2193 struct btrfs_key *dir_key)
 2194{
 2195 int ret;
 2196 struct extent_buffer *eb;
 2197 int slot;
 2198 u32 item_size;
 2199 struct btrfs_dir_item *di;
 2200 struct btrfs_dir_item *log_di;
 2201 int name_len;
 2202 unsigned long ptr;
 2203 unsigned long ptr_end;
 2204 char *name;
 2205 struct inode *inode;
 2206 struct btrfs_key location;
 2207
/* Re-entered after each unlink, because both search paths get released. */
 2208again:
 2209 eb = path->nodes[0];
 2210 slot = path->slots[0];
 2211 item_size = btrfs_item_size_nr(eb, slot);
 2212 ptr = btrfs_item_ptr_offset(eb, slot);
 2213 ptr_end = ptr + item_size;
d397712b 2214 while (ptr < ptr_end) {
e02119d5
CM
 2215 di = (struct btrfs_dir_item *)ptr;
 2216 name_len = btrfs_dir_name_len(eb, di);
 2217 name = kmalloc(name_len, GFP_NOFS);
 2218 if (!name) {
 2219 ret = -ENOMEM;
 2220 goto out;
 2221 }
 2222 read_extent_buffer(eb, name, (unsigned long)(di + 1),
 2223 name_len);
 2224 log_di = NULL;
/* With a NULL log (del_all replay), every name counts as "not logged". */
12fcfd22 2225 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
e02119d5
CM
 2226 log_di = btrfs_lookup_dir_item(trans, log, log_path,
 2227 dir_key->objectid,
 2228 name, name_len, 0);
12fcfd22 2229 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
e02119d5
CM
 2230 log_di = btrfs_lookup_dir_index_item(trans, log,
 2231 log_path,
 2232 dir_key->objectid,
 2233 dir_key->offset,
 2234 name, name_len, 0);
 2235 }
/* Name missing from the log: the entry was deleted, unlink it here too. */
8d9e220c 2236 if (!log_di || log_di == ERR_PTR(-ENOENT)) {
e02119d5 2237 btrfs_dir_item_key_to_cpu(eb, di, &location);
b3b4aa74
DS
 2238 btrfs_release_path(path);
 2239 btrfs_release_path(log_path);
e02119d5 2240 inode = read_one_inode(root, location.objectid);
c00e9493
TI
 2241 if (!inode) {
 2242 kfree(name);
 2243 return -EIO;
 2244 }
e02119d5
CM
 2245
 2246 ret = link_to_fixup_dir(trans, root,
 2247 path, location.objectid);
3650860b
JB
 2248 if (ret) {
 2249 kfree(name);
 2250 iput(inode);
 2251 goto out;
 2252 }
 2253
/*
 * NOTE(review): the nlink bump appears to balance the drop done by
 * btrfs_unlink_inode() below, keeping the inode alive for the fixup
 * dir reference added above — confirm against link_to_fixup_dir().
 */
8b558c5f 2254 inc_nlink(inode);
4ec5934e
NB
 2255 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
 2256 BTRFS_I(inode), name, name_len);
3650860b 2257 if (!ret)
e5c304e6 2258 ret = btrfs_run_delayed_items(trans);
e02119d5
CM
 2259 kfree(name);
 2260 iput(inode);
3650860b
JB
 2261 if (ret)
 2262 goto out;
e02119d5
CM
 2263
 2264 /* there might still be more names under this key
 2265 * check and repeat if required
 2266 */
 2267 ret = btrfs_search_slot(NULL, root, dir_key, path,
 2268 0, 0);
 2269 if (ret == 0)
 2270 goto again;
 2271 ret = 0;
 2272 goto out;
269d040f
FDBM
 2273 } else if (IS_ERR(log_di)) {
 2274 kfree(name);
 2275 return PTR_ERR(log_di);
e02119d5 2276 }
b3b4aa74 2277 btrfs_release_path(log_path);
e02119d5
CM
 2278 kfree(name);
 2279
 2280 ptr = (unsigned long)(di + 1);
 2281 ptr += name_len;
 2282 }
 2283 ret = 0;
 2284out:
b3b4aa74
DS
 2285 btrfs_release_path(path);
 2286 btrfs_release_path(log_path);
e02119d5
CM
 2287 return ret;
 2288}
2289
4f764e51
FM
/*
 * Delete from the subvolume tree any xattr of inode 'ino' that is no longer
 * present in the log tree: for every BTRFS_XATTR_ITEM_KEY of 'ino' in 'root',
 * look the name up in 'log' and remove it if the lookup finds nothing.
 * Returns 0 on success or a negative errno.
 */
2290static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
 2291 struct btrfs_root *root,
 2292 struct btrfs_root *log,
 2293 struct btrfs_path *path,
 2294 const u64 ino)
 2295{
 2296 struct btrfs_key search_key;
 2297 struct btrfs_path *log_path;
 2298 int i;
 2299 int nritems;
 2300 int ret;
 2301
 2302 log_path = btrfs_alloc_path();
 2303 if (!log_path)
 2304 return -ENOMEM;
 2305
 2306 search_key.objectid = ino;
 2307 search_key.type = BTRFS_XATTR_ITEM_KEY;
 2308 search_key.offset = 0;
/* Restarted from the last processed key after each deletion. */
 2309again:
 2310 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 2311 if (ret < 0)
 2312 goto out;
 2313process_leaf:
 2314 nritems = btrfs_header_nritems(path->nodes[0]);
 2315 for (i = path->slots[0]; i < nritems; i++) {
 2316 struct btrfs_key key;
 2317 struct btrfs_dir_item *di;
 2318 struct btrfs_dir_item *log_di;
 2319 u32 total_size;
 2320 u32 cur;
 2321
 2322 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
 2323 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
 2324 ret = 0;
 2325 goto out;
 2326 }
 2327
 2328 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
 2329 total_size = btrfs_item_size_nr(path->nodes[0], i);
 2330 cur = 0;
/* One xattr item can pack several name/value pairs; walk them all. */
 2331 while (cur < total_size) {
 2332 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
 2333 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
 2334 u32 this_len = sizeof(*di) + name_len + data_len;
 2335 char *name;
 2336
 2337 name = kmalloc(name_len, GFP_NOFS);
 2338 if (!name) {
 2339 ret = -ENOMEM;
 2340 goto out;
 2341 }
 2342 read_extent_buffer(path->nodes[0], name,
 2343 (unsigned long)(di + 1), name_len);
 2344
 2345 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
 2346 name, name_len, 0);
 2347 btrfs_release_path(log_path);
 2348 if (!log_di) {
 2349 /* Doesn't exist in log tree, so delete it. */
 2350 btrfs_release_path(path);
 2351 di = btrfs_lookup_xattr(trans, root, path, ino,
 2352 name, name_len, -1);
 2353 kfree(name);
 2354 if (IS_ERR(di)) {
 2355 ret = PTR_ERR(di);
 2356 goto out;
 2357 }
 2358 ASSERT(di);
 2359 ret = btrfs_delete_one_dir_name(trans, root,
 2360 path, di);
 2361 if (ret)
 2362 goto out;
 2363 btrfs_release_path(path);
 2364 search_key = key;
 2365 goto again;
 2366 }
 2367 kfree(name);
 2368 if (IS_ERR(log_di)) {
 2369 ret = PTR_ERR(log_di);
 2370 goto out;
 2371 }
 2372 cur += this_len;
 2373 di = (struct btrfs_dir_item *)((char *)di + this_len);
 2374 }
 2375 }
 2376 ret = btrfs_next_leaf(root, path);
 2377 if (ret > 0)
 2378 ret = 0;
 2379 else if (ret == 0)
 2380 goto process_leaf;
 2381out:
 2382 btrfs_free_path(log_path);
 2383 btrfs_release_path(path);
 2384 return ret;
 2385}
2386
2387
e02119d5
CM
2388/*
2389 * deletion replay happens before we copy any new directory items
2390 * out of the log or out of backreferences from inodes. It
2391 * scans the log to find ranges of keys that log is authoritative for,
2392 * and then scans the directory to find items in those ranges that are
2393 * not present in the log.
2394 *
2395 * Anything we don't find in the log is unlinked and removed from the
2396 * directory.
2397 */
2398static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 2399 struct btrfs_root *root,
 2400 struct btrfs_root *log,
 2401 struct btrfs_path *path,
/* del_all != 0: remove every entry, ignoring the logged range items. */
12fcfd22 2402 u64 dirid, int del_all)
e02119d5
CM
 2403{
 2404 u64 range_start;
 2405 u64 range_end;
 2406 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
 2407 int ret = 0;
 2408 struct btrfs_key dir_key;
 2409 struct btrfs_key found_key;
 2410 struct btrfs_path *log_path;
 2411 struct inode *dir;
 2412
 2413 dir_key.objectid = dirid;
 2414 dir_key.type = BTRFS_DIR_ITEM_KEY;
 2415 log_path = btrfs_alloc_path();
 2416 if (!log_path)
 2417 return -ENOMEM;
 2418
 2419 dir = read_one_inode(root, dirid);
 2420 /* it isn't an error if the inode isn't there, that can happen
 2421 * because we replay the deletes before we copy in the inode item
 2422 * from the log
 2423 */
 2424 if (!dir) {
 2425 btrfs_free_path(log_path);
 2426 return 0;
 2427 }
/* Whole scan runs twice: once for DIR_ITEM keys, once for DIR_INDEX keys. */
 2428again:
 2429 range_start = 0;
 2430 range_end = 0;
d397712b 2431 while (1) {
12fcfd22
CM
 2432 if (del_all)
 2433 range_end = (u64)-1;
 2434 else {
 2435 ret = find_dir_range(log, path, dirid, key_type,
 2436 &range_start, &range_end);
 2437 if (ret != 0)
 2438 break;
 2439 }
e02119d5
CM
 2440
 2441 dir_key.offset = range_start;
/* Check every subvolume entry inside the authoritative range. */
d397712b 2442 while (1) {
e02119d5
CM
 2443 int nritems;
 2444 ret = btrfs_search_slot(NULL, root, &dir_key, path,
 2445 0, 0);
 2446 if (ret < 0)
 2447 goto out;
 2448
 2449 nritems = btrfs_header_nritems(path->nodes[0]);
 2450 if (path->slots[0] >= nritems) {
 2451 ret = btrfs_next_leaf(root, path);
b98def7c 2452 if (ret == 1)
e02119d5 2453 break;
b98def7c
LB
 2454 else if (ret < 0)
 2455 goto out;
e02119d5
CM
 2456 }
 2457 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 2458 path->slots[0]);
 2459 if (found_key.objectid != dirid ||
 2460 found_key.type != dir_key.type)
 2461 goto next_type;
 2462
 2463 if (found_key.offset > range_end)
 2464 break;
 2465
 2466 ret = check_item_in_log(trans, root, log, path,
12fcfd22
CM
 2467 log_path, dir,
 2468 &found_key);
3650860b
JB
 2469 if (ret)
 2470 goto out;
e02119d5
CM
 2471 if (found_key.offset == (u64)-1)
 2472 break;
 2473 dir_key.offset = found_key.offset + 1;
 2474 }
b3b4aa74 2475 btrfs_release_path(path);
e02119d5
CM
 2476 if (range_end == (u64)-1)
 2477 break;
 2478 range_start = range_end + 1;
 2479 }
 2480
 2481next_type:
 2482 ret = 0;
 2483 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
 2484 key_type = BTRFS_DIR_LOG_INDEX_KEY;
 2485 dir_key.type = BTRFS_DIR_INDEX_KEY;
b3b4aa74 2486 btrfs_release_path(path);
e02119d5
CM
 2487 goto again;
 2488 }
 2489out:
b3b4aa74 2490 btrfs_release_path(path);
e02119d5
CM
 2491 btrfs_free_path(log_path);
 2492 iput(dir);
 2493 return ret;
 2494}
2495
2496/*
2497 * the process_func used to replay items from the log tree. This
2498 * gets called in two different stages. The first stage just looks
2499 * for inodes and makes sure they are all copied into the subvolume.
2500 *
2501 * The second stage copies all the other item types from the log into
2502 * the subvolume. The two stage approach is slower, but gets rid of
2503 * lots of complexity around inodes referencing other inodes that exist
2504 * only in the log (references come from either directory items or inode
2505 * back refs).
2506 */
2507static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
581c1760 2508 struct walk_control *wc, u64 gen, int level)
e02119d5
CM
 2509{
 2510 int nritems;
 2511 struct btrfs_path *path;
 2512 struct btrfs_root *root = wc->replay_dest;
 2513 struct btrfs_key key;
e02119d5
CM
 2514 int i;
 2515 int ret;
 2516
581c1760 2517 ret = btrfs_read_buffer(eb, gen, level, NULL);
018642a1
TI
 2518 if (ret)
 2519 return ret;
e02119d5
CM
 2520
 2521 level = btrfs_header_level(eb);
 2522
/* Only leaf blocks carry items to replay; interior nodes are skipped. */
 2523 if (level != 0)
 2524 return 0;
 2525
 2526 path = btrfs_alloc_path();
1e5063d0
MF
 2527 if (!path)
 2528 return -ENOMEM;
e02119d5
CM
 2529
 2530 nritems = btrfs_header_nritems(eb);
 2531 for (i = 0; i < nritems; i++) {
 2532 btrfs_item_key_to_cpu(eb, &key, i);
e02119d5
CM
 2533
 2534 /* inode keys are done during the first stage */
 2535 if (key.type == BTRFS_INODE_ITEM_KEY &&
 2536 wc->stage == LOG_WALK_REPLAY_INODES) {
e02119d5
CM
 2537 struct btrfs_inode_item *inode_item;
 2538 u32 mode;
 2539
 2540 inode_item = btrfs_item_ptr(eb, i,
 2541 struct btrfs_inode_item);
f2d72f42
FM
 2542 /*
 2543 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
 2544 * and never got linked before the fsync, skip it, as
 2545 * replaying it is pointless since it would be deleted
 2546 * later. We skip logging tmpfiles, but it's always
 2547 * possible we are replaying a log created with a kernel
 2548 * that used to log tmpfiles.
 2549 */
 2550 if (btrfs_inode_nlink(eb, inode_item) == 0) {
 2551 wc->ignore_cur_inode = true;
 2552 continue;
 2553 } else {
 2554 wc->ignore_cur_inode = false;
 2555 }
4f764e51
FM
 2556 ret = replay_xattr_deletes(wc->trans, root, log,
 2557 path, key.objectid);
 2558 if (ret)
 2559 break;
e02119d5
CM
 2560 mode = btrfs_inode_mode(eb, inode_item);
 2561 if (S_ISDIR(mode)) {
 2562 ret = replay_dir_deletes(wc->trans,
12fcfd22 2563 root, log, path, key.objectid, 0);
b50c6e25
JB
 2564 if (ret)
 2565 break;
e02119d5
CM
 2566 }
 2567 ret = overwrite_item(wc->trans, root, path,
 2568 eb, i, &key);
b50c6e25
JB
 2569 if (ret)
 2570 break;
e02119d5 2571
471d557a
FM
 2572 /*
 2573 * Before replaying extents, truncate the inode to its
 2574 * size. We need to do it now and not after log replay
 2575 * because before an fsync we can have prealloc extents
 2576 * added beyond the inode's i_size. If we did it after,
 2577 * through orphan cleanup for example, we would drop
 2578 * those prealloc extents just after replaying them.
e02119d5
CM
 2579 */
 2580 if (S_ISREG(mode)) {
5893dfb9 2581 struct btrfs_drop_extents_args drop_args = { 0 };
471d557a
FM
 2582 struct inode *inode;
 2583 u64 from;
 2584
 2585 inode = read_one_inode(root, key.objectid);
 2586 if (!inode) {
 2587 ret = -EIO;
 2588 break;
 2589 }
 2590 from = ALIGN(i_size_read(inode),
 2591 root->fs_info->sectorsize);
5893dfb9
FM
 2592 drop_args.start = from;
 2593 drop_args.end = (u64)-1;
 2594 drop_args.drop_cache = true;
 2595 ret = btrfs_drop_extents(wc->trans, root,
 2596 BTRFS_I(inode),
 2597 &drop_args);
471d557a 2598 if (!ret) {
2766ff61
FM
 2599 inode_sub_bytes(inode,
 2600 drop_args.bytes_found);
f2d72f42 2601 /* Update the inode's nbytes. */
471d557a 2602 ret = btrfs_update_inode(wc->trans,
9a56fcd1 2603 root, BTRFS_I(inode));
471d557a
FM
 2604 }
 2605 iput(inode);
b50c6e25
JB
 2606 if (ret)
 2607 break;
e02119d5 2608 }
c71bf099 2609
e02119d5
CM
 2610 ret = link_to_fixup_dir(wc->trans, root,
 2611 path, key.objectid);
b50c6e25
JB
 2612 if (ret)
 2613 break;
e02119d5 2614 }
dd8e7217 2615
f2d72f42
FM
 2616 if (wc->ignore_cur_inode)
 2617 continue;
 2618
/* DIR_INDEX items get their own replay stage, before REPLAY_ALL. */
dd8e7217
JB
 2619 if (key.type == BTRFS_DIR_INDEX_KEY &&
 2620 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
 2621 ret = replay_one_dir_item(wc->trans, root, path,
 2622 eb, i, &key);
 2623 if (ret)
 2624 break;
 2625 }
 2626
e02119d5
CM
 2627 if (wc->stage < LOG_WALK_REPLAY_ALL)
 2628 continue;
 2629
 2630 /* these keys are simply copied */
 2631 if (key.type == BTRFS_XATTR_ITEM_KEY) {
 2632 ret = overwrite_item(wc->trans, root, path,
 2633 eb, i, &key);
b50c6e25
JB
 2634 if (ret)
 2635 break;
2da1c669
LB
 2636 } else if (key.type == BTRFS_INODE_REF_KEY ||
 2637 key.type == BTRFS_INODE_EXTREF_KEY) {
f186373f
MF
 2638 ret = add_inode_ref(wc->trans, root, log, path,
 2639 eb, i, &key);
b50c6e25
JB
/* -ENOENT from add_inode_ref() is tolerated: the ref may be gone. */
 2640 if (ret && ret != -ENOENT)
 2641 break;
 2642 ret = 0;
e02119d5
CM
 2643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
 2644 ret = replay_one_extent(wc->trans, root, path,
 2645 eb, i, &key);
b50c6e25
JB
 2646 if (ret)
 2647 break;
dd8e7217 2648 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
e02119d5
CM
 2649 ret = replay_one_dir_item(wc->trans, root, path,
 2650 eb, i, &key);
b50c6e25
JB
 2651 if (ret)
 2652 break;
e02119d5
CM
 2653 }
 2654 }
 2655 btrfs_free_path(path);
b50c6e25 2656 return ret;
e02119d5
CM
 2657}
2658
6787bb9f
NB
2659/*
2660 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2661 */
2662static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
 2663{
 2664 struct btrfs_block_group *cache;
 2665
 2666 cache = btrfs_lookup_block_group(fs_info, start);
 2667 if (!cache) {
 2668 btrfs_err(fs_info, "unable to find block group for %llu", start);
 2669 return;
 2670 }
 2671
/* Each log tree buffer accounts for one nodesize of reserved space. */
 2672 spin_lock(&cache->space_info->lock);
 2673 spin_lock(&cache->lock);
 2674 cache->reserved -= fs_info->nodesize;
 2675 cache->space_info->bytes_reserved -= fs_info->nodesize;
 2676 spin_unlock(&cache->lock);
 2677 spin_unlock(&cache->space_info->lock);
 2678
 2679 btrfs_put_block_group(cache);
 2680}
2681
/*
 * Descend from the current path position towards level 0, running
 * wc->process_func on every level-0 child as it is reached. When wc->free is
 * set, each processed child block is also either pinned and cleaned (when a
 * transaction handle is given) or its dirty bit cleared and its reservation
 * dropped via unaccount_log_buffer(). Returns 0 when the current level's
 * slots are exhausted, or a negative errno.
 */
d397712b 2682static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
e02119d5
CM
 2683 struct btrfs_root *root,
 2684 struct btrfs_path *path, int *level,
 2685 struct walk_control *wc)
 2686{
0b246afa 2687 struct btrfs_fs_info *fs_info = root->fs_info;
e02119d5
CM
 2688 u64 bytenr;
 2689 u64 ptr_gen;
 2690 struct extent_buffer *next;
 2691 struct extent_buffer *cur;
e02119d5
CM
 2692 u32 blocksize;
 2693 int ret = 0;
 2694
d397712b 2695 while (*level > 0) {
581c1760
QW
 2696 struct btrfs_key first_key;
 2697
e02119d5
CM
 2698 cur = path->nodes[*level];
 2699
fae7f21c 2700 WARN_ON(btrfs_header_level(cur) != *level);
e02119d5
CM
 2701
 2702 if (path->slots[*level] >=
 2703 btrfs_header_nritems(cur))
 2704 break;
 2705
 2706 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 2707 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
581c1760 2708 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
0b246afa 2709 blocksize = fs_info->nodesize;
e02119d5 2710
3fbaf258
JB
 2711 next = btrfs_find_create_tree_block(fs_info, bytenr,
 2712 btrfs_header_owner(cur),
 2713 *level - 1);
c871b0f2
LB
 2714 if (IS_ERR(next))
 2715 return PTR_ERR(next);
e02119d5 2716
/* Children of a level-1 node are leaves: process them in place. */
e02119d5 2717 if (*level == 1) {
581c1760
QW
 2718 ret = wc->process_func(root, next, wc, ptr_gen,
 2719 *level - 1);
b50c6e25
JB
 2720 if (ret) {
 2721 free_extent_buffer(next);
1e5063d0 2722 return ret;
b50c6e25 2723 }
4a500fd1 2724
e02119d5
CM
 2725 path->slots[*level]++;
 2726 if (wc->free) {
581c1760
QW
 2727 ret = btrfs_read_buffer(next, ptr_gen,
 2728 *level - 1, &first_key);
018642a1
TI
 2729 if (ret) {
 2730 free_extent_buffer(next);
 2731 return ret;
 2732 }
e02119d5 2733
681ae509
JB
 2734 if (trans) {
 2735 btrfs_tree_lock(next);
6a884d7d 2736 btrfs_clean_tree_block(next);
681ae509
JB
 2737 btrfs_wait_tree_block_writeback(next);
 2738 btrfs_tree_unlock(next);
7bfc1007 2739 ret = btrfs_pin_reserved_extent(trans,
10e958d5
NB
 2740 bytenr, blocksize);
 2741 if (ret) {
 2742 free_extent_buffer(next);
 2743 return ret;
 2744 }
1846430c
LB
 2745 } else {
 2746 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
 2747 clear_extent_buffer_dirty(next);
10e958d5 2748 unaccount_log_buffer(fs_info, bytenr);
3650860b 2749 }
e02119d5
CM
 2750 }
 2751 free_extent_buffer(next);
 2752 continue;
 2753 }
/* Interior child: read it and step the path down one level. */
581c1760 2754 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
018642a1
TI
 2755 if (ret) {
 2756 free_extent_buffer(next);
 2757 return ret;
 2758 }
e02119d5 2759
e02119d5
CM
 2760 if (path->nodes[*level-1])
 2761 free_extent_buffer(path->nodes[*level-1]);
 2762 path->nodes[*level-1] = next;
 2763 *level = btrfs_header_level(next);
 2764 path->slots[*level] = 0;
 2765 cond_resched();
 2766 }
4a500fd1 2767 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
e02119d5
CM
 2768
 2769 cond_resched();
 2770 return 0;
 2771}
2772
/*
 * Pop back up the path, running wc->process_func on every node whose slots
 * have been fully consumed (and freeing/pinning it when wc->free is set).
 * Returns 0 when an unvisited sibling slot is found (with *level updated so
 * the caller can walk back down), 1 when the whole tree has been finished,
 * or a negative errno.
 */
d397712b 2773static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
e02119d5
CM
 2774 struct btrfs_root *root,
 2775 struct btrfs_path *path, int *level,
 2776 struct walk_control *wc)
 2777{
0b246afa 2778 struct btrfs_fs_info *fs_info = root->fs_info;
e02119d5
CM
 2779 int i;
 2780 int slot;
 2781 int ret;
 2782
d397712b 2783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
e02119d5 2784 slot = path->slots[i];
4a500fd1 2785 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
e02119d5
CM
 2786 path->slots[i]++;
 2787 *level = i;
 2788 WARN_ON(*level == 0);
 2789 return 0;
 2790 } else {
1e5063d0 2791 ret = wc->process_func(root, path->nodes[*level], wc,
581c1760
QW
 2792 btrfs_header_generation(path->nodes[*level]),
 2793 *level);
1e5063d0
MF
 2794 if (ret)
 2795 return ret;
 2796
e02119d5
CM
 2797 if (wc->free) {
 2798 struct extent_buffer *next;
 2799
 2800 next = path->nodes[*level];
 2801
681ae509
JB
 2802 if (trans) {
 2803 btrfs_tree_lock(next);
6a884d7d 2804 btrfs_clean_tree_block(next);
681ae509
JB
 2805 btrfs_wait_tree_block_writeback(next);
 2806 btrfs_tree_unlock(next);
7bfc1007 2807 ret = btrfs_pin_reserved_extent(trans,
10e958d5
NB
 2808 path->nodes[*level]->start,
 2809 path->nodes[*level]->len);
 2810 if (ret)
 2811 return ret;
1846430c
LB
 2812 } else {
 2813 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
 2814 clear_extent_buffer_dirty(next);
e02119d5 2815
10e958d5
NB
 2816 unaccount_log_buffer(fs_info,
 2817 path->nodes[*level]->start);
 2818 }
e02119d5
CM
 2819 }
 2820 free_extent_buffer(path->nodes[*level]);
 2821 path->nodes[*level] = NULL;
 2822 *level = i + 1;
 2823 }
 2824 }
 2825 return 1;
 2826}
2827
2828/*
2829 * drop the reference count on the tree rooted at 'snap'. This traverses
2830 * the tree freeing any blocks that have a ref count of zero after being
2831 * decremented.
2832 */
2833static int walk_log_tree(struct btrfs_trans_handle *trans,
 2834 struct btrfs_root *log, struct walk_control *wc)
 2835{
2ff7e61e 2836 struct btrfs_fs_info *fs_info = log->fs_info;
e02119d5
CM
 2837 int ret = 0;
 2838 int wret;
 2839 int level;
 2840 struct btrfs_path *path;
e02119d5
CM
 2841 int orig_level;
 2842
 2843 path = btrfs_alloc_path();
db5b493a
TI
 2844 if (!path)
 2845 return -ENOMEM;
e02119d5
CM
 2846
 2847 level = btrfs_header_level(log->node);
 2848 orig_level = level;
 2849 path->nodes[level] = log->node;
67439dad 2850 atomic_inc(&log->node->refs);
e02119d5
CM
 2851 path->slots[level] = 0;
 2852
/* Alternate down/up walks until walk_up reports the tree is finished. */
d397712b 2853 while (1) {
e02119d5
CM
 2854 wret = walk_down_log_tree(trans, log, path, &level, wc);
 2855 if (wret > 0)
 2856 break;
79787eaa 2857 if (wret < 0) {
e02119d5 2858 ret = wret;
79787eaa
JM
 2859 goto out;
 2860 }
e02119d5
CM
 2861
 2862 wret = walk_up_log_tree(trans, log, path, &level, wc);
 2863 if (wret > 0)
 2864 break;
79787eaa 2865 if (wret < 0) {
e02119d5 2866 ret = wret;
79787eaa
JM
 2867 goto out;
 2868 }
e02119d5
CM
 2869 }
 2870
 2871 /* was the root node processed? if not, catch it here */
 2872 if (path->nodes[orig_level]) {
79787eaa 2873 ret = wc->process_func(log, path->nodes[orig_level], wc,
581c1760
QW
 2874 btrfs_header_generation(path->nodes[orig_level]),
 2875 orig_level);
79787eaa
JM
 2876 if (ret)
 2877 goto out;
e02119d5
CM
 2878 if (wc->free) {
 2879 struct extent_buffer *next;
 2880
 2881 next = path->nodes[orig_level];
 2882
681ae509
JB
 2883 if (trans) {
 2884 btrfs_tree_lock(next);
6a884d7d 2885 btrfs_clean_tree_block(next);
681ae509
JB
 2886 btrfs_wait_tree_block_writeback(next);
 2887 btrfs_tree_unlock(next);
7bfc1007 2888 ret = btrfs_pin_reserved_extent(trans,
10e958d5
NB
 2889 next->start, next->len);
 2890 if (ret)
 2891 goto out;
1846430c
LB
 2892 } else {
 2893 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
 2894 clear_extent_buffer_dirty(next);
10e958d5 2895 unaccount_log_buffer(fs_info, next->start);
681ae509 2896 }
e02119d5
CM
 2897 }
 2898 }
 2899
79787eaa 2900out:
e02119d5 2901 btrfs_free_path(path);
e02119d5
CM
 2902 return ret;
 2903}
2904
7237f183
YZ
2905/*
2906 * helper function to update the item for a given subvolumes log root
2907 * in the tree of log roots
2908 */
2909static int update_log_root(struct btrfs_trans_handle *trans,
4203e968
JB
 2910 struct btrfs_root *log,
 2911 struct btrfs_root_item *root_item)
7237f183 2912{
0b246afa 2913 struct btrfs_fs_info *fs_info = log->fs_info;
7237f183
YZ
 2914 int ret;
 2915
/* Returns the insert/update result from the log root tree. */
 2916 if (log->log_transid == 1) {
 2917 /* insert root item on the first sync */
0b246afa 2918 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
4203e968 2919 &log->root_key, root_item);
7237f183 2920 } else {
0b246afa 2921 ret = btrfs_update_root(trans, fs_info->log_root_tree,
4203e968 2922 &log->root_key, root_item);
7237f183
YZ
 2923 }
 2924 return ret;
 2925}
2926
/*
 * Sleep until the log commit for the given transid, running in another task,
 * has finished (log_transid_committed reaches transid or the commit flag for
 * its slot clears). Called and returns with root->log_mutex held; the mutex
 * is dropped while sleeping and retaken before re-checking.
 */
60d53eb3 2927static void wait_log_commit(struct btrfs_root *root, int transid)
e02119d5
CM
 2928{
 2929 DEFINE_WAIT(wait);
7237f183 2930 int index = transid % 2;
e02119d5 2931
7237f183
YZ
 2932 /*
 2933 * we only allow two pending log transactions at a time,
 2934 * so we know that if ours is more than 2 older than the
 2935 * current transaction, we're done
 2936 */
49e83f57 2937 for (;;) {
7237f183
YZ
 2938 prepare_to_wait(&root->log_commit_wait[index],
 2939 &wait, TASK_UNINTERRUPTIBLE);
12fcfd22 2940
49e83f57
LB
 2941 if (!(root->log_transid_committed < transid &&
 2942 atomic_read(&root->log_commit[index])))
 2943 break;
12fcfd22 2944
49e83f57
LB
 2945 mutex_unlock(&root->log_mutex);
 2946 schedule();
7237f183 2947 mutex_lock(&root->log_mutex);
49e83f57
LB
 2948 }
 2949 finish_wait(&root->log_commit_wait[index], &wait);
7237f183
YZ
 2950}
2951
/*
 * Sleep until there are no more tasks writing to the log tree
 * (root->log_writers drops to zero). Called and returns with
 * root->log_mutex held; drops and retakes it around each sleep.
 */
60d53eb3 2952static void wait_for_writer(struct btrfs_root *root)
7237f183
YZ
 2953{
 2954 DEFINE_WAIT(wait);
8b050d35 2955
49e83f57
LB
 2956 for (;;) {
 2957 prepare_to_wait(&root->log_writer_wait, &wait,
 2958 TASK_UNINTERRUPTIBLE);
 2959 if (!atomic_read(&root->log_writers))
 2960 break;
 2961
7237f183 2962 mutex_unlock(&root->log_mutex);
49e83f57 2963 schedule();
575849ec 2964 mutex_lock(&root->log_mutex);
7237f183 2965 }
49e83f57 2966 finish_wait(&root->log_writer_wait, &wait);
e02119d5
CM
 2967}
2968
8b050d35
MX
/*
 * Unlink a log context from its root's pending-ctx list, under log_mutex.
 * A NULL ctx is a no-op.
 */
2969static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
 2970 struct btrfs_log_ctx *ctx)
 2971{
 2972 if (!ctx)
 2973 return;
 2974
 2975 mutex_lock(&root->log_mutex);
 2976 list_del_init(&ctx->list);
 2977 mutex_unlock(&root->log_mutex);
 2978}
2979
2980/*
2981 * Invoked in log mutex context, or be sure there is no other task which
2982 * can access the list.
2983 */
 2984static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
 2985 int index, int error)
 2986{
 2987 struct btrfs_log_ctx *ctx;
570dd450 2988 struct btrfs_log_ctx *safe;
8b050d35 2989
570dd450
CM
/* Propagate the commit result to every waiter through ctx->log_ret. */
 2990 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
 2991 list_del_init(&ctx->list);
8b050d35 2992 ctx->log_ret = error;
570dd450 2993 }
8b050d35
MX
 2994
 2995 INIT_LIST_HEAD(&root->log_ctxs[index]);
 2996}
2997
e02119d5
CM
2998/*
2999 * btrfs_sync_log does sends a given tree log down to the disk and
3000 * updates the super blocks to record it. When this call is done,
12fcfd22
CM
3001 * you know that any inodes previously logged are safely on disk only
3002 * if it returns 0.
3003 *
3004 * Any other return value means you need to call btrfs_commit_transaction.
3005 * Some of the edge cases for fsyncing directories that have had unlinks
3006 * or renames done in the past mean that sometimes the only safe
3007 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3008 * that has happened.
e02119d5
CM
3009 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
	struct btrfs_root_item new_root_item;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	/* Someone else already committed our log transaction, we're done. */
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	/*
	 * Two log transactions can be in flight at once; each uses one of
	 * the two log_commit slots, selected by transid parity.
	 */
	index1 = log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		/* Another task is committing this transid, just wait for it. */
		wait_log_commit(root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(root, log_transid - 1);

	/*
	 * Let concurrent writers drain and the batch counter settle so we
	 * pick up as many queued log items as possible in one commit.
	 */
	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(fs_info, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(trans)) {
		ret = -EAGAIN;
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/* Each of the two in-flight log transids has its own extent mark. */
	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
	if (ret) {
		blk_finish_plug(&plug);
		btrfs_abort_transaction(trans, ret);
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/*
	 * We _must_ update under the root->log_mutex in order to make sure we
	 * have a consistent view of the log root we are trying to commit at
	 * this moment.
	 *
	 * We _must_ copy this into a local copy, because we are not holding the
	 * log_root_tree->log_mutex yet. This is important because when we
	 * commit the log_root_tree we must have a consistent view of the
	 * log_root_tree when we update the super block to point at the
	 * log_root_tree bytenr. If we update the log_root_tree here we'll race
	 * with the commit and possibly point at the new block which we may not
	 * have written out.
	 */
	btrfs_set_root_node(&log->root_item, log->node);
	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	btrfs_init_log_ctx(&root_log_ctx, NULL);

	/*
	 * Second phase: commit the log root tree. The same slot/ctx dance
	 * repeats at the log_root_tree level.
	 */
	mutex_lock(&log_root_tree->log_mutex);

	index2 = log_root_tree->log_transid % 2;
	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
	root_log_ctx.log_transid = log_root_tree->log_transid;

	/*
	 * Now we are safe to update the log_root_tree because we're under the
	 * log_mutex, and we're a current writer so we're holding the commit
	 * open until we drop the log_mutex.
	 */
	ret = update_log_root(trans, log, &new_root_item);
	if (ret) {
		if (!list_empty(&root_log_ctx.list))
			list_del_init(&root_log_ctx.list);

		blk_finish_plug(&plug);
		btrfs_set_log_full_commit(trans);

		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		/* -ENOSPC: wait for our IO and fall back to a full commit. */
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out;
	}

	/* The log root tree commit we joined already finished. */
	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
		blk_finish_plug(&plug);
		list_del_init(&root_log_ctx.list);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = root_log_ctx.log_ret;
		goto out;
	}

	index2 = root_log_ctx.log_transid % 2;
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		/* Another task commits the log root tree; wait for both IOs. */
		blk_finish_plug(&plug);
		ret = btrfs_wait_tree_log_extents(log, mark);
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		if (!ret)
			ret = root_log_ctx.log_ret;
		goto out;
	}
	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid - 1);
	}

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (btrfs_need_log_full_commit(trans)) {
		blk_finish_plug(&plug);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out_wake_log_root;
	}

	ret = btrfs_write_marked_extents(fs_info,
					 &log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	/* Now wait for both the log tree and log root tree IO to finish. */
	ret = btrfs_wait_tree_log_extents(log, mark);
	if (!ret)
		ret = btrfs_wait_tree_log_extents(log_root_tree,
						  EXTENT_NEW | EXTENT_DIRTY);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}

	/* Point the super block copy at the freshly written log root tree. */
	btrfs_set_super_log_root(fs_info->super_for_commit,
				 log_root_tree->node->start);
	btrfs_set_super_log_root_level(fs_info->super_for_commit,
				       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * Nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us.  We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
	ret = write_all_supers(fs_info, 1);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	/* Propagate the result to log root tree waiters and free the slot. */
	mutex_lock(&log_root_tree->log_mutex);
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	/* Propagate the result to this root's waiters and free our slot. */
	mutex_lock(&root->log_mutex);
	btrfs_remove_all_log_ctxs(root, index1, ret);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&root->log_commit_wait[index1]);
	return ret;
}
3257
4a500fd1
YZ
3258static void free_log_tree(struct btrfs_trans_handle *trans,
3259 struct btrfs_root *log)
e02119d5
CM
3260{
3261 int ret;
e02119d5
CM
3262 struct walk_control wc = {
3263 .free = 1,
3264 .process_func = process_one_buffer
3265 };
3266
681ae509 3267 ret = walk_log_tree(trans, log, &wc);
374b0e2d
JM
3268 if (ret) {
3269 if (trans)
3270 btrfs_abort_transaction(trans, ret);
3271 else
3272 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3273 }
e02119d5 3274
59b0713a
FM
3275 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3276 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
e289f03e 3277 extent_io_tree_release(&log->log_csum_range);
00246528 3278 btrfs_put_root(log);
4a500fd1
YZ
3279}
3280
3281/*
3282 * free all the extents used by the tree log. This should be called
3283 * at commit time of the full transaction
3284 */
3285int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3286{
3287 if (root->log_root) {
3288 free_log_tree(trans, root->log_root);
3289 root->log_root = NULL;
e7a79811 3290 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
4a500fd1
YZ
3291 }
3292 return 0;
3293}
3294
3295int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3296 struct btrfs_fs_info *fs_info)
3297{
3298 if (fs_info->log_root_tree) {
3299 free_log_tree(trans, fs_info->log_root_tree);
3300 fs_info->log_root_tree = NULL;
3301 }
e02119d5
CM
3302 return 0;
3303}
3304
803f0f64
FM
3305/*
3306 * Check if an inode was logged in the current transaction. We can't always rely
3307 * on an inode's logged_trans value, because it's an in-memory only field and
3308 * therefore not persisted. This means that its value is lost if the inode gets
3309 * evicted and loaded again from disk (in which case it has a value of 0, and
3310 * certainly it is smaller then any possible transaction ID), when that happens
3311 * the full_sync flag is set in the inode's runtime flags, so on that case we
3312 * assume eviction happened and ignore the logged_trans value, assuming the
3313 * worst case, that the inode was logged before in the current transaction.
3314 */
3315static bool inode_logged(struct btrfs_trans_handle *trans,
3316 struct btrfs_inode *inode)
3317{
3318 if (inode->logged_trans == trans->transid)
3319 return true;
3320
3321 if (inode->last_trans == trans->transid &&
3322 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3323 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3324 return true;
3325
3326 return false;
3327}
3328
e02119d5
CM
3329/*
3330 * If both a file and directory are logged, and unlinks or renames are
3331 * mixed in, we have a few interesting corners:
3332 *
3333 * create file X in dir Y
3334 * link file X to X.link in dir Y
3335 * fsync file X
3336 * unlink file X but leave X.link
3337 * fsync dir Y
3338 *
3339 * After a crash we would expect only X.link to exist. But file X
3340 * didn't get fsync'd again so the log has back refs for X and X.link.
3341 *
3342 * We solve this by removing directory entries and inode backrefs from the
3343 * log when a file that was logged in the current transaction is
3344 * unlinked. Any later fsync will include the updated log entries, and
3345 * we'll be able to reconstruct the proper directory items from backrefs.
3346 *
3347 * This optimizations allows us to avoid relogging the entire inode
3348 * or the entire directory.
3349 */
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct btrfs_inode *dir, u64 index)
{
	struct btrfs_root *log;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	int ret;
	int err = 0;
	/* Total name bytes removed, later subtracted from the dir's isize. */
	int bytes_del = 0;
	u64 dir_ino = btrfs_ino(dir);

	/* Nothing to remove if the directory was never logged. */
	if (!inode_logged(trans, dir))
		return 0;

	/* No running log transaction means there is no log to clean up. */
	ret = join_running_log_trans(root);
	if (ret)
		return 0;

	mutex_lock(&dir->log_mutex);

	log = root->log_root;
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}

	/* Remove the DIR_ITEM entry for this name, if it was logged. */
	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	btrfs_release_path(path);
	/* And the matching DIR_INDEX entry. */
	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
					 index, name, name_len, -1);
	if (IS_ERR(di)) {
		err = PTR_ERR(di);
		goto fail;
	}
	if (di) {
		ret = btrfs_delete_one_dir_name(trans, log, path, di);
		bytes_del += name_len;
		if (ret) {
			err = ret;
			goto fail;
		}
	}

	/* update the directory size in the log to reflect the names
	 * we have removed
	 */
	if (bytes_del) {
		struct btrfs_key key;

		key.objectid = dir_ino;
		key.offset = 0;
		key.type = BTRFS_INODE_ITEM_KEY;
		btrfs_release_path(path);

		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (ret == 0) {
			struct btrfs_inode_item *item;
			u64 i_size;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			i_size = btrfs_inode_size(path->nodes[0], item);
			/* Clamp at zero, never underflow the logged isize. */
			if (i_size > bytes_del)
				i_size -= bytes_del;
			else
				i_size = 0;
			btrfs_set_inode_size(path->nodes[0], item, i_size);
			btrfs_mark_buffer_dirty(path->nodes[0]);
		} else
			/* No inode item in the log is fine, nothing to adjust. */
			ret = 0;
		btrfs_release_path(path);
	}
fail:
	btrfs_free_path(path);
out_unlock:
	mutex_unlock(&dir->log_mutex);
	if (err == -ENOSPC) {
		/* Out of log space: force a full commit instead of failing. */
		btrfs_set_log_full_commit(trans);
		err = 0;
	} else if (err < 0 && err != -ENOENT) {
		/* ENOENT can be returned if the entry hasn't been fsynced yet */
		btrfs_abort_transaction(trans, err);
	}

	btrfs_end_log_trans(root);

	return err;
}
3458
3459/* see comments for btrfs_del_dir_entries_in_log */
3460int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3461 struct btrfs_root *root,
3462 const char *name, int name_len,
a491abb2 3463 struct btrfs_inode *inode, u64 dirid)
e02119d5
CM
3464{
3465 struct btrfs_root *log;
3466 u64 index;
3467 int ret;
3468
803f0f64 3469 if (!inode_logged(trans, inode))
3a5f1d45
CM
3470 return 0;
3471
e02119d5
CM
3472 ret = join_running_log_trans(root);
3473 if (ret)
3474 return 0;
3475 log = root->log_root;
a491abb2 3476 mutex_lock(&inode->log_mutex);
e02119d5 3477
a491abb2 3478 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
e02119d5 3479 dirid, &index);
a491abb2 3480 mutex_unlock(&inode->log_mutex);
4a500fd1 3481 if (ret == -ENOSPC) {
90787766 3482 btrfs_set_log_full_commit(trans);
4a500fd1 3483 ret = 0;
79787eaa 3484 } else if (ret < 0 && ret != -ENOENT)
66642832 3485 btrfs_abort_transaction(trans, ret);
12fcfd22 3486 btrfs_end_log_trans(root);
e02119d5 3487
e02119d5
CM
3488 return ret;
3489}
3490
3491/*
3492 * creates a range item in the log for 'dirid'. first_offset and
3493 * last_offset tell us which parts of the key space the log should
3494 * be considered authoritative for.
3495 */
3496static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3497 struct btrfs_root *log,
3498 struct btrfs_path *path,
3499 int key_type, u64 dirid,
3500 u64 first_offset, u64 last_offset)
3501{
3502 int ret;
3503 struct btrfs_key key;
3504 struct btrfs_dir_log_item *item;
3505
3506 key.objectid = dirid;
3507 key.offset = first_offset;
3508 if (key_type == BTRFS_DIR_ITEM_KEY)
3509 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3510 else
3511 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3512 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
4a500fd1
YZ
3513 if (ret)
3514 return ret;
e02119d5
CM
3515
3516 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3517 struct btrfs_dir_log_item);
3518 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3519 btrfs_mark_buffer_dirty(path->nodes[0]);
b3b4aa74 3520 btrfs_release_path(path);
e02119d5
CM
3521 return 0;
3522}
3523
3524/*
3525 * log all the items included in the current transaction for a given
3526 * directory. This also creates the range items in the log tree required
3527 * to replay anything deleted before the fsync
3528 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path, int key_type,
			  struct btrfs_log_ctx *ctx,
			  u64 min_offset, u64 *last_offset_ret)
{
	struct btrfs_key min_key;
	struct btrfs_root *log = root->log_root;
	struct extent_buffer *src;
	int err = 0;
	int ret;
	int i;
	int nritems;
	/* Range [first_offset, last_offset] the log becomes authoritative for. */
	u64 first_offset = min_offset;
	u64 last_offset = (u64)-1;
	u64 ino = btrfs_ino(inode);

	log = root->log_root;

	min_key.objectid = ino;
	min_key.type = key_type;
	min_key.offset = min_offset;

	/* Find the first dir item of this type changed in this transaction. */
	ret = btrfs_search_forward(root, &min_key, path, trans->transid);

	/*
	 * we didn't find anything from this transaction, see if there
	 * is anything at all
	 */
	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
		min_key.objectid = ino;
		min_key.type = key_type;
		min_key.offset = (u64)-1;
		btrfs_release_path(path);
		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
		if (ret < 0) {
			btrfs_release_path(path);
			return ret;
		}
		ret = btrfs_previous_item(root, path, ino, key_type);

		/* if ret == 0 there are items for this type,
		 * create a range to tell us the last key of this type.
		 * otherwise, there are no items in this directory after
		 * *min_offset, and we create a range to indicate that.
		 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			/*
			 * Extend our authoritative range back to the previous
			 * item and copy it into the log too, so replay knows
			 * the gap between it and our first new item is real.
			 */
			first_offset = tmp.offset;
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret) {
				err = ret;
				goto done;
			}
		}
	}
	btrfs_release_path(path);

	/*
	 * Find the first key from this transaction again.  See the note for
	 * log_new_dir_dentries, if we're logging a directory recursively we
	 * won't be holding its i_mutex, which means we can modify the directory
	 * while we're logging it.  If we remove an entry between our first
	 * search and this search we'll not find the key again and can just
	 * bail.
	 */
search:
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0)
		goto done;

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			struct btrfs_dir_item *di;

			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;

			/*
			 * Drop the path and restart the search at min_key
			 * (updated above) so we never hog the CPU while
			 * walking a large directory.
			 */
			if (need_resched()) {
				btrfs_release_path(path);
				cond_resched();
				goto search;
			}

			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}

			/*
			 * We must make sure that when we log a directory entry,
			 * the corresponding inode, after log replay, has a
			 * matching link count. For example:
			 *
			 * touch foo
			 * mkdir mydir
			 * sync
			 * ln foo mydir/bar
			 * xfs_io -c "fsync" mydir
			 * <crash>
			 * <mount fs and log replay>
			 *
			 * Would result in a fsync log that when replayed, our
			 * file inode would have a link count of 1, but we get
			 * two directory entries pointing to the same inode.
			 * After removing one of the names, it would not be
			 * possible to remove the other name, which resulted
			 * always in stale file handle errors, and would not
			 * be possible to rmdir the parent directory, since
			 * its i_size could never decrement to the value
			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
			 */
			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
			btrfs_dir_item_key_to_cpu(src, di, &tmp);
			if (ctx &&
			    (btrfs_dir_transid(src, di) == trans->transid ||
			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
			    tmp.type != BTRFS_ROOT_ITEM_KEY)
				ctx->log_new_dentries = true;
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret) {
			if (ret == 1)
				last_offset = (u64)-1;
			else
				err = ret;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			/*
			 * The next leaf is from an older transaction: log its
			 * first item as the range boundary and stop here.
			 */
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}
3724
3725/*
3726 * logging directories is very similar to logging inodes, We find all the items
3727 * from the current transaction and write them to the log.
3728 *
3729 * The recovery code scans the directory in the subvolume, and if it finds a
3730 * key in the range logged that is not present in the log tree, then it means
3731 * that dir entry was unlinked during the transaction.
3732 *
3733 * In order for that scan to work, we must include one key smaller than
3734 * the smallest logged by this transaction and one key larger than the largest
3735 * key logged by this transaction.
3736 */
3737static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
dbf39ea4 3738 struct btrfs_root *root, struct btrfs_inode *inode,
e02119d5 3739 struct btrfs_path *path,
2f2ff0ee
FM
3740 struct btrfs_path *dst_path,
3741 struct btrfs_log_ctx *ctx)
e02119d5
CM
3742{
3743 u64 min_key;
3744 u64 max_key;
3745 int ret;
3746 int key_type = BTRFS_DIR_ITEM_KEY;
3747
3748again:
3749 min_key = 0;
3750 max_key = 0;
d397712b 3751 while (1) {
dbf39ea4
NB
3752 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3753 ctx, min_key, &max_key);
4a500fd1
YZ
3754 if (ret)
3755 return ret;
e02119d5
CM
3756 if (max_key == (u64)-1)
3757 break;
3758 min_key = max_key + 1;
3759 }
3760
3761 if (key_type == BTRFS_DIR_ITEM_KEY) {
3762 key_type = BTRFS_DIR_INDEX_KEY;
3763 goto again;
3764 }
3765 return 0;
3766}
3767
3768/*
3769 * a helper function to drop items from the log before we relog an
3770 * inode. max_key_type indicates the highest item type to remove.
3771 * This cannot be run for file data extents because it does not
3772 * free the extents they point to.
3773 */
static int drop_objectid_items(struct btrfs_trans_handle *trans,
			       struct btrfs_root *log,
			       struct btrfs_path *path,
			       u64 objectid, int max_key_type)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int start_slot;

	/* Start just past the highest possible key of max_key_type. */
	key.objectid = objectid;
	key.type = max_key_type;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
		BUG_ON(ret == 0); /* Logic error */
		if (ret < 0)
			break;

		/* Nothing before our search key in this leaf, we're done. */
		if (path->slots[0] == 0)
			break;

		/* Step back onto the last item at or below our key. */
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);

		if (found_key.objectid != objectid)
			break;

		/*
		 * Find the first slot in this leaf with our objectid (offset
		 * and type zeroed gives the smallest possible key for it), so
		 * we can delete the whole run in one btrfs_del_items call.
		 */
		found_key.offset = 0;
		found_key.type = 0;
		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
		if (ret < 0)
			break;

		ret = btrfs_del_items(trans, log, path, start_slot,
				      path->slots[0] - start_slot + 1);
		/*
		 * If start slot isn't 0 then we don't need to re-search, we've
		 * found the last guy with the objectid in this tree.
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	/* A positive btrfs_bin_search result just means "not found exactly". */
	if (ret > 0)
		ret = 0;
	return ret;
}
3825
94edf4ae
JB
3826static void fill_inode_item(struct btrfs_trans_handle *trans,
3827 struct extent_buffer *leaf,
3828 struct btrfs_inode_item *item,
1a4bcf47
FM
3829 struct inode *inode, int log_inode_only,
3830 u64 logged_isize)
94edf4ae 3831{
0b1c6cca
JB
3832 struct btrfs_map_token token;
3833
c82f823c 3834 btrfs_init_map_token(&token, leaf);
94edf4ae
JB
3835
3836 if (log_inode_only) {
3837 /* set the generation to zero so the recover code
3838 * can tell the difference between an logging
3839 * just to say 'this inode exists' and a logging
3840 * to say 'update this inode with these values'
3841 */
cc4c13d5
DS
3842 btrfs_set_token_inode_generation(&token, item, 0);
3843 btrfs_set_token_inode_size(&token, item, logged_isize);
94edf4ae 3844 } else {
cc4c13d5
DS
3845 btrfs_set_token_inode_generation(&token, item,
3846 BTRFS_I(inode)->generation);
3847 btrfs_set_token_inode_size(&token, item, inode->i_size);
0b1c6cca
JB
3848 }
3849
cc4c13d5
DS
3850 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3851 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3852 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3853 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3854
3855 btrfs_set_token_timespec_sec(&token, &item->atime,
3856 inode->i_atime.tv_sec);
3857 btrfs_set_token_timespec_nsec(&token, &item->atime,
3858 inode->i_atime.tv_nsec);
3859
3860 btrfs_set_token_timespec_sec(&token, &item->mtime,
3861 inode->i_mtime.tv_sec);
3862 btrfs_set_token_timespec_nsec(&token, &item->mtime,
3863 inode->i_mtime.tv_nsec);
3864
3865 btrfs_set_token_timespec_sec(&token, &item->ctime,
3866 inode->i_ctime.tv_sec);
3867 btrfs_set_token_timespec_nsec(&token, &item->ctime,
3868 inode->i_ctime.tv_nsec);
3869
3870 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3871
3872 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3873 btrfs_set_token_inode_transid(&token, item, trans->transid);
3874 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3875 btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3876 btrfs_set_token_inode_block_group(&token, item, 0);
94edf4ae
JB
3877}
3878
a95249b3
JB
3879static int log_inode_item(struct btrfs_trans_handle *trans,
3880 struct btrfs_root *log, struct btrfs_path *path,
6d889a3b 3881 struct btrfs_inode *inode)
a95249b3
JB
3882{
3883 struct btrfs_inode_item *inode_item;
a95249b3
JB
3884 int ret;
3885
efd0c405 3886 ret = btrfs_insert_empty_item(trans, log, path,
6d889a3b 3887 &inode->location, sizeof(*inode_item));
a95249b3
JB
3888 if (ret && ret != -EEXIST)
3889 return ret;
3890 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3891 struct btrfs_inode_item);
6d889a3b
NB
3892 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3893 0, 0);
a95249b3
JB
3894 btrfs_release_path(path);
3895 return 0;
3896}
3897
/*
 * Insert the checksum items in @sums into the log tree @log_root,
 * taking care of ranges that may overlap previously logged csum items
 * due to extent cloning/reflink on @inode.
 */
static int log_csums(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode,
		     struct btrfs_root *log_root,
		     struct btrfs_ordered_sum *sums)
{
	const u64 lock_end = sums->bytenr + sums->len - 1;
	struct extent_state *cached_state = NULL;
	int ret;

	/*
	 * If this inode was not used for reflink operations in the current
	 * transaction with new extents, then do the fast path, no need to
	 * worry about logging checksum items with overlapping ranges.
	 */
	if (inode->last_reflink_trans < trans->transid)
		return btrfs_csum_file_blocks(trans, log_root, sums);

	/*
	 * Serialize logging for checksums. This is to avoid racing with the
	 * same checksum being logged by another task that is logging another
	 * file which happens to refer to the same extent as well. Such races
	 * can leave checksum items in the log with overlapping ranges.
	 */
	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
			       lock_end, &cached_state);
	if (ret)
		return ret;
	/*
	 * Due to extent cloning, we might have logged a csum item that covers a
	 * subrange of a cloned extent, and later we can end up logging a csum
	 * item for a larger subrange of the same extent or the entire range.
	 * This would leave csum items in the log tree that cover the same range
	 * and break the searches for checksums in the log tree, resulting in
	 * some checksums missing in the fs/subvolume tree. So just delete (or
	 * trim and adjust) any existing csum items in the log for this range.
	 */
	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
	if (!ret)
		ret = btrfs_csum_file_blocks(trans, log_root, sums);

	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
			     &cached_state);

	return ret;
}
3943
/*
 * Copy @nr leaf items starting at @start_slot from @src_path (a leaf in the
 * fs/subvolume tree) into the inode's log tree via @dst_path. Inode items are
 * re-filled from the in-memory inode (honouring @inode_only/@logged_isize);
 * all other items are raw-copied. For regular (non-compressed-hole) file
 * extent items created in this transaction, the corresponding data checksums
 * are looked up and logged as well, so truncates or deletes of this inode
 * don't have to relog the inode again.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path,
			       int start_slot, int nr, int inode_only,
			       u64 logged_isize)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = inode->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *src = src_path->nodes[0];
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
	char *ins_data;
	int i;
	struct list_head ordered_sums;
	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;

	INIT_LIST_HEAD(&ordered_sums);

	/* One buffer holds both the sizes array and the keys array. */
	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
			   nr * sizeof(u32), GFP_NOFS);
	if (!ins_data)
		return -ENOMEM;

	ins_sizes = (u32 *)ins_data;
	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));

	for (i = 0; i < nr; i++) {
		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
	}
	/* Reserve all destination items in the log leaf in one batch. */
	ret = btrfs_insert_empty_items(trans, log, dst_path,
				       ins_keys, ins_sizes, nr);
	if (ret) {
		kfree(ins_data);
		return ret;
	}

	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
						   dst_path->slots[0]);

		src_offset = btrfs_item_ptr_offset(src, start_slot + i);

		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
			inode_item = btrfs_item_ptr(dst_path->nodes[0],
						    dst_path->slots[0],
						    struct btrfs_inode_item);
			fill_inode_item(trans, dst_path->nodes[0], inode_item,
					&inode->vfs_inode,
					inode_only == LOG_INODE_EXISTS,
					logged_isize);
		} else {
			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
					   src_offset, ins_sizes[i]);
		}

		/* take a reference on file data extents so that truncates
		 * or deletes of this inode don't have to relog the inode
		 * again
		 */
		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
		    !skip_csum) {
			int found_type;
			extent = btrfs_item_ptr(src, start_slot + i,
						struct btrfs_file_extent_item);

			/* Extents from past transactions already have csums. */
			if (btrfs_file_extent_generation(src, extent) < trans->transid)
				continue;

			found_type = btrfs_file_extent_type(src, extent);
			if (found_type == BTRFS_FILE_EXTENT_REG) {
				u64 ds, dl, cs, cl;
				ds = btrfs_file_extent_disk_bytenr(src,
								extent);
				/* ds == 0 is a hole */
				if (ds == 0)
					continue;

				dl = btrfs_file_extent_disk_num_bytes(src,
								extent);
				cs = btrfs_file_extent_offset(src, extent);
				cl = btrfs_file_extent_num_bytes(src,
								 extent);
				/*
				 * Compressed extents are checksummed over the
				 * whole on-disk extent, not the file range.
				 */
				if (btrfs_file_extent_compression(src,
								  extent)) {
					cs = 0;
					cl = dl;
				}

				ret = btrfs_lookup_csums_range(
						fs_info->csum_root,
						ds + cs, ds + cs + cl - 1,
						&ordered_sums, 0);
				if (ret)
					break;
			}
		}
	}

	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
	btrfs_release_path(dst_path);
	kfree(ins_data);

	/*
	 * we have to do this after the loop above to avoid changing the
	 * log tree while trying to change the log tree.
	 */
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		/*
		 * If an earlier error occurred, still drain and free the list
		 * but skip logging further csums.
		 */
		if (!ret)
			ret = log_csums(trans, inode, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}

	return ret;
}
4069
5dc562c5
JB
4070static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
4071{
4072 struct extent_map *em1, *em2;
4073
4074 em1 = list_entry(a, struct extent_map, list);
4075 em2 = list_entry(b, struct extent_map, list);
4076
4077 if (em1->start < em2->start)
4078 return -1;
4079 else if (em1->start > em2->start)
4080 return 1;
4081 return 0;
4082}
4083
e7175a69
JB
/*
 * Log the data checksums for the range of @em that was modified in this
 * transaction. Checksums already attached to still-pending ordered extents
 * (tracked in @ctx->ordered_extents) are copied directly from the ordered
 * extents; whatever part of the modified range is left is looked up in the
 * csum tree. Returns 0 on success or a negative errno.
 */
static int log_extent_csums(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *inode,
			    struct btrfs_root *log_root,
			    const struct extent_map *em,
			    struct btrfs_log_ctx *ctx)
{
	struct btrfs_ordered_extent *ordered;
	u64 csum_offset;
	u64 csum_len;
	u64 mod_start = em->mod_start;
	u64 mod_len = em->mod_len;
	LIST_HEAD(ordered_sums);
	int ret = 0;

	/* No csums for nodatasum inodes, prealloc extents or holes. */
	if (inode->flags & BTRFS_INODE_NODATASUM ||
	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    em->block_start == EXTENT_MAP_HOLE)
		return 0;

	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
		const u64 mod_end = mod_start + mod_len;
		struct btrfs_ordered_sum *sums;

		if (mod_len == 0)
			break;

		if (ordered_end <= mod_start)
			continue;
		if (mod_end <= ordered->file_offset)
			break;

		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this ordered
		 * extent has already been logged.
		 */
		if (ordered->file_offset > mod_start) {
			if (ordered_end >= mod_end)
				mod_len = ordered->file_offset - mod_start;
			/*
			 * If we have this case
			 *
			 * |--------- logged extent ---------|
			 *       |----- ordered extent ----|
			 *
			 * Just don't mess with mod_start and mod_len, we'll
			 * just end up logging more csums than we need and it
			 * will be ok.
			 */
		} else {
			if (ordered_end < mod_end) {
				mod_len = mod_end - ordered_end;
				mod_start = ordered_end;
			} else {
				mod_len = 0;
			}
		}

		/*
		 * To keep us from looping for the above case of an ordered
		 * extent that falls inside of the logged extent.
		 */
		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
			continue;

		list_for_each_entry(sums, &ordered->list, list) {
			ret = log_csums(trans, inode, log_root, sums);
			if (ret)
				return ret;
		}
	}

	/* We're done, found all csums in the ordered extents. */
	if (mod_len == 0)
		return 0;

	/* If we're compressed we have to save the entire range of csums. */
	if (em->compress_type) {
		csum_offset = 0;
		csum_len = max(em->block_len, em->orig_block_len);
	} else {
		csum_offset = mod_start - em->start;
		csum_len = mod_len;
	}

	/* block start is already adjusted for the file extent offset. */
	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
				       em->block_start + csum_offset,
				       em->block_start + csum_offset +
				       csum_len - 1, &ordered_sums, 0);
	if (ret)
		return ret;

	/* Drain the list, logging each set of csums and freeing it. */
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
		if (!ret)
			ret = log_csums(trans, inode, log_root, sums);
		list_del(&sums->list);
		kfree(sums);
	}

	return ret;
}
4190
/*
 * Write one file extent item for @em into the inode's log tree: log its data
 * checksums, drop any conflicting file extent items in the logged range, then
 * insert (or reuse) the item slot and fill it from the extent map.
 * Returns 0 on success or a negative errno.
 */
static int log_one_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_inode *inode, struct btrfs_root *root,
			  const struct extent_map *em,
			  struct btrfs_path *path,
			  struct btrfs_log_ctx *ctx)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *log = root->log_root;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_map_token token;
	struct btrfs_key key;
	u64 extent_offset = em->start - em->orig_start;
	u64 block_len;
	int ret;

	ret = log_extent_csums(trans, inode, log, em, ctx);
	if (ret)
		return ret;

	/*
	 * Drop any extent items in the log covering [em->start, em->start +
	 * em->len) and ask btrfs_drop_extents() to leave room for the
	 * replacement item if it can.
	 */
	drop_args.path = path;
	drop_args.start = em->start;
	drop_args.end = em->start + em->len;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = sizeof(*fi);
	ret = btrfs_drop_extents(trans, log, inode, &drop_args);
	if (ret)
		return ret;

	/* Slot was not reused by the drop above, insert a fresh item. */
	if (!drop_args.extent_inserted) {
		key.objectid = btrfs_ino(inode);
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = em->start;

		ret = btrfs_insert_empty_item(trans, log, path, &key,
					      sizeof(*fi));
		if (ret)
			return ret;
	}
	leaf = path->nodes[0];
	btrfs_init_map_token(&token, leaf);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		btrfs_set_token_file_extent_type(&token, fi,
						 BTRFS_FILE_EXTENT_PREALLOC);
	else
		btrfs_set_token_file_extent_type(&token, fi,
						 BTRFS_FILE_EXTENT_REG);

	block_len = max(em->block_len, em->orig_block_len);
	if (em->compress_type != BTRFS_COMPRESS_NONE) {
		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
							em->block_start);
		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
							em->block_start -
							extent_offset);
		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
	} else {
		/* Hole or inline extent: no on-disk data extent to point at. */
		btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
	}

	btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
	btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
	btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
	btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
	btrfs_set_token_file_extent_encryption(&token, fi, 0);
	btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
	btrfs_mark_buffer_dirty(leaf);

	btrfs_release_path(path);

	return ret;
}
4270
31d11b83
FM
4271/*
4272 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4273 * lose them after doing a fast fsync and replaying the log. We scan the
4274 * subvolume's root instead of iterating the inode's extent map tree because
4275 * otherwise we can log incorrect extent items based on extent map conversion.
4276 * That can happen due to the fact that extent maps are merged when they
4277 * are not in the extent map tree's list of modified extents.
4278 */
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
				      struct btrfs_inode *inode,
				      struct btrfs_path *path)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_key key;
	const u64 i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *dst_path = NULL;
	bool dropped_extents = false;
	u64 truncate_offset = i_size;
	struct extent_buffer *leaf;
	int slot;
	int ins_nr = 0;
	int start_slot;
	int ret;

	/* Nothing to do unless the inode has preallocated extents. */
	if (!(inode->flags & BTRFS_INODE_PREALLOC))
		return 0;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = i_size;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/*
	 * We must check if there is a prealloc extent that starts before the
	 * i_size and crosses the i_size boundary. This is to ensure later we
	 * truncate down to the end of that extent and not to the i_size, as
	 * otherwise we end up losing part of the prealloc extent after a log
	 * replay and with an implicit hole if there is another prealloc extent
	 * that starts at an offset beyond i_size.
	 */
	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
	if (ret < 0)
		goto out;

	if (ret == 0) {
		struct btrfs_file_extent_item *ei;

		leaf = path->nodes[0];
		slot = path->slots[0];
		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

		if (btrfs_file_extent_type(leaf, ei) ==
		    BTRFS_FILE_EXTENT_PREALLOC) {
			u64 extent_end;

			btrfs_item_key_to_cpu(leaf, &key, slot);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, ei);

			if (extent_end > i_size)
				truncate_offset = extent_end;
		}
	} else {
		ret = 0;
	}

	/* Walk all extent items at or beyond i_size and batch-copy them. */
	while (true) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(leaf)) {
			/* Flush the pending batch before leaving this leaf. */
			if (ins_nr > 0) {
				ret = copy_items(trans, inode, dst_path, path,
						 start_slot, ins_nr, 1, 0);
				if (ret < 0)
					goto out;
				ins_nr = 0;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				ret = 0;
				break;
			}
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY ||
		    key.offset < i_size) {
			path->slots[0]++;
			continue;
		}
		if (!dropped_extents) {
			/*
			 * Avoid logging extent items logged in past fsync calls
			 * and leading to duplicate keys in the log tree.
			 */
			do {
				ret = btrfs_truncate_inode_items(trans,
							 root->log_root,
							 inode, truncate_offset,
							 BTRFS_EXTENT_DATA_KEY);
			} while (ret == -EAGAIN);
			if (ret)
				goto out;
			dropped_extents = true;
		}
		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		/* Allocate the destination path lazily, only when needed. */
		if (!dst_path) {
			dst_path = btrfs_alloc_path();
			if (!dst_path) {
				ret = -ENOMEM;
				goto out;
			}
		}
	}
	/* Flush any batch left over after the walk. */
	if (ins_nr > 0)
		ret = copy_items(trans, inode, dst_path, path,
				 start_slot, ins_nr, 1, 0);
out:
	btrfs_release_path(path);
	btrfs_free_path(dst_path);
	return ret;
}
4406
5dc562c5
JB
/*
 * Log all extent maps modified in the current transaction for @inode: collect
 * them from the inode's modified_extents list (under the extent map tree
 * lock), sort them by file offset and log each one, then log prealloc extents
 * beyond i_size and flag still-incomplete ordered extents so the transaction
 * commit waits for them. Returns 0 on success, a negative errno on failure
 * (-EFBIG forces the caller to fall back to a transaction commit).
 */
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_inode *inode,
				     struct btrfs_path *path,
				     struct btrfs_log_ctx *ctx)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_extent *tmp;
	struct extent_map *em, *n;
	struct list_head extents;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret = 0;
	int num = 0;

	INIT_LIST_HEAD(&extents);

	write_lock(&tree->lock);

	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		list_del_init(&em->list);
		/*
		 * Just an arbitrary number, this can be really CPU intensive
		 * once we start getting a lot of extents, and really once we
		 * have a bunch of extents we just want to commit since it will
		 * be faster.
		 */
		if (++num > 32768) {
			list_del_init(&tree->modified_extents);
			ret = -EFBIG;
			goto process;
		}

		if (em->generation < trans->transid)
			continue;

		/* We log prealloc extents beyond eof later. */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
		    em->start >= i_size_read(&inode->vfs_inode))
			continue;

		/* Need a ref to keep it from getting evicted from cache */
		refcount_inc(&em->refs);
		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		list_add_tail(&em->list, &extents);
		num++;
	}

	list_sort(NULL, &extents, extent_cmp);
process:
	while (!list_empty(&extents)) {
		em = list_entry(extents.next, struct extent_map, list);

		list_del_init(&em->list);

		/*
		 * If we had an error we just need to delete everybody from our
		 * private list.
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		/* Drop the tree lock while doing tree operations on the em. */
		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	btrfs_release_path(path);
	if (!ret)
		ret = btrfs_log_prealloc_extents(trans, inode, path);
	if (ret)
		return ret;

	/*
	 * We have logged all extents successfully, now make sure the commit of
	 * the current transaction waits for the ordered extents to complete
	 * before it commits and wipes out the log trees, otherwise we would
	 * lose data if an ordered extents completes after the transaction
	 * commits and a power failure happens after the transaction commit.
	 */
	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
		list_del_init(&ordered->log_list);
		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);

		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
			/* Re-check under the lock to close the race window. */
			spin_lock_irq(&inode->ordered_tree.lock);
			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
				atomic_inc(&trans->transaction->pending_ordered);
			}
			spin_unlock_irq(&inode->ordered_tree.lock);
		}
		btrfs_put_ordered_extent(ordered);
	}

	return 0;
}
4511
481b01c0 4512static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
1a4bcf47
FM
4513 struct btrfs_path *path, u64 *size_ret)
4514{
4515 struct btrfs_key key;
4516 int ret;
4517
481b01c0 4518 key.objectid = btrfs_ino(inode);
1a4bcf47
FM
4519 key.type = BTRFS_INODE_ITEM_KEY;
4520 key.offset = 0;
4521
4522 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4523 if (ret < 0) {
4524 return ret;
4525 } else if (ret > 0) {
2f2ff0ee 4526 *size_ret = 0;
1a4bcf47
FM
4527 } else {
4528 struct btrfs_inode_item *item;
4529
4530 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4531 struct btrfs_inode_item);
4532 *size_ret = btrfs_inode_size(path->nodes[0], item);
bf504110
FM
4533 /*
4534 * If the in-memory inode's i_size is smaller then the inode
4535 * size stored in the btree, return the inode's i_size, so
4536 * that we get a correct inode size after replaying the log
4537 * when before a power failure we had a shrinking truncate
4538 * followed by addition of a new name (rename / new hard link).
4539 * Otherwise return the inode size from the btree, to avoid
4540 * data loss when replaying a log due to previously doing a
4541 * write that expands the inode's size and logging a new name
4542 * immediately after.
4543 */
4544 if (*size_ret > inode->vfs_inode.i_size)
4545 *size_ret = inode->vfs_inode.i_size;
1a4bcf47
FM
4546 }
4547
4548 btrfs_release_path(path);
4549 return 0;
4550}
4551
36283bf7
FM
4552/*
4553 * At the moment we always log all xattrs. This is to figure out at log replay
4554 * time which xattrs must have their deletion replayed. If a xattr is missing
4555 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4556 * because if a xattr is deleted, the inode is fsynced and a power failure
4557 * happens, causing the log to be replayed the next time the fs is mounted,
4558 * we want the xattr to not exist anymore (same behaviour as other filesystems
4559 * with a journal, ext3/4, xfs, f2fs, etc).
4560 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_inode *inode,
				struct btrfs_path *path,
				struct btrfs_path *dst_path)
{
	int ret;
	struct btrfs_key key;
	const u64 ino = btrfs_ino(inode);
	int ins_nr = 0;
	int start_slot = 0;
	bool found_xattrs = false;

	/* Skip the tree search entirely when we know there are no xattrs. */
	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
		return 0;

	key.objectid = ino;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* Walk contiguous xattr items and copy them to the log in batches. */
	while (true) {
		int slot = path->slots[0];
		struct extent_buffer *leaf = path->nodes[0];
		int nritems = btrfs_header_nritems(leaf);

		if (slot >= nritems) {
			/* Flush the pending batch before moving to next leaf. */
			if (ins_nr > 0) {
				ret = copy_items(trans, inode, dst_path, path,
						 start_slot, ins_nr, 1, 0);
				if (ret < 0)
					return ret;
				ins_nr = 0;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
			break;

		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		found_xattrs = true;
		cond_resched();
	}
	/* Flush whatever batch is left after the walk. */
	if (ins_nr > 0) {
		ret = copy_items(trans, inode, dst_path, path,
				 start_slot, ins_nr, 1, 0);
		if (ret < 0)
			return ret;
	}

	/* Cache the "no xattrs" result to avoid future tree searches. */
	if (!found_xattrs)
		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);

	return 0;
}
4629
a89ca6f2 4630/*
0e56315c
FM
4631 * When using the NO_HOLES feature if we punched a hole that causes the
4632 * deletion of entire leafs or all the extent items of the first leaf (the one
4633 * that contains the inode item and references) we may end up not processing
4634 * any extents, because there are no leafs with a generation matching the
4635 * current transaction that have extent items for our inode. So we need to find
4636 * if any holes exist and then log them. We also need to log holes after any
4637 * truncate operation that changes the inode's size.
a89ca6f2 4638 */
0e56315c
FM
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_inode *inode,
			   struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	const u64 ino = btrfs_ino(inode);
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 prev_extent_end = 0;
	int ret;

	/* Holes are only implicit (and need logging) with the NO_HOLES feature. */
	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
		return 0;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* Walk all extent items, logging the gap before each one. */
	while (true) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		/* We have a hole, log it. */
		if (prev_extent_end < key.offset) {
			const u64 hole_len = key.offset - prev_extent_end;

			/*
			 * Release the path to avoid deadlocks with other code
			 * paths that search the root while holding locks on
			 * leafs from the log root.
			 */
			btrfs_release_path(path);
			ret = btrfs_insert_file_extent(trans, root->log_root,
						       ino, prev_extent_end, 0,
						       0, hole_len, 0, hole_len,
						       0, 0, 0);
			if (ret < 0)
				return ret;

			/*
			 * Search for the same key again in the root. Since it's
			 * an extent item and we are holding the inode lock, the
			 * key must still exist. If it doesn't just emit warning
			 * and return an error to fall back to a transaction
			 * commit.
			 */
			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			if (ret < 0)
				return ret;
			if (WARN_ON(ret > 0))
				return -ENOENT;
			leaf = path->nodes[0];
		}

		prev_extent_end = btrfs_file_extent_end(path);
		path->slots[0]++;
		cond_resched();
	}

	/* Log the trailing hole between the last extent and i_size, if any. */
	if (prev_extent_end < i_size) {
		u64 hole_len;

		btrfs_release_path(path);
		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
		ret = btrfs_insert_file_extent(trans, root->log_root,
					       ino, prev_extent_end, 0, 0,
					       hole_len, 0, hole_len,
					       0, 0, 0);
		if (ret < 0)
			return ret;
	}

	return 0;
}
4732
56f23fdb
FM
4733/*
4734 * When we are logging a new inode X, check if it doesn't have a reference that
4735 * matches the reference from some other inode Y created in a past transaction
4736 * and that was renamed in the current transaction. If we don't do this, then at
4737 * log replay time we can lose inode Y (and all its files if it's a directory):
4738 *
4739 * mkdir /mnt/x
4740 * echo "hello world" > /mnt/x/foobar
4741 * sync
4742 * mv /mnt/x /mnt/y
4743 * mkdir /mnt/x # or touch /mnt/x
4744 * xfs_io -c fsync /mnt/x
4745 * <power fail>
4746 * mount fs, trigger log replay
4747 *
4748 * After the log replay procedure, we would lose the first directory and all its
4749 * files (file foobar).
4750 * For the case where inode Y is not a directory we simply end up losing it:
4751 *
4752 * echo "123" > /mnt/foo
4753 * sync
4754 * mv /mnt/foo /mnt/bar
4755 * echo "abc" > /mnt/foo
4756 * xfs_io -c fsync /mnt/foo
4757 * <power fail>
4758 *
4759 * We also need this for cases where a snapshot entry is replaced by some other
4760 * entry (file or directory) otherwise we end up with an unreplayable log due to
4761 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4762 * if it were a regular entry:
4763 *
4764 * mkdir /mnt/x
4765 * btrfs subvolume snapshot /mnt /mnt/x/snap
4766 * btrfs subvolume delete /mnt/x/snap
4767 * rmdir /mnt/x
4768 * mkdir /mnt/x
4769 * fsync /mnt/x or fsync some new file inside it
4770 * <power fail>
4771 *
4772 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4773 * the same transaction.
4774 */
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
					 const int slot,
					 const struct btrfs_key *key,
					 struct btrfs_inode *inode,
					 u64 *other_ino, u64 *other_parent)
{
	int ret;
	struct btrfs_path *search_path;
	char *name = NULL;
	u32 name_len = 0;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	u32 cur_offset = 0;
	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);

	search_path = btrfs_alloc_path();
	if (!search_path)
		return -ENOMEM;
	/* Search the commit root, without taking locks. */
	search_path->search_commit_root = 1;
	search_path->skip_locking = 1;

	/*
	 * An INODE_REF/INODE_EXTREF item packs multiple refs; walk each one
	 * and check if its (parent dir, name) pair points at a different
	 * inode in the commit root.
	 */
	while (cur_offset < item_size) {
		u64 parent;
		u32 this_name_len;
		u32 this_len;
		unsigned long name_ptr;
		struct btrfs_dir_item *di;

		if (key->type == BTRFS_INODE_REF_KEY) {
			struct btrfs_inode_ref *iref;

			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
			parent = key->offset;
			this_name_len = btrfs_inode_ref_name_len(eb, iref);
			name_ptr = (unsigned long)(iref + 1);
			this_len = sizeof(*iref) + this_name_len;
		} else {
			struct btrfs_inode_extref *extref;

			extref = (struct btrfs_inode_extref *)(ptr +
							       cur_offset);
			parent = btrfs_inode_extref_parent(eb, extref);
			this_name_len = btrfs_inode_extref_name_len(eb, extref);
			name_ptr = (unsigned long)&extref->name;
			this_len = sizeof(*extref) + this_name_len;
		}

		/* Grow the name buffer only when the current ref needs it. */
		if (this_name_len > name_len) {
			char *new_name;

			new_name = krealloc(name, this_name_len, GFP_NOFS);
			if (!new_name) {
				ret = -ENOMEM;
				goto out;
			}
			name_len = this_name_len;
			name = new_name;
		}

		read_extent_buffer(eb, name, name_ptr, this_name_len);
		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
					   parent, name, this_name_len, 0);
		if (di && !IS_ERR(di)) {
			struct btrfs_key di_key;

			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
						  di, &di_key);
			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
				if (di_key.objectid != key->objectid) {
					/* Name is taken by another inode. */
					ret = 1;
					*other_ino = di_key.objectid;
					*other_parent = parent;
				} else {
					ret = 0;
				}
			} else {
				/* Non-inode entry (e.g. subvolume): caller must retry/commit. */
				ret = -EAGAIN;
			}
			goto out;
		} else if (IS_ERR(di)) {
			ret = PTR_ERR(di);
			goto out;
		}
		btrfs_release_path(search_path);

		cur_offset += this_len;
	}
	ret = 0;
out:
	btrfs_free_path(search_path);
	kfree(name);
	return ret;
}
4867
6b5fc433
FM
/*
 * List entry used by log_conflicting_inodes() to queue conflicting inodes:
 * the inode number and the objectid of the parent directory the conflicting
 * name was found in.
 */
struct btrfs_ino_list {
	u64 ino;		/* inode number of the conflicting inode */
	u64 parent;		/* objectid of the parent directory */
	struct list_head list;
};
4873
4874static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4875 struct btrfs_root *root,
4876 struct btrfs_path *path,
4877 struct btrfs_log_ctx *ctx,
a3baaf0d 4878 u64 ino, u64 parent)
6b5fc433
FM
4879{
4880 struct btrfs_ino_list *ino_elem;
4881 LIST_HEAD(inode_list);
4882 int ret = 0;
4883
4884 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4885 if (!ino_elem)
4886 return -ENOMEM;
4887 ino_elem->ino = ino;
a3baaf0d 4888 ino_elem->parent = parent;
6b5fc433
FM
4889 list_add_tail(&ino_elem->list, &inode_list);
4890
4891 while (!list_empty(&inode_list)) {
4892 struct btrfs_fs_info *fs_info = root->fs_info;
4893 struct btrfs_key key;
4894 struct inode *inode;
4895
4896 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4897 list);
4898 ino = ino_elem->ino;
a3baaf0d 4899 parent = ino_elem->parent;
6b5fc433
FM
4900 list_del(&ino_elem->list);
4901 kfree(ino_elem);
4902 if (ret)
4903 continue;
4904
4905 btrfs_release_path(path);
4906
0202e83f 4907 inode = btrfs_iget(fs_info->sb, ino, root);
6b5fc433
FM
4908 /*
4909 * If the other inode that had a conflicting dir entry was
a3baaf0d
FM
4910 * deleted in the current transaction, we need to log its parent
4911 * directory.
6b5fc433
FM
4912 */
4913 if (IS_ERR(inode)) {
4914 ret = PTR_ERR(inode);
a3baaf0d 4915 if (ret == -ENOENT) {
0202e83f 4916 inode = btrfs_iget(fs_info->sb, parent, root);
a3baaf0d
FM
4917 if (IS_ERR(inode)) {
4918 ret = PTR_ERR(inode);
4919 } else {
4920 ret = btrfs_log_inode(trans, root,
4921 BTRFS_I(inode),
4922 LOG_OTHER_INODE_ALL,
48778179 4923 ctx);
410f954c 4924 btrfs_add_delayed_iput(inode);
a3baaf0d
FM
4925 }
4926 }
6b5fc433
FM
4927 continue;
4928 }
b5e4ff9d
FM
4929 /*
4930 * If the inode was already logged skip it - otherwise we can
4931 * hit an infinite loop. Example:
4932 *
4933 * From the commit root (previous transaction) we have the
4934 * following inodes:
4935 *
4936 * inode 257 a directory
4937 * inode 258 with references "zz" and "zz_link" on inode 257
4938 * inode 259 with reference "a" on inode 257
4939 *
4940 * And in the current (uncommitted) transaction we have:
4941 *
4942 * inode 257 a directory, unchanged
4943 * inode 258 with references "a" and "a2" on inode 257
4944 * inode 259 with reference "zz_link" on inode 257
4945 * inode 261 with reference "zz" on inode 257
4946 *
4947 * When logging inode 261 the following infinite loop could
4948 * happen if we don't skip already logged inodes:
4949 *
4950 * - we detect inode 258 as a conflicting inode, with inode 261
4951 * on reference "zz", and log it;
4952 *
4953 * - we detect inode 259 as a conflicting inode, with inode 258
4954 * on reference "a", and log it;
4955 *
4956 * - we detect inode 258 as a conflicting inode, with inode 259
4957 * on reference "zz_link", and log it - again! After this we
4958 * repeat the above steps forever.
4959 */
4960 spin_lock(&BTRFS_I(inode)->lock);
4961 /*
4962 * Check the inode's logged_trans only instead of
4963 * btrfs_inode_in_log(). This is because the last_log_commit of
4964 * the inode is not updated when we only log that it exists and
260db43c 4965 * it has the full sync bit set (see btrfs_log_inode()).
b5e4ff9d
FM
4966 */
4967 if (BTRFS_I(inode)->logged_trans == trans->transid) {
4968 spin_unlock(&BTRFS_I(inode)->lock);
4969 btrfs_add_delayed_iput(inode);
4970 continue;
4971 }
4972 spin_unlock(&BTRFS_I(inode)->lock);
6b5fc433
FM
4973 /*
4974 * We are safe logging the other inode without acquiring its
4975 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4976 * are safe against concurrent renames of the other inode as
4977 * well because during a rename we pin the log and update the
4978 * log with the new name before we unpin it.
4979 */
4980 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
48778179 4981 LOG_OTHER_INODE, ctx);
6b5fc433 4982 if (ret) {
410f954c 4983 btrfs_add_delayed_iput(inode);
6b5fc433
FM
4984 continue;
4985 }
4986
4987 key.objectid = ino;
4988 key.type = BTRFS_INODE_REF_KEY;
4989 key.offset = 0;
4990 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4991 if (ret < 0) {
410f954c 4992 btrfs_add_delayed_iput(inode);
6b5fc433
FM
4993 continue;
4994 }
4995
4996 while (true) {
4997 struct extent_buffer *leaf = path->nodes[0];
4998 int slot = path->slots[0];
4999 u64 other_ino = 0;
a3baaf0d 5000 u64 other_parent = 0;
6b5fc433
FM
5001
5002 if (slot >= btrfs_header_nritems(leaf)) {
5003 ret = btrfs_next_leaf(root, path);
5004 if (ret < 0) {
5005 break;
5006 } else if (ret > 0) {
5007 ret = 0;
5008 break;
5009 }
5010 continue;
5011 }
5012
5013 btrfs_item_key_to_cpu(leaf, &key, slot);
5014 if (key.objectid != ino ||
5015 (key.type != BTRFS_INODE_REF_KEY &&
5016 key.type != BTRFS_INODE_EXTREF_KEY)) {
5017 ret = 0;
5018 break;
5019 }
5020
5021 ret = btrfs_check_ref_name_override(leaf, slot, &key,
a3baaf0d
FM
5022 BTRFS_I(inode), &other_ino,
5023 &other_parent);
6b5fc433
FM
5024 if (ret < 0)
5025 break;
5026 if (ret > 0) {
5027 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5028 if (!ino_elem) {
5029 ret = -ENOMEM;
5030 break;
5031 }
5032 ino_elem->ino = other_ino;
a3baaf0d 5033 ino_elem->parent = other_parent;
6b5fc433
FM
5034 list_add_tail(&ino_elem->list, &inode_list);
5035 ret = 0;
5036 }
5037 path->slots[0]++;
5038 }
410f954c 5039 btrfs_add_delayed_iput(inode);
6b5fc433
FM
5040 }
5041
5042 return ret;
5043}
5044
da447009
FM
5045static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5046 struct btrfs_inode *inode,
5047 struct btrfs_key *min_key,
5048 const struct btrfs_key *max_key,
5049 struct btrfs_path *path,
5050 struct btrfs_path *dst_path,
5051 const u64 logged_isize,
5052 const bool recursive_logging,
5053 const int inode_only,
5054 struct btrfs_log_ctx *ctx,
5055 bool *need_log_inode_item)
5056{
5057 struct btrfs_root *root = inode->root;
5058 int ins_start_slot = 0;
5059 int ins_nr = 0;
5060 int ret;
5061
5062 while (1) {
5063 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5064 if (ret < 0)
5065 return ret;
5066 if (ret > 0) {
5067 ret = 0;
5068 break;
5069 }
5070again:
5071 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5072 if (min_key->objectid != max_key->objectid)
5073 break;
5074 if (min_key->type > max_key->type)
5075 break;
5076
5077 if (min_key->type == BTRFS_INODE_ITEM_KEY)
5078 *need_log_inode_item = false;
5079
5080 if ((min_key->type == BTRFS_INODE_REF_KEY ||
5081 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5082 inode->generation == trans->transid &&
5083 !recursive_logging) {
5084 u64 other_ino = 0;
5085 u64 other_parent = 0;
5086
5087 ret = btrfs_check_ref_name_override(path->nodes[0],
5088 path->slots[0], min_key, inode,
5089 &other_ino, &other_parent);
5090 if (ret < 0) {
5091 return ret;
5092 } else if (ret > 0 && ctx &&
5093 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5094 if (ins_nr > 0) {
5095 ins_nr++;
5096 } else {
5097 ins_nr = 1;
5098 ins_start_slot = path->slots[0];
5099 }
5100 ret = copy_items(trans, inode, dst_path, path,
5101 ins_start_slot, ins_nr,
5102 inode_only, logged_isize);
5103 if (ret < 0)
5104 return ret;
5105 ins_nr = 0;
5106
5107 ret = log_conflicting_inodes(trans, root, path,
5108 ctx, other_ino, other_parent);
5109 if (ret)
5110 return ret;
5111 btrfs_release_path(path);
5112 goto next_key;
5113 }
5114 }
5115
5116 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5117 if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5118 if (ins_nr == 0)
5119 goto next_slot;
5120 ret = copy_items(trans, inode, dst_path, path,
5121 ins_start_slot,
5122 ins_nr, inode_only, logged_isize);
5123 if (ret < 0)
5124 return ret;
5125 ins_nr = 0;
5126 goto next_slot;
5127 }
5128
5129 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5130 ins_nr++;
5131 goto next_slot;
5132 } else if (!ins_nr) {
5133 ins_start_slot = path->slots[0];
5134 ins_nr = 1;
5135 goto next_slot;
5136 }
5137
5138 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5139 ins_nr, inode_only, logged_isize);
5140 if (ret < 0)
5141 return ret;
5142 ins_nr = 1;
5143 ins_start_slot = path->slots[0];
5144next_slot:
5145 path->slots[0]++;
5146 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5147 btrfs_item_key_to_cpu(path->nodes[0], min_key,
5148 path->slots[0]);
5149 goto again;
5150 }
5151 if (ins_nr) {
5152 ret = copy_items(trans, inode, dst_path, path,
5153 ins_start_slot, ins_nr, inode_only,
5154 logged_isize);
5155 if (ret < 0)
5156 return ret;
5157 ins_nr = 0;
5158 }
5159 btrfs_release_path(path);
5160next_key:
5161 if (min_key->offset < (u64)-1) {
5162 min_key->offset++;
5163 } else if (min_key->type < max_key->type) {
5164 min_key->type++;
5165 min_key->offset = 0;
5166 } else {
5167 break;
5168 }
5169 }
5170 if (ins_nr)
5171 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5172 ins_nr, inode_only, logged_isize);
5173
5174 return ret;
5175}
5176
e02119d5
CM
5177/* log a single inode in the tree log.
5178 * At least one parent directory for this inode must exist in the tree
5179 * or be logged already.
5180 *
5181 * Any items from this inode changed by the current transaction are copied
5182 * to the log tree. An extra reference is taken on any extents in this
5183 * file, allowing us to avoid a whole pile of corner cases around logging
5184 * blocks that have been removed from the tree.
5185 *
5186 * See LOG_INODE_ALL and related defines for a description of what inode_only
5187 * does.
5188 *
5189 * This handles both files and directories.
5190 */
12fcfd22 5191static int btrfs_log_inode(struct btrfs_trans_handle *trans,
a59108a7 5192 struct btrfs_root *root, struct btrfs_inode *inode,
49dae1bc 5193 int inode_only,
8407f553 5194 struct btrfs_log_ctx *ctx)
e02119d5
CM
5195{
5196 struct btrfs_path *path;
5197 struct btrfs_path *dst_path;
5198 struct btrfs_key min_key;
5199 struct btrfs_key max_key;
5200 struct btrfs_root *log = root->log_root;
4a500fd1 5201 int err = 0;
8c8648dd 5202 int ret = 0;
5dc562c5 5203 bool fast_search = false;
a59108a7
NB
5204 u64 ino = btrfs_ino(inode);
5205 struct extent_map_tree *em_tree = &inode->extent_tree;
1a4bcf47 5206 u64 logged_isize = 0;
e4545de5 5207 bool need_log_inode_item = true;
9a8fca62 5208 bool xattrs_logged = false;
a3baaf0d 5209 bool recursive_logging = false;
e02119d5 5210
e02119d5 5211 path = btrfs_alloc_path();
5df67083
TI
5212 if (!path)
5213 return -ENOMEM;
e02119d5 5214 dst_path = btrfs_alloc_path();
5df67083
TI
5215 if (!dst_path) {
5216 btrfs_free_path(path);
5217 return -ENOMEM;
5218 }
e02119d5 5219
33345d01 5220 min_key.objectid = ino;
e02119d5
CM
5221 min_key.type = BTRFS_INODE_ITEM_KEY;
5222 min_key.offset = 0;
5223
33345d01 5224 max_key.objectid = ino;
12fcfd22 5225
12fcfd22 5226
5dc562c5 5227 /* today the code can only do partial logging of directories */
a59108a7 5228 if (S_ISDIR(inode->vfs_inode.i_mode) ||
5269b67e 5229 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 5230 &inode->runtime_flags) &&
781feef7 5231 inode_only >= LOG_INODE_EXISTS))
e02119d5
CM
5232 max_key.type = BTRFS_XATTR_ITEM_KEY;
5233 else
5234 max_key.type = (u8)-1;
5235 max_key.offset = (u64)-1;
5236
2c2c452b 5237 /*
5aa7d1a7
FM
5238 * Only run delayed items if we are a directory. We want to make sure
5239 * all directory indexes hit the fs/subvolume tree so we can find them
5240 * and figure out which index ranges have to be logged.
5241 *
8c8648dd
FM
5242 * Otherwise commit the delayed inode only if the full sync flag is set,
5243 * as we want to make sure an up to date version is in the subvolume
5244 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5245 * it to the log tree. For a non full sync, we always log the inode item
5246 * based on the in-memory struct btrfs_inode which is always up to date.
2c2c452b 5247 */
5aa7d1a7 5248 if (S_ISDIR(inode->vfs_inode.i_mode))
a59108a7 5249 ret = btrfs_commit_inode_delayed_items(trans, inode);
8c8648dd 5250 else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
a59108a7 5251 ret = btrfs_commit_inode_delayed_inode(inode);
2c2c452b
FM
5252
5253 if (ret) {
5254 btrfs_free_path(path);
5255 btrfs_free_path(dst_path);
5256 return ret;
16cdcec7
MX
5257 }
5258
a3baaf0d
FM
5259 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5260 recursive_logging = true;
5261 if (inode_only == LOG_OTHER_INODE)
5262 inode_only = LOG_INODE_EXISTS;
5263 else
5264 inode_only = LOG_INODE_ALL;
a59108a7 5265 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
781feef7 5266 } else {
a59108a7 5267 mutex_lock(&inode->log_mutex);
781feef7 5268 }
e02119d5
CM
5269
5270 /*
5271 * a brute force approach to making sure we get the most uptodate
5272 * copies of everything.
5273 */
a59108a7 5274 if (S_ISDIR(inode->vfs_inode.i_mode)) {
e02119d5
CM
5275 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5276
4f764e51
FM
5277 if (inode_only == LOG_INODE_EXISTS)
5278 max_key_type = BTRFS_XATTR_ITEM_KEY;
33345d01 5279 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
e02119d5 5280 } else {
1a4bcf47
FM
5281 if (inode_only == LOG_INODE_EXISTS) {
5282 /*
5283 * Make sure the new inode item we write to the log has
5284 * the same isize as the current one (if it exists).
5285 * This is necessary to prevent data loss after log
5286 * replay, and also to prevent doing a wrong expanding
5287 * truncate - for e.g. create file, write 4K into offset
5288 * 0, fsync, write 4K into offset 4096, add hard link,
5289 * fsync some other file (to sync log), power fail - if
5290 * we use the inode's current i_size, after log replay
5291 * we get a 8Kb file, with the last 4Kb extent as a hole
5292 * (zeroes), as if an expanding truncate happened,
5293 * instead of getting a file of 4Kb only.
5294 */
a59108a7 5295 err = logged_inode_size(log, inode, path, &logged_isize);
1a4bcf47
FM
5296 if (err)
5297 goto out_unlock;
5298 }
a742994a 5299 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 5300 &inode->runtime_flags)) {
a742994a 5301 if (inode_only == LOG_INODE_EXISTS) {
4f764e51 5302 max_key.type = BTRFS_XATTR_ITEM_KEY;
a742994a
FM
5303 ret = drop_objectid_items(trans, log, path, ino,
5304 max_key.type);
5305 } else {
5306 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 5307 &inode->runtime_flags);
a742994a 5308 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
a59108a7 5309 &inode->runtime_flags);
28ed1345
CM
5310 while(1) {
5311 ret = btrfs_truncate_inode_items(trans,
50743398 5312 log, inode, 0, 0);
28ed1345
CM
5313 if (ret != -EAGAIN)
5314 break;
5315 }
a742994a 5316 }
4f764e51 5317 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
a59108a7 5318 &inode->runtime_flags) ||
6cfab851 5319 inode_only == LOG_INODE_EXISTS) {
4f764e51 5320 if (inode_only == LOG_INODE_ALL)
183f37fa 5321 fast_search = true;
4f764e51 5322 max_key.type = BTRFS_XATTR_ITEM_KEY;
5dc562c5 5323 ret = drop_objectid_items(trans, log, path, ino,
e9976151 5324 max_key.type);
a95249b3
JB
5325 } else {
5326 if (inode_only == LOG_INODE_ALL)
5327 fast_search = true;
a95249b3 5328 goto log_extents;
5dc562c5 5329 }
a95249b3 5330
e02119d5 5331 }
4a500fd1
YZ
5332 if (ret) {
5333 err = ret;
5334 goto out_unlock;
5335 }
e02119d5 5336
da447009
FM
5337 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5338 path, dst_path, logged_isize,
7af59743
FM
5339 recursive_logging, inode_only, ctx,
5340 &need_log_inode_item);
da447009
FM
5341 if (err)
5342 goto out_unlock;
5dc562c5 5343
36283bf7
FM
5344 btrfs_release_path(path);
5345 btrfs_release_path(dst_path);
a59108a7 5346 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
36283bf7
FM
5347 if (err)
5348 goto out_unlock;
9a8fca62 5349 xattrs_logged = true;
a89ca6f2
FM
5350 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5351 btrfs_release_path(path);
5352 btrfs_release_path(dst_path);
7af59743 5353 err = btrfs_log_holes(trans, root, inode, path);
a89ca6f2
FM
5354 if (err)
5355 goto out_unlock;
5356 }
a95249b3 5357log_extents:
f3b15ccd
JB
5358 btrfs_release_path(path);
5359 btrfs_release_path(dst_path);
e4545de5 5360 if (need_log_inode_item) {
a59108a7 5361 err = log_inode_item(trans, log, dst_path, inode);
9a8fca62
FM
5362 if (!err && !xattrs_logged) {
5363 err = btrfs_log_all_xattrs(trans, root, inode, path,
5364 dst_path);
5365 btrfs_release_path(path);
5366 }
e4545de5
FM
5367 if (err)
5368 goto out_unlock;
5369 }
5dc562c5 5370 if (fast_search) {
a59108a7 5371 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
48778179 5372 ctx);
5dc562c5
JB
5373 if (ret) {
5374 err = ret;
5375 goto out_unlock;
5376 }
d006a048 5377 } else if (inode_only == LOG_INODE_ALL) {
06d3d22b
LB
5378 struct extent_map *em, *n;
5379
49dae1bc 5380 write_lock(&em_tree->lock);
48778179
FM
5381 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5382 list_del_init(&em->list);
49dae1bc 5383 write_unlock(&em_tree->lock);
5dc562c5
JB
5384 }
5385
a59108a7
NB
5386 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5387 ret = log_directory_changes(trans, root, inode, path, dst_path,
5388 ctx);
4a500fd1
YZ
5389 if (ret) {
5390 err = ret;
5391 goto out_unlock;
5392 }
e02119d5 5393 }
49dae1bc 5394
d1d832a0 5395 /*
75b463d2
FM
5396 * If we are logging that an ancestor inode exists as part of logging a
5397 * new name from a link or rename operation, don't mark the inode as
5398 * logged - otherwise if an explicit fsync is made against an ancestor,
5399 * the fsync considers the inode in the log and doesn't sync the log,
5400 * resulting in the ancestor missing after a power failure unless the
5401 * log was synced as part of an fsync against any other unrelated inode.
5402 * So keep it simple for this case and just don't flag the ancestors as
5403 * logged.
d1d832a0 5404 */
75b463d2
FM
5405 if (!ctx ||
5406 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5407 &inode->vfs_inode != ctx->inode)) {
5408 spin_lock(&inode->lock);
5409 inode->logged_trans = trans->transid;
5410 /*
5411 * Don't update last_log_commit if we logged that an inode exists
5412 * after it was loaded to memory (full_sync bit set).
5413 * This is to prevent data loss when we do a write to the inode,
5414 * then the inode gets evicted after all delalloc was flushed,
5415 * then we log it exists (due to a rename for example) and then
5416 * fsync it. This last fsync would do nothing (not logging the
5417 * extents previously written).
5418 */
5419 if (inode_only != LOG_INODE_EXISTS ||
5420 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5421 inode->last_log_commit = inode->last_sub_trans;
5422 spin_unlock(&inode->lock);
5423 }
4a500fd1 5424out_unlock:
a59108a7 5425 mutex_unlock(&inode->log_mutex);
e02119d5
CM
5426
5427 btrfs_free_path(path);
5428 btrfs_free_path(dst_path);
4a500fd1 5429 return err;
e02119d5
CM
5430}
5431
2be63d5c
FM
5432/*
5433 * Check if we must fallback to a transaction commit when logging an inode.
5434 * This must be called after logging the inode and is used only in the context
5435 * when fsyncing an inode requires the need to log some other inode - in which
5436 * case we can't lock the i_mutex of each other inode we need to log as that
5437 * can lead to deadlocks with concurrent fsync against other inodes (as we can
5438 * log inodes up or down in the hierarchy) or rename operations for example. So
5439 * we take the log_mutex of the inode after we have logged it and then check for
5440 * its last_unlink_trans value - this is safe because any task setting
5441 * last_unlink_trans must take the log_mutex and it must do this before it does
5442 * the actual unlink operation, so if we do this check before a concurrent task
5443 * sets last_unlink_trans it means we've logged a consistent version/state of
5444 * all the inode items, otherwise we are not sure and must do a transaction
01327610 5445 * commit (the concurrent task might have only updated last_unlink_trans before
2be63d5c
FM
5446 * we logged the inode or it might have also done the unlink).
5447 */
5448static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
ab1717b2 5449 struct btrfs_inode *inode)
2be63d5c 5450{
ab1717b2 5451 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2be63d5c
FM
5452 bool ret = false;
5453
ab1717b2
NB
5454 mutex_lock(&inode->log_mutex);
5455 if (inode->last_unlink_trans > fs_info->last_trans_committed) {
2be63d5c
FM
5456 /*
5457 * Make sure any commits to the log are forced to be full
5458 * commits.
5459 */
90787766 5460 btrfs_set_log_full_commit(trans);
2be63d5c
FM
5461 ret = true;
5462 }
ab1717b2 5463 mutex_unlock(&inode->log_mutex);
2be63d5c
FM
5464
5465 return ret;
5466}
5467
12fcfd22
CM
5468/*
5469 * follow the dentry parent pointers up the chain and see if any
5470 * of the directories in it require a full commit before they can
5471 * be logged. Returns zero if nothing special needs to be done or 1 if
5472 * a full commit is required.
5473 */
5474static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
aefa6115 5475 struct btrfs_inode *inode,
12fcfd22
CM
5476 struct dentry *parent,
5477 struct super_block *sb,
5478 u64 last_committed)
e02119d5 5479{
12fcfd22 5480 int ret = 0;
6a912213 5481 struct dentry *old_parent = NULL;
e02119d5 5482
af4176b4
CM
5483 /*
5484 * for regular files, if its inode is already on disk, we don't
5485 * have to worry about the parents at all. This is because
5486 * we can use the last_unlink_trans field to record renames
5487 * and other fun in this file.
5488 */
aefa6115
NB
5489 if (S_ISREG(inode->vfs_inode.i_mode) &&
5490 inode->generation <= last_committed &&
5491 inode->last_unlink_trans <= last_committed)
5492 goto out;
af4176b4 5493
aefa6115 5494 if (!S_ISDIR(inode->vfs_inode.i_mode)) {
fc64005c 5495 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
12fcfd22 5496 goto out;
aefa6115 5497 inode = BTRFS_I(d_inode(parent));
12fcfd22
CM
5498 }
5499
5500 while (1) {
aefa6115 5501 if (btrfs_must_commit_transaction(trans, inode)) {
12fcfd22
CM
5502 ret = 1;
5503 break;
5504 }
5505
fc64005c 5506 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
12fcfd22
CM
5507 break;
5508
44f714da 5509 if (IS_ROOT(parent)) {
aefa6115
NB
5510 inode = BTRFS_I(d_inode(parent));
5511 if (btrfs_must_commit_transaction(trans, inode))
44f714da 5512 ret = 1;
12fcfd22 5513 break;
44f714da 5514 }
12fcfd22 5515
6a912213
JB
5516 parent = dget_parent(parent);
5517 dput(old_parent);
5518 old_parent = parent;
aefa6115 5519 inode = BTRFS_I(d_inode(parent));
12fcfd22
CM
5520
5521 }
6a912213 5522 dput(old_parent);
12fcfd22 5523out:
e02119d5
CM
5524 return ret;
5525}
5526
2f2ff0ee
FM
5527struct btrfs_dir_list {
5528 u64 ino;
5529 struct list_head list;
5530};
5531
5532/*
5533 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5534 * details about the why it is needed.
5535 * This is a recursive operation - if an existing dentry corresponds to a
5536 * directory, that directory's new entries are logged too (same behaviour as
5537 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5538 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5539 * complains about the following circular lock dependency / possible deadlock:
5540 *
5541 * CPU0 CPU1
5542 * ---- ----
5543 * lock(&type->i_mutex_dir_key#3/2);
5544 * lock(sb_internal#2);
5545 * lock(&type->i_mutex_dir_key#3/2);
5546 * lock(&sb->s_type->i_mutex_key#14);
5547 *
5548 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5549 * sb_start_intwrite() in btrfs_start_transaction().
5550 * Not locking i_mutex of the inodes is still safe because:
5551 *
5552 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5553 * that while logging the inode new references (names) are added or removed
5554 * from the inode, leaving the logged inode item with a link count that does
5555 * not match the number of logged inode reference items. This is fine because
5556 * at log replay time we compute the real number of links and correct the
5557 * link count in the inode item (see replay_one_buffer() and
5558 * link_to_fixup_dir());
5559 *
5560 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5561 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5562 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5563 * has a size that doesn't match the sum of the lengths of all the logged
5564 * names. This does not result in a problem because if a dir_item key is
5565 * logged but its matching dir_index key is not logged, at log replay time we
5566 * don't use it to replay the respective name (see replay_one_name()). On the
5567 * other hand if only the dir_index key ends up being logged, the respective
5568 * name is added to the fs/subvol tree with both the dir_item and dir_index
5569 * keys created (see replay_one_name()).
5570 * The directory's inode item with a wrong i_size is not a problem as well,
5571 * since we don't use it at log replay time to set the i_size in the inode
5572 * item of the fs/subvol tree (see overwrite_item()).
5573 */
5574static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5575 struct btrfs_root *root,
51cc0d32 5576 struct btrfs_inode *start_inode,
2f2ff0ee
FM
5577 struct btrfs_log_ctx *ctx)
5578{
0b246afa 5579 struct btrfs_fs_info *fs_info = root->fs_info;
2f2ff0ee
FM
5580 struct btrfs_root *log = root->log_root;
5581 struct btrfs_path *path;
5582 LIST_HEAD(dir_list);
5583 struct btrfs_dir_list *dir_elem;
5584 int ret = 0;
5585
5586 path = btrfs_alloc_path();
5587 if (!path)
5588 return -ENOMEM;
5589
5590 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5591 if (!dir_elem) {
5592 btrfs_free_path(path);
5593 return -ENOMEM;
5594 }
51cc0d32 5595 dir_elem->ino = btrfs_ino(start_inode);
2f2ff0ee
FM
5596 list_add_tail(&dir_elem->list, &dir_list);
5597
5598 while (!list_empty(&dir_list)) {
5599 struct extent_buffer *leaf;
5600 struct btrfs_key min_key;
5601 int nritems;
5602 int i;
5603
5604 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5605 list);
5606 if (ret)
5607 goto next_dir_inode;
5608
5609 min_key.objectid = dir_elem->ino;
5610 min_key.type = BTRFS_DIR_ITEM_KEY;
5611 min_key.offset = 0;
5612again:
5613 btrfs_release_path(path);
5614 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5615 if (ret < 0) {
5616 goto next_dir_inode;
5617 } else if (ret > 0) {
5618 ret = 0;
5619 goto next_dir_inode;
5620 }
5621
5622process_leaf:
5623 leaf = path->nodes[0];
5624 nritems = btrfs_header_nritems(leaf);
5625 for (i = path->slots[0]; i < nritems; i++) {
5626 struct btrfs_dir_item *di;
5627 struct btrfs_key di_key;
5628 struct inode *di_inode;
5629 struct btrfs_dir_list *new_dir_elem;
5630 int log_mode = LOG_INODE_EXISTS;
5631 int type;
5632
5633 btrfs_item_key_to_cpu(leaf, &min_key, i);
5634 if (min_key.objectid != dir_elem->ino ||
5635 min_key.type != BTRFS_DIR_ITEM_KEY)
5636 goto next_dir_inode;
5637
5638 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5639 type = btrfs_dir_type(leaf, di);
5640 if (btrfs_dir_transid(leaf, di) < trans->transid &&
5641 type != BTRFS_FT_DIR)
5642 continue;
5643 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5644 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5645 continue;
5646
ec125cfb 5647 btrfs_release_path(path);
0202e83f 5648 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
2f2ff0ee
FM
5649 if (IS_ERR(di_inode)) {
5650 ret = PTR_ERR(di_inode);
5651 goto next_dir_inode;
5652 }
5653
0f8939b8 5654 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
410f954c 5655 btrfs_add_delayed_iput(di_inode);
ec125cfb 5656 break;
2f2ff0ee
FM
5657 }
5658
5659 ctx->log_new_dentries = false;
3f9749f6 5660 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
2f2ff0ee 5661 log_mode = LOG_INODE_ALL;
a59108a7 5662 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
48778179 5663 log_mode, ctx);
2be63d5c 5664 if (!ret &&
ab1717b2 5665 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
2be63d5c 5666 ret = 1;
410f954c 5667 btrfs_add_delayed_iput(di_inode);
2f2ff0ee
FM
5668 if (ret)
5669 goto next_dir_inode;
5670 if (ctx->log_new_dentries) {
5671 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5672 GFP_NOFS);
5673 if (!new_dir_elem) {
5674 ret = -ENOMEM;
5675 goto next_dir_inode;
5676 }
5677 new_dir_elem->ino = di_key.objectid;
5678 list_add_tail(&new_dir_elem->list, &dir_list);
5679 }
5680 break;
5681 }
5682 if (i == nritems) {
5683 ret = btrfs_next_leaf(log, path);
5684 if (ret < 0) {
5685 goto next_dir_inode;
5686 } else if (ret > 0) {
5687 ret = 0;
5688 goto next_dir_inode;
5689 }
5690 goto process_leaf;
5691 }
5692 if (min_key.offset < (u64)-1) {
5693 min_key.offset++;
5694 goto again;
5695 }
5696next_dir_inode:
5697 list_del(&dir_elem->list);
5698 kfree(dir_elem);
5699 }
5700
5701 btrfs_free_path(path);
5702 return ret;
5703}
5704
18aa0922 5705static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
d0a0b78d 5706 struct btrfs_inode *inode,
18aa0922
FM
5707 struct btrfs_log_ctx *ctx)
5708{
3ffbd68c 5709 struct btrfs_fs_info *fs_info = trans->fs_info;
18aa0922
FM
5710 int ret;
5711 struct btrfs_path *path;
5712 struct btrfs_key key;
d0a0b78d
NB
5713 struct btrfs_root *root = inode->root;
5714 const u64 ino = btrfs_ino(inode);
18aa0922
FM
5715
5716 path = btrfs_alloc_path();
5717 if (!path)
5718 return -ENOMEM;
5719 path->skip_locking = 1;
5720 path->search_commit_root = 1;
5721
5722 key.objectid = ino;
5723 key.type = BTRFS_INODE_REF_KEY;
5724 key.offset = 0;
5725 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5726 if (ret < 0)
5727 goto out;
5728
5729 while (true) {
5730 struct extent_buffer *leaf = path->nodes[0];
5731 int slot = path->slots[0];
5732 u32 cur_offset = 0;
5733 u32 item_size;
5734 unsigned long ptr;
5735
5736 if (slot >= btrfs_header_nritems(leaf)) {
5737 ret = btrfs_next_leaf(root, path);
5738 if (ret < 0)
5739 goto out;
5740 else if (ret > 0)
5741 break;
5742 continue;
5743 }
5744
5745 btrfs_item_key_to_cpu(leaf, &key, slot);
5746 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5747 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5748 break;
5749
5750 item_size = btrfs_item_size_nr(leaf, slot);
5751 ptr = btrfs_item_ptr_offset(leaf, slot);
5752 while (cur_offset < item_size) {
5753 struct btrfs_key inode_key;
5754 struct inode *dir_inode;
5755
5756 inode_key.type = BTRFS_INODE_ITEM_KEY;
5757 inode_key.offset = 0;
5758
5759 if (key.type == BTRFS_INODE_EXTREF_KEY) {
5760 struct btrfs_inode_extref *extref;
5761
5762 extref = (struct btrfs_inode_extref *)
5763 (ptr + cur_offset);
5764 inode_key.objectid = btrfs_inode_extref_parent(
5765 leaf, extref);
5766 cur_offset += sizeof(*extref);
5767 cur_offset += btrfs_inode_extref_name_len(leaf,
5768 extref);
5769 } else {
5770 inode_key.objectid = key.offset;
5771 cur_offset = item_size;
5772 }
5773
0202e83f
DS
5774 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5775 root);
0f375eed
FM
5776 /*
5777 * If the parent inode was deleted, return an error to
5778 * fallback to a transaction commit. This is to prevent
5779 * getting an inode that was moved from one parent A to
5780 * a parent B, got its former parent A deleted and then
5781 * it got fsync'ed, from existing at both parents after
5782 * a log replay (and the old parent still existing).
5783 * Example:
5784 *
5785 * mkdir /mnt/A
5786 * mkdir /mnt/B
5787 * touch /mnt/B/bar
5788 * sync
5789 * mv /mnt/B/bar /mnt/A/bar
5790 * mv -T /mnt/A /mnt/B
5791 * fsync /mnt/B/bar
5792 * <power fail>
5793 *
5794 * If we ignore the old parent B which got deleted,
5795 * after a log replay we would have file bar linked
5796 * at both parents and the old parent B would still
5797 * exist.
5798 */
5799 if (IS_ERR(dir_inode)) {
5800 ret = PTR_ERR(dir_inode);
5801 goto out;
5802 }
18aa0922 5803
657ed1aa
FM
5804 if (ctx)
5805 ctx->log_new_dentries = false;
a59108a7 5806 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
48778179 5807 LOG_INODE_ALL, ctx);
2be63d5c 5808 if (!ret &&
ab1717b2 5809 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
2be63d5c 5810 ret = 1;
657ed1aa
FM
5811 if (!ret && ctx && ctx->log_new_dentries)
5812 ret = log_new_dir_dentries(trans, root,
f85b7379 5813 BTRFS_I(dir_inode), ctx);
410f954c 5814 btrfs_add_delayed_iput(dir_inode);
18aa0922
FM
5815 if (ret)
5816 goto out;
5817 }
5818 path->slots[0]++;
5819 }
5820 ret = 0;
5821out:
5822 btrfs_free_path(path);
5823 return ret;
5824}
5825
b8aa330d
FM
/*
 * Walk up the chain of ancestor directories starting from the INODE_REF key
 * the path currently points at, logging (with LOG_INODE_EXISTS) every
 * ancestor whose generation shows it was created in the current transaction.
 *
 * The path is released before each inode lookup/log step, since logging an
 * inode takes tree locks of its own; the walk then re-searches for the next
 * INODE_REF of the ancestor to continue upwards.
 *
 * Returns 0 on success, -ENOENT if the ancestor chain cannot be followed
 * (no matching INODE_REF item found), or another negative errno on failure.
 */
static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		/* Drop tree locks before doing the iget and logging below. */
		btrfs_release_path(path);

		/* The offset of an INODE_REF key is the parent's inode number. */
		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		/* Only log ancestors that are new in the current transaction. */
		if (BTRFS_I(inode)->generation >= trans->transid)
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		/* Reached the subvolume's root directory, we are done. */
		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		/*
		 * Find the INODE_REF of this ancestor, which tells us its own
		 * parent, so we can keep walking upwards.
		 */
		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}
5889
/*
 * Fast path for logging new ancestors, used by log_all_new_ancestors() when
 * the inode has a single hard link: walk up the dentry chain instead of
 * searching the fs/subvolume tree, logging (LOG_INODE_EXISTS) every parent
 * directory created in the current transaction.
 *
 * The walk stops at a negative dentry, a different superblock, a different
 * root (subvolume boundary) or the root dentry.
 *
 * Returns 0 on success or a negative errno from btrfs_log_inode().
 */
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		/* Stop when crossing into a different subvolume. */
		if (root != inode->root)
			break;

		/* Only directories new in this transaction need logging. */
		if (inode->generation >= trans->transid) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		/* Take a ref on the next parent before dropping the old one. */
		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}
5926
/*
 * Log all new ancestor directories of an inode, for every hard link (parent)
 * the inode has. For a single link a fast dentry-chain walk is used;
 * otherwise all the inode's INODE_REF items are iterated and, for each one,
 * log_new_ancestors() walks and logs the corresponding ancestor chain.
 *
 * Returns 0 on success, -EMLINK when the inode has extended references
 * (caller is expected to fall back to a transaction commit), or another
 * negative errno on failure.
 */
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For a single hard link case, go through a fast path that does not
	 * need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* On an exact match, step past the key we already processed. */
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * cases and too complex to deal with (we would need to keep
		 * track of which subitem we are processing for each item in
		 * this loop, etc). So just return some error to fallback to
		 * a transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that key
		 * after logging all new ancestors for the current hard link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
6009
e02119d5
CM
6010/*
6011 * helper function around btrfs_log_inode to make sure newly created
6012 * parent directories also end up in the log. A minimal inode and backref
6013 * only logging is done of any parent directories that are older than
6014 * the last committed transaction
6015 */
48a3b636 6016static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
19df27a9 6017 struct btrfs_inode *inode,
49dae1bc 6018 struct dentry *parent,
41a1eada 6019 int inode_only,
8b050d35 6020 struct btrfs_log_ctx *ctx)
e02119d5 6021{
f882274b 6022 struct btrfs_root *root = inode->root;
0b246afa 6023 struct btrfs_fs_info *fs_info = root->fs_info;
e02119d5 6024 struct super_block *sb;
12fcfd22 6025 int ret = 0;
0b246afa 6026 u64 last_committed = fs_info->last_trans_committed;
2f2ff0ee 6027 bool log_dentries = false;
12fcfd22 6028
19df27a9 6029 sb = inode->vfs_inode.i_sb;
12fcfd22 6030
0b246afa 6031 if (btrfs_test_opt(fs_info, NOTREELOG)) {
3a5e1404
SW
6032 ret = 1;
6033 goto end_no_trans;
6034 }
6035
995946dd
MX
6036 /*
6037 * The prev transaction commit doesn't complete, we need do
6038 * full commit by ourselves.
6039 */
0b246afa
JM
6040 if (fs_info->last_trans_log_full_commit >
6041 fs_info->last_trans_committed) {
12fcfd22
CM
6042 ret = 1;
6043 goto end_no_trans;
6044 }
6045
f882274b 6046 if (btrfs_root_refs(&root->root_item) == 0) {
76dda93c
YZ
6047 ret = 1;
6048 goto end_no_trans;
6049 }
6050
19df27a9
NB
6051 ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
6052 last_committed);
12fcfd22
CM
6053 if (ret)
6054 goto end_no_trans;
e02119d5 6055
f2d72f42
FM
6056 /*
6057 * Skip already logged inodes or inodes corresponding to tmpfiles
6058 * (since logging them is pointless, a link count of 0 means they
6059 * will never be accessible).
6060 */
6061 if (btrfs_inode_in_log(inode, trans->transid) ||
6062 inode->vfs_inode.i_nlink == 0) {
257c62e1
CM
6063 ret = BTRFS_NO_LOG_SYNC;
6064 goto end_no_trans;
6065 }
6066
8b050d35 6067 ret = start_log_trans(trans, root, ctx);
4a500fd1 6068 if (ret)
e87ac136 6069 goto end_no_trans;
e02119d5 6070
48778179 6071 ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
4a500fd1
YZ
6072 if (ret)
6073 goto end_trans;
12fcfd22 6074
af4176b4
CM
6075 /*
6076 * for regular files, if its inode is already on disk, we don't
6077 * have to worry about the parents at all. This is because
6078 * we can use the last_unlink_trans field to record renames
6079 * and other fun in this file.
6080 */
19df27a9
NB
6081 if (S_ISREG(inode->vfs_inode.i_mode) &&
6082 inode->generation <= last_committed &&
6083 inode->last_unlink_trans <= last_committed) {
4a500fd1
YZ
6084 ret = 0;
6085 goto end_trans;
6086 }
af4176b4 6087
19df27a9 6088 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
2f2ff0ee
FM
6089 log_dentries = true;
6090
18aa0922 6091 /*
01327610 6092 * On unlink we must make sure all our current and old parent directory
18aa0922
FM
6093 * inodes are fully logged. This is to prevent leaving dangling
6094 * directory index entries in directories that were our parents but are
6095 * not anymore. Not doing this results in old parent directory being
6096 * impossible to delete after log replay (rmdir will always fail with
6097 * error -ENOTEMPTY).
6098 *
6099 * Example 1:
6100 *
6101 * mkdir testdir
6102 * touch testdir/foo
6103 * ln testdir/foo testdir/bar
6104 * sync
6105 * unlink testdir/bar
6106 * xfs_io -c fsync testdir/foo
6107 * <power failure>
6108 * mount fs, triggers log replay
6109 *
6110 * If we don't log the parent directory (testdir), after log replay the
6111 * directory still has an entry pointing to the file inode using the bar
6112 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6113 * the file inode has a link count of 1.
6114 *
6115 * Example 2:
6116 *
6117 * mkdir testdir
6118 * touch foo
6119 * ln foo testdir/foo2
6120 * ln foo testdir/foo3
6121 * sync
6122 * unlink testdir/foo3
6123 * xfs_io -c fsync foo
6124 * <power failure>
6125 * mount fs, triggers log replay
6126 *
6127 * Similar as the first example, after log replay the parent directory
6128 * testdir still has an entry pointing to the inode file with name foo3
6129 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6130 * and has a link count of 2.
6131 */
19df27a9 6132 if (inode->last_unlink_trans > last_committed) {
b8aa330d 6133 ret = btrfs_log_all_parents(trans, inode, ctx);
18aa0922
FM
6134 if (ret)
6135 goto end_trans;
6136 }
6137
b8aa330d
FM
6138 ret = log_all_new_ancestors(trans, inode, parent, ctx);
6139 if (ret)
41bd6067 6140 goto end_trans;
76dda93c 6141
2f2ff0ee 6142 if (log_dentries)
b8aa330d 6143 ret = log_new_dir_dentries(trans, root, inode, ctx);
2f2ff0ee
FM
6144 else
6145 ret = 0;
4a500fd1
YZ
6146end_trans:
6147 if (ret < 0) {
90787766 6148 btrfs_set_log_full_commit(trans);
4a500fd1
YZ
6149 ret = 1;
6150 }
8b050d35
MX
6151
6152 if (ret)
6153 btrfs_remove_log_ctx(root, ctx);
12fcfd22
CM
6154 btrfs_end_log_trans(root);
6155end_no_trans:
6156 return ret;
e02119d5
CM
6157}
6158
6159/*
6160 * it is not safe to log dentry if the chunk root has added new
6161 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
6162 * If this returns 1, you must commit the transaction to safely get your
6163 * data on disk.
6164 */
6165int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
e5b84f7a 6166 struct dentry *dentry,
8b050d35 6167 struct btrfs_log_ctx *ctx)
e02119d5 6168{
6a912213
JB
6169 struct dentry *parent = dget_parent(dentry);
6170 int ret;
6171
f882274b 6172 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
48778179 6173 LOG_INODE_ALL, ctx);
6a912213
JB
6174 dput(parent);
6175
6176 return ret;
e02119d5
CM
6177}
6178
6179/*
6180 * should be called during mount to recover any replay any log trees
6181 * from the FS
6182 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Mark log recovery in progress for the whole replay. */
	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	/* First pass: pin all extents referenced by the log root tree. */
	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
			"Failed to pin buffers while recovering log root tree.");
		goto error;
	}

again:
	/* Iterate all log tree roots, from the highest key downwards. */
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_handle_fs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		/* The key's offset is the id of the subvolume to replay into. */
		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		/* Fix link counts only after everything has been replayed. */
		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_find_highest_objectid(root,
						  &root->highest_objectid);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		/* Continue with the log root that has the next lower key. */
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	btrfs_free_path(path);
	return ret;
}
12fcfd22
CM
6350
6351/*
6352 * there are some corner cases where we want to force a full
6353 * commit instead of allowing a directory to be logged.
6354 *
6355 * They revolve around files there were unlinked from the directory, and
6356 * this function updates the parent directory so that a full commit is
6357 * properly done if it is fsync'd later after the unlinks are done.
2be63d5c
FM
6358 *
6359 * Must be called before the unlink operations (updates to the subvolume tree,
6360 * inodes, etc) are done.
12fcfd22
CM
6361 */
6362void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4176bdbf 6363 struct btrfs_inode *dir, struct btrfs_inode *inode,
12fcfd22
CM
6364 int for_rename)
6365{
af4176b4
CM
6366 /*
6367 * when we're logging a file, if it hasn't been renamed
6368 * or unlinked, and its inode is fully committed on disk,
6369 * we don't have to worry about walking up the directory chain
6370 * to log its parents.
6371 *
6372 * So, we use the last_unlink_trans field to put this transid
6373 * into the file. When the file is logged we check it and
6374 * don't log the parents if the file is fully on disk.
6375 */
4176bdbf
NB
6376 mutex_lock(&inode->log_mutex);
6377 inode->last_unlink_trans = trans->transid;
6378 mutex_unlock(&inode->log_mutex);
af4176b4 6379
12fcfd22
CM
6380 /*
6381 * if this directory was already logged any new
6382 * names for this file/dir will get recorded
6383 */
4176bdbf 6384 if (dir->logged_trans == trans->transid)
12fcfd22
CM
6385 return;
6386
6387 /*
6388 * if the inode we're about to unlink was logged,
6389 * the log will be properly updated for any new names
6390 */
4176bdbf 6391 if (inode->logged_trans == trans->transid)
12fcfd22
CM
6392 return;
6393
6394 /*
6395 * when renaming files across directories, if the directory
6396 * there we're unlinking from gets fsync'd later on, there's
6397 * no way to find the destination directory later and fsync it
6398 * properly. So, we have to be conservative and force commits
6399 * so the new name gets discovered.
6400 */
6401 if (for_rename)
6402 goto record;
6403
6404 /* we can safely do the unlink without any special recording */
6405 return;
6406
6407record:
4176bdbf
NB
6408 mutex_lock(&dir->log_mutex);
6409 dir->last_unlink_trans = trans->transid;
6410 mutex_unlock(&dir->log_mutex);
1ec9a1ae
FM
6411}
6412
6413/*
6414 * Make sure that if someone attempts to fsync the parent directory of a deleted
6415 * snapshot, it ends up triggering a transaction commit. This is to guarantee
6416 * that after replaying the log tree of the parent directory's root we will not
6417 * see the snapshot anymore and at log replay time we will not see any log tree
6418 * corresponding to the deleted snapshot's root, which could lead to replaying
6419 * it after replaying the log tree of the parent directory (which would replay
6420 * the snapshot delete operation).
2be63d5c
FM
6421 *
6422 * Must be called before the actual snapshot destroy operation (updates to the
6423 * parent root and tree of tree roots trees, etc) are done.
1ec9a1ae
FM
6424 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	/*
	 * Stamp the parent directory with this transaction's id, the same
	 * field used to record unlinks, so the logging code treats a later
	 * fsync of the directory accordingly (see btrfs_log_inode_parent()).
	 */
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}
6432
6433/*
6434 * Call this after adding a new name for a file and it will properly
6435 * update the log to reflect the new name.
12fcfd22 6436 */
75b463d2 6437void btrfs_log_new_name(struct btrfs_trans_handle *trans,
9ca5fbfb 6438 struct btrfs_inode *inode, struct btrfs_inode *old_dir,
75b463d2 6439 struct dentry *parent)
12fcfd22 6440{
75b463d2 6441 struct btrfs_log_ctx ctx;
12fcfd22 6442
af4176b4
CM
6443 /*
6444 * this will force the logging code to walk the dentry chain
6445 * up for the file
6446 */
9a6509c4 6447 if (!S_ISDIR(inode->vfs_inode.i_mode))
9ca5fbfb 6448 inode->last_unlink_trans = trans->transid;
af4176b4 6449
12fcfd22
CM
6450 /*
6451 * if this inode hasn't been logged and directory we're renaming it
6452 * from hasn't been logged, we don't need to log it
6453 */
de53d892
FM
6454 if (inode->logged_trans < trans->transid &&
6455 (!old_dir || old_dir->logged_trans < trans->transid))
75b463d2 6456 return;
12fcfd22 6457
75b463d2
FM
6458 btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6459 ctx.logging_new_name = true;
6460 /*
6461 * We don't care about the return value. If we fail to log the new name
6462 * then we know the next attempt to sync the log will fallback to a full
6463 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6464 * we don't need to worry about getting a log committed that has an
6465 * inconsistent state after a rename operation.
6466 */
48778179 6467 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
12fcfd22
CM
6468}
6469