/*
 * Ext4 orphan inode handling
 */
5 #include <linux/quotaops.h>
6 #include <linux/buffer_head.h>
11 static int ext4_orphan_file_add(handle_t
*handle
, struct inode
*inode
)
14 struct ext4_orphan_info
*oi
= &EXT4_SB(inode
->i_sb
)->s_orphan_info
;
18 int inodes_per_ob
= ext4_inodes_per_orphan_block(inode
->i_sb
);
22 * Find block with free orphan entry. Use CPU number for a naive hash
23 * for a search start in the orphan file
25 start
= raw_smp_processor_id()*13 % oi
->of_blocks
;
28 if (atomic_dec_if_positive(&oi
->of_binfo
[i
].ob_free_entries
)
33 if (++i
>= oi
->of_blocks
)
39 * For now we don't grow or shrink orphan file. We just use
40 * whatever was allocated at mke2fs time. The additional
41 * credits we would have to reserve for each orphan inode
42 * operation just don't seem worth it.
47 ret
= ext4_journal_get_write_access(handle
, inode
->i_sb
,
48 oi
->of_binfo
[i
].ob_bh
, EXT4_JTR_ORPHAN_FILE
);
50 atomic_inc(&oi
->of_binfo
[i
].ob_free_entries
);
54 bdata
= (__le32
*)(oi
->of_binfo
[i
].ob_bh
->b_data
);
55 /* Find empty slot in a block */
60 * Did we walk through the block several times without
61 * finding free entry? It is theoretically possible
62 * if entries get constantly allocated and freed or
63 * if the block is corrupted. Avoid indefinite looping
64 * and bail. We'll use orphan list instead.
67 atomic_inc(&oi
->of_binfo
[i
].ob_free_entries
);
73 if (++j
>= inodes_per_ob
) {
78 } while (cmpxchg(&bdata
[j
], (__le32
)0, cpu_to_le32(inode
->i_ino
)) !=
81 EXT4_I(inode
)->i_orphan_idx
= i
* inodes_per_ob
+ j
;
82 ext4_set_inode_state(inode
, EXT4_STATE_ORPHAN_FILE
);
84 return ext4_handle_dirty_metadata(handle
, NULL
, oi
->of_binfo
[i
].ob_bh
);
88 * ext4_orphan_add() links an unlinked or truncated inode into a list of
89 * such inodes, starting at the superblock, in case we crash before the
90 * file is closed/deleted, or in case the inode truncate spans multiple
91 * transactions and the last transaction is not recovered after a crash.
93 * At filesystem recovery time, we walk this list deleting unlinked
94 * inodes and truncating linked inodes in ext4_orphan_cleanup().
96 * Orphan list manipulation functions must be called under i_mutex unless
97 * we are just creating the inode or deleting it.
99 int ext4_orphan_add(handle_t
*handle
, struct inode
*inode
)
101 struct super_block
*sb
= inode
->i_sb
;
102 struct ext4_sb_info
*sbi
= EXT4_SB(sb
);
103 struct ext4_iloc iloc
;
107 if (!sbi
->s_journal
|| is_bad_inode(inode
))
110 WARN_ON_ONCE(!(inode
->i_state
& (I_NEW
| I_FREEING
)) &&
111 !inode_is_locked(inode
));
113 * Inode orphaned in orphan file or in orphan list?
115 if (ext4_test_inode_state(inode
, EXT4_STATE_ORPHAN_FILE
) ||
116 !list_empty(&EXT4_I(inode
)->i_orphan
))
120 * Orphan handling is only valid for files with data blocks
121 * being truncated, or files being unlinked. Note that we either
122 * hold i_mutex, or the inode can not be referenced from outside,
123 * so i_nlink should not be bumped due to race
125 ASSERT((S_ISREG(inode
->i_mode
) || S_ISDIR(inode
->i_mode
) ||
126 S_ISLNK(inode
->i_mode
)) || inode
->i_nlink
== 0);
128 if (sbi
->s_orphan_info
.of_blocks
) {
129 err
= ext4_orphan_file_add(handle
, inode
);
131 * Fallback to normal orphan list of orphan file is
138 BUFFER_TRACE(sbi
->s_sbh
, "get_write_access");
139 err
= ext4_journal_get_write_access(handle
, sb
, sbi
->s_sbh
,
144 err
= ext4_reserve_inode_write(handle
, inode
, &iloc
);
148 mutex_lock(&sbi
->s_orphan_lock
);
150 * Due to previous errors inode may be already a part of on-disk
151 * orphan list. If so skip on-disk list modification.
153 if (!NEXT_ORPHAN(inode
) || NEXT_ORPHAN(inode
) >
154 (le32_to_cpu(sbi
->s_es
->s_inodes_count
))) {
155 /* Insert this inode at the head of the on-disk orphan list */
156 NEXT_ORPHAN(inode
) = le32_to_cpu(sbi
->s_es
->s_last_orphan
);
157 lock_buffer(sbi
->s_sbh
);
158 sbi
->s_es
->s_last_orphan
= cpu_to_le32(inode
->i_ino
);
159 ext4_superblock_csum_set(sb
);
160 unlock_buffer(sbi
->s_sbh
);
163 list_add(&EXT4_I(inode
)->i_orphan
, &sbi
->s_orphan
);
164 mutex_unlock(&sbi
->s_orphan_lock
);
167 err
= ext4_handle_dirty_metadata(handle
, NULL
, sbi
->s_sbh
);
168 rc
= ext4_mark_iloc_dirty(handle
, inode
, &iloc
);
173 * We have to remove inode from in-memory list if
174 * addition to on disk orphan list failed. Stray orphan
175 * list entries can cause panics at unmount time.
177 mutex_lock(&sbi
->s_orphan_lock
);
178 list_del_init(&EXT4_I(inode
)->i_orphan
);
179 mutex_unlock(&sbi
->s_orphan_lock
);
184 jbd_debug(4, "superblock will point to %lu\n", inode
->i_ino
);
185 jbd_debug(4, "orphan inode %lu will point to %d\n",
186 inode
->i_ino
, NEXT_ORPHAN(inode
));
188 ext4_std_error(sb
, err
);
192 static int ext4_orphan_file_del(handle_t
*handle
, struct inode
*inode
)
194 struct ext4_orphan_info
*oi
= &EXT4_SB(inode
->i_sb
)->s_orphan_info
;
197 int inodes_per_ob
= ext4_inodes_per_orphan_block(inode
->i_sb
);
202 blk
= EXT4_I(inode
)->i_orphan_idx
/ inodes_per_ob
;
203 off
= EXT4_I(inode
)->i_orphan_idx
% inodes_per_ob
;
204 if (WARN_ON_ONCE(blk
>= oi
->of_blocks
))
207 ret
= ext4_journal_get_write_access(handle
, inode
->i_sb
,
208 oi
->of_binfo
[blk
].ob_bh
, EXT4_JTR_ORPHAN_FILE
);
212 bdata
= (__le32
*)(oi
->of_binfo
[blk
].ob_bh
->b_data
);
214 atomic_inc(&oi
->of_binfo
[blk
].ob_free_entries
);
215 ret
= ext4_handle_dirty_metadata(handle
, NULL
, oi
->of_binfo
[blk
].ob_bh
);
217 ext4_clear_inode_state(inode
, EXT4_STATE_ORPHAN_FILE
);
218 INIT_LIST_HEAD(&EXT4_I(inode
)->i_orphan
);
224 * ext4_orphan_del() removes an unlinked or truncated inode from the list
225 * of such inodes stored on disk, because it is finally being cleaned up.
227 int ext4_orphan_del(handle_t
*handle
, struct inode
*inode
)
229 struct list_head
*prev
;
230 struct ext4_inode_info
*ei
= EXT4_I(inode
);
231 struct ext4_sb_info
*sbi
= EXT4_SB(inode
->i_sb
);
233 struct ext4_iloc iloc
;
236 if (!sbi
->s_journal
&& !(sbi
->s_mount_state
& EXT4_ORPHAN_FS
))
239 WARN_ON_ONCE(!(inode
->i_state
& (I_NEW
| I_FREEING
)) &&
240 !inode_is_locked(inode
));
241 if (ext4_test_inode_state(inode
, EXT4_STATE_ORPHAN_FILE
))
242 return ext4_orphan_file_del(handle
, inode
);
244 /* Do this quick check before taking global s_orphan_lock. */
245 if (list_empty(&ei
->i_orphan
))
249 /* Grab inode buffer early before taking global s_orphan_lock */
250 err
= ext4_reserve_inode_write(handle
, inode
, &iloc
);
253 mutex_lock(&sbi
->s_orphan_lock
);
254 jbd_debug(4, "remove inode %lu from orphan list\n", inode
->i_ino
);
256 prev
= ei
->i_orphan
.prev
;
257 list_del_init(&ei
->i_orphan
);
259 /* If we're on an error path, we may not have a valid
260 * transaction handle with which to update the orphan list on
261 * disk, but we still need to remove the inode from the linked
263 if (!handle
|| err
) {
264 mutex_unlock(&sbi
->s_orphan_lock
);
268 ino_next
= NEXT_ORPHAN(inode
);
269 if (prev
== &sbi
->s_orphan
) {
270 jbd_debug(4, "superblock will point to %u\n", ino_next
);
271 BUFFER_TRACE(sbi
->s_sbh
, "get_write_access");
272 err
= ext4_journal_get_write_access(handle
, inode
->i_sb
,
273 sbi
->s_sbh
, EXT4_JTR_NONE
);
275 mutex_unlock(&sbi
->s_orphan_lock
);
278 lock_buffer(sbi
->s_sbh
);
279 sbi
->s_es
->s_last_orphan
= cpu_to_le32(ino_next
);
280 ext4_superblock_csum_set(inode
->i_sb
);
281 unlock_buffer(sbi
->s_sbh
);
282 mutex_unlock(&sbi
->s_orphan_lock
);
283 err
= ext4_handle_dirty_metadata(handle
, NULL
, sbi
->s_sbh
);
285 struct ext4_iloc iloc2
;
286 struct inode
*i_prev
=
287 &list_entry(prev
, struct ext4_inode_info
, i_orphan
)->vfs_inode
;
289 jbd_debug(4, "orphan inode %lu will point to %u\n",
290 i_prev
->i_ino
, ino_next
);
291 err
= ext4_reserve_inode_write(handle
, i_prev
, &iloc2
);
293 mutex_unlock(&sbi
->s_orphan_lock
);
296 NEXT_ORPHAN(i_prev
) = ino_next
;
297 err
= ext4_mark_iloc_dirty(handle
, i_prev
, &iloc2
);
298 mutex_unlock(&sbi
->s_orphan_lock
);
302 NEXT_ORPHAN(inode
) = 0;
303 err
= ext4_mark_iloc_dirty(handle
, inode
, &iloc
);
305 ext4_std_error(inode
->i_sb
, err
);
314 static int ext4_quota_on_mount(struct super_block
*sb
, int type
)
316 return dquot_quota_on_mount(sb
,
317 rcu_dereference_protected(EXT4_SB(sb
)->s_qf_names
[type
],
318 lockdep_is_held(&sb
->s_umount
)),
319 EXT4_SB(sb
)->s_jquota_fmt
, type
);
323 static void ext4_process_orphan(struct inode
*inode
,
324 int *nr_truncates
, int *nr_orphans
)
326 struct super_block
*sb
= inode
->i_sb
;
329 dquot_initialize(inode
);
330 if (inode
->i_nlink
) {
331 if (test_opt(sb
, DEBUG
))
332 ext4_msg(sb
, KERN_DEBUG
,
333 "%s: truncating inode %lu to %lld bytes",
334 __func__
, inode
->i_ino
, inode
->i_size
);
335 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
336 inode
->i_ino
, inode
->i_size
);
338 truncate_inode_pages(inode
->i_mapping
, inode
->i_size
);
339 ret
= ext4_truncate(inode
);
342 * We need to clean up the in-core orphan list
343 * manually if ext4_truncate() failed to get a
344 * transaction handle.
346 ext4_orphan_del(NULL
, inode
);
347 ext4_std_error(inode
->i_sb
, ret
);
352 if (test_opt(sb
, DEBUG
))
353 ext4_msg(sb
, KERN_DEBUG
,
354 "%s: deleting unreferenced inode %lu",
355 __func__
, inode
->i_ino
);
356 jbd_debug(2, "deleting unreferenced inode %lu\n",
360 iput(inode
); /* The delete magic happens here! */
363 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
364 * the superblock) which were deleted from all directories, but held open by
365 * a process at the time of a crash. We walk the list and try to delete these
366 * inodes at recovery time (only with a read-write filesystem).
368 * In order to keep the orphan inode chain consistent during traversal (in
369 * case of crash during recovery), we link each inode into the superblock
370 * orphan list_head and handle it the same way as an inode deletion during
371 * normal operation (which journals the operations for us).
373 * We only do an iget() and an iput() on each inode, which is very safe if we
374 * accidentally point at an in-use or already deleted inode. The worst that
375 * can happen in this case is that we get a "bit already cleared" message from
376 * ext4_free_inode(). The only reason we would point at a wrong inode is if
377 * e2fsck was run on this filesystem, and it must have already done the orphan
378 * inode cleanup for us, so we can safely abort without any further action.
380 void ext4_orphan_cleanup(struct super_block
*sb
, struct ext4_super_block
*es
)
382 unsigned int s_flags
= sb
->s_flags
;
383 int nr_orphans
= 0, nr_truncates
= 0;
387 int quota_update
= 0;
390 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
391 int inodes_per_ob
= ext4_inodes_per_orphan_block(sb
);
393 if (!es
->s_last_orphan
&& !oi
->of_blocks
) {
394 jbd_debug(4, "no orphan inodes to clean up\n");
398 if (bdev_read_only(sb
->s_bdev
)) {
399 ext4_msg(sb
, KERN_ERR
, "write access "
400 "unavailable, skipping orphan cleanup");
404 /* Check if feature set would not allow a r/w mount */
405 if (!ext4_feature_set_ok(sb
, 0)) {
406 ext4_msg(sb
, KERN_INFO
, "Skipping orphan cleanup due to "
407 "unknown ROCOMPAT features");
411 if (EXT4_SB(sb
)->s_mount_state
& EXT4_ERROR_FS
) {
412 /* don't clear list on RO mount w/ errors */
413 if (es
->s_last_orphan
&& !(s_flags
& SB_RDONLY
)) {
414 ext4_msg(sb
, KERN_INFO
, "Errors on filesystem, "
415 "clearing orphan list.\n");
416 es
->s_last_orphan
= 0;
418 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
422 if (s_flags
& SB_RDONLY
) {
423 ext4_msg(sb
, KERN_INFO
, "orphan cleanup on readonly fs");
424 sb
->s_flags
&= ~SB_RDONLY
;
428 * Turn on quotas which were not enabled for read-only mounts if
429 * filesystem has quota feature, so that they are updated correctly.
431 if (ext4_has_feature_quota(sb
) && (s_flags
& SB_RDONLY
)) {
432 int ret
= ext4_enable_quotas(sb
);
437 ext4_msg(sb
, KERN_ERR
,
438 "Cannot turn on quotas: error %d", ret
);
441 /* Turn on journaled quotas used for old sytle */
442 for (i
= 0; i
< EXT4_MAXQUOTAS
; i
++) {
443 if (EXT4_SB(sb
)->s_qf_names
[i
]) {
444 int ret
= ext4_quota_on_mount(sb
, i
);
449 ext4_msg(sb
, KERN_ERR
,
450 "Cannot turn on journaled "
451 "quota: type %d: error %d", i
, ret
);
456 while (es
->s_last_orphan
) {
458 * We may have encountered an error during cleanup; if
461 if (EXT4_SB(sb
)->s_mount_state
& EXT4_ERROR_FS
) {
462 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
463 es
->s_last_orphan
= 0;
467 inode
= ext4_orphan_get(sb
, le32_to_cpu(es
->s_last_orphan
));
469 es
->s_last_orphan
= 0;
473 list_add(&EXT4_I(inode
)->i_orphan
, &EXT4_SB(sb
)->s_orphan
);
474 ext4_process_orphan(inode
, &nr_truncates
, &nr_orphans
);
477 for (i
= 0; i
< oi
->of_blocks
; i
++) {
478 bdata
= (__le32
*)(oi
->of_binfo
[i
].ob_bh
->b_data
);
479 for (j
= 0; j
< inodes_per_ob
; j
++) {
482 inode
= ext4_orphan_get(sb
, le32_to_cpu(bdata
[j
]));
485 ext4_set_inode_state(inode
, EXT4_STATE_ORPHAN_FILE
);
486 EXT4_I(inode
)->i_orphan_idx
= i
* inodes_per_ob
+ j
;
487 ext4_process_orphan(inode
, &nr_truncates
, &nr_orphans
);
491 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
494 ext4_msg(sb
, KERN_INFO
, "%d orphan inode%s deleted",
497 ext4_msg(sb
, KERN_INFO
, "%d truncate%s cleaned up",
498 PLURAL(nr_truncates
));
500 /* Turn off quotas if they were enabled for orphan cleanup */
502 for (i
= 0; i
< EXT4_MAXQUOTAS
; i
++) {
503 if (sb_dqopt(sb
)->files
[i
])
504 dquot_quota_off(sb
, i
);
508 sb
->s_flags
= s_flags
; /* Restore SB_RDONLY status */
511 void ext4_release_orphan_info(struct super_block
*sb
)
514 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
518 for (i
= 0; i
< oi
->of_blocks
; i
++)
519 brelse(oi
->of_binfo
[i
].ob_bh
);
523 static struct ext4_orphan_block_tail
*ext4_orphan_block_tail(
524 struct super_block
*sb
,
525 struct buffer_head
*bh
)
527 return (struct ext4_orphan_block_tail
*)(bh
->b_data
+ sb
->s_blocksize
-
528 sizeof(struct ext4_orphan_block_tail
));
531 static int ext4_orphan_file_block_csum_verify(struct super_block
*sb
,
532 struct buffer_head
*bh
)
535 int inodes_per_ob
= ext4_inodes_per_orphan_block(sb
);
536 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
537 struct ext4_orphan_block_tail
*ot
;
538 __le64 dsk_block_nr
= cpu_to_le64(bh
->b_blocknr
);
540 if (!ext4_has_metadata_csum(sb
))
543 ot
= ext4_orphan_block_tail(sb
, bh
);
544 calculated
= ext4_chksum(EXT4_SB(sb
), oi
->of_csum_seed
,
545 (__u8
*)&dsk_block_nr
, sizeof(dsk_block_nr
));
546 calculated
= ext4_chksum(EXT4_SB(sb
), calculated
, (__u8
*)bh
->b_data
,
547 inodes_per_ob
* sizeof(__u32
));
548 return le32_to_cpu(ot
->ob_checksum
) == calculated
;
551 /* This gets called only when checksumming is enabled */
552 void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type
*triggers
,
553 struct buffer_head
*bh
,
554 void *data
, size_t size
)
556 struct super_block
*sb
= EXT4_TRIGGER(triggers
)->sb
;
558 int inodes_per_ob
= ext4_inodes_per_orphan_block(sb
);
559 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
560 struct ext4_orphan_block_tail
*ot
;
561 __le64 dsk_block_nr
= cpu_to_le64(bh
->b_blocknr
);
563 csum
= ext4_chksum(EXT4_SB(sb
), oi
->of_csum_seed
,
564 (__u8
*)&dsk_block_nr
, sizeof(dsk_block_nr
));
565 csum
= ext4_chksum(EXT4_SB(sb
), csum
, (__u8
*)data
,
566 inodes_per_ob
* sizeof(__u32
));
567 ot
= ext4_orphan_block_tail(sb
, bh
);
568 ot
->ob_checksum
= cpu_to_le32(csum
);
571 int ext4_init_orphan_info(struct super_block
*sb
)
573 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
579 int inodes_per_ob
= ext4_inodes_per_orphan_block(sb
);
580 struct ext4_orphan_block_tail
*ot
;
581 ino_t orphan_ino
= le32_to_cpu(EXT4_SB(sb
)->s_es
->s_orphan_file_inum
);
583 if (!ext4_has_feature_orphan_file(sb
))
586 inode
= ext4_iget(sb
, orphan_ino
, EXT4_IGET_SPECIAL
);
588 ext4_msg(sb
, KERN_ERR
, "get orphan inode failed");
589 return PTR_ERR(inode
);
591 oi
->of_blocks
= inode
->i_size
>> sb
->s_blocksize_bits
;
592 oi
->of_csum_seed
= EXT4_I(inode
)->i_csum_seed
;
593 oi
->of_binfo
= kmalloc(oi
->of_blocks
*sizeof(struct ext4_orphan_block
),
599 for (i
= 0; i
< oi
->of_blocks
; i
++) {
600 oi
->of_binfo
[i
].ob_bh
= ext4_bread(NULL
, inode
, i
, 0);
601 if (IS_ERR(oi
->of_binfo
[i
].ob_bh
)) {
602 ret
= PTR_ERR(oi
->of_binfo
[i
].ob_bh
);
605 if (!oi
->of_binfo
[i
].ob_bh
) {
609 ot
= ext4_orphan_block_tail(sb
, oi
->of_binfo
[i
].ob_bh
);
610 if (le32_to_cpu(ot
->ob_magic
) != EXT4_ORPHAN_BLOCK_MAGIC
) {
611 ext4_error(sb
, "orphan file block %d: bad magic", i
);
615 if (!ext4_orphan_file_block_csum_verify(sb
,
616 oi
->of_binfo
[i
].ob_bh
)) {
617 ext4_error(sb
, "orphan file block %d: bad checksum", i
);
621 bdata
= (__le32
*)(oi
->of_binfo
[i
].ob_bh
->b_data
);
623 for (j
= 0; j
< inodes_per_ob
; j
++)
626 atomic_set(&oi
->of_binfo
[i
].ob_free_entries
, free
);
631 for (i
--; i
>= 0; i
--)
632 brelse(oi
->of_binfo
[i
].ob_bh
);
639 int ext4_orphan_file_empty(struct super_block
*sb
)
641 struct ext4_orphan_info
*oi
= &EXT4_SB(sb
)->s_orphan_info
;
643 int inodes_per_ob
= ext4_inodes_per_orphan_block(sb
);
645 if (!ext4_has_feature_orphan_file(sb
))
647 for (i
= 0; i
< oi
->of_blocks
; i
++)
648 if (atomic_read(&oi
->of_binfo
[i
].ob_free_entries
) !=