]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - fs/ext4/inode.c
ext2: Use clean_bdev_aliases() instead of iteration
[mirror_ubuntu-artful-kernel.git] / fs / ext4 / inode.c
CommitLineData
ac27a0ec 1/*
617ba13b 2 * linux/fs/ext4/inode.c
ac27a0ec
DK
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
ac27a0ec
DK
15 * 64-bit file support on 64-bit platforms by Jakub Jelinek
16 * (jj@sunsite.ms.mff.cuni.cz)
17 *
617ba13b 18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
ac27a0ec
DK
19 */
20
ac27a0ec
DK
21#include <linux/fs.h>
22#include <linux/time.h>
ac27a0ec
DK
23#include <linux/highuid.h>
24#include <linux/pagemap.h>
c94c2acf 25#include <linux/dax.h>
ac27a0ec
DK
26#include <linux/quotaops.h>
27#include <linux/string.h>
28#include <linux/buffer_head.h>
29#include <linux/writeback.h>
64769240 30#include <linux/pagevec.h>
ac27a0ec 31#include <linux/mpage.h>
e83c1397 32#include <linux/namei.h>
ac27a0ec
DK
33#include <linux/uio.h>
34#include <linux/bio.h>
4c0425ff 35#include <linux/workqueue.h>
744692dc 36#include <linux/kernel.h>
6db26ffc 37#include <linux/printk.h>
5a0e3ad6 38#include <linux/slab.h>
00a1a053 39#include <linux/bitops.h>
9bffad1e 40
3dcf5451 41#include "ext4_jbd2.h"
ac27a0ec
DK
42#include "xattr.h"
43#include "acl.h"
9f125d64 44#include "truncate.h"
ac27a0ec 45
9bffad1e
TT
46#include <trace/events/ext4.h>
47
a1d6cc56
AK
48#define MPAGE_DA_EXTENT_TAIL 0x01
49
814525f4
DW
50static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
51 struct ext4_inode_info *ei)
52{
53 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
814525f4 54 __u32 csum;
b47820ed
DJ
55 __u16 dummy_csum = 0;
56 int offset = offsetof(struct ext4_inode, i_checksum_lo);
57 unsigned int csum_size = sizeof(dummy_csum);
814525f4 58
b47820ed
DJ
59 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
60 csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
61 offset += csum_size;
62 csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
63 EXT4_GOOD_OLD_INODE_SIZE - offset);
814525f4 64
b47820ed
DJ
65 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
66 offset = offsetof(struct ext4_inode, i_checksum_hi);
67 csum = ext4_chksum(sbi, csum, (__u8 *)raw +
68 EXT4_GOOD_OLD_INODE_SIZE,
69 offset - EXT4_GOOD_OLD_INODE_SIZE);
70 if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
71 csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
72 csum_size);
73 offset += csum_size;
74 csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
75 EXT4_INODE_SIZE(inode->i_sb) -
76 offset);
77 }
814525f4
DW
78 }
79
814525f4
DW
80 return csum;
81}
82
83static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
84 struct ext4_inode_info *ei)
85{
86 __u32 provided, calculated;
87
88 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
89 cpu_to_le32(EXT4_OS_LINUX) ||
9aa5d32b 90 !ext4_has_metadata_csum(inode->i_sb))
814525f4
DW
91 return 1;
92
93 provided = le16_to_cpu(raw->i_checksum_lo);
94 calculated = ext4_inode_csum(inode, raw, ei);
95 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
96 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
97 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
98 else
99 calculated &= 0xFFFF;
100
101 return provided == calculated;
102}
103
104static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
105 struct ext4_inode_info *ei)
106{
107 __u32 csum;
108
109 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
110 cpu_to_le32(EXT4_OS_LINUX) ||
9aa5d32b 111 !ext4_has_metadata_csum(inode->i_sb))
814525f4
DW
112 return;
113
114 csum = ext4_inode_csum(inode, raw, ei);
115 raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
116 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
117 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
118 raw->i_checksum_hi = cpu_to_le16(csum >> 16);
119}
120
678aaf48
JK
121static inline int ext4_begin_ordered_truncate(struct inode *inode,
122 loff_t new_size)
123{
7ff9c073 124 trace_ext4_begin_ordered_truncate(inode, new_size);
8aefcd55
TT
125 /*
126 * If jinode is zero, then we never opened the file for
127 * writing, so there's no need to call
128 * jbd2_journal_begin_ordered_truncate() since there's no
129 * outstanding writes we need to flush.
130 */
131 if (!EXT4_I(inode)->jinode)
132 return 0;
133 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
134 EXT4_I(inode)->jinode,
135 new_size);
678aaf48
JK
136}
137
d47992f8
LC
138static void ext4_invalidatepage(struct page *page, unsigned int offset,
139 unsigned int length);
cb20d518
TT
140static int __ext4_journalled_writepage(struct page *page, unsigned int len);
141static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
fffb2739
JK
142static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
143 int pextents);
64769240 144
ac27a0ec
DK
145/*
146 * Test whether an inode is a fast symlink.
147 */
f348c252 148int ext4_inode_is_fast_symlink(struct inode *inode)
ac27a0ec 149{
65eddb56
YY
150 int ea_blocks = EXT4_I(inode)->i_file_acl ?
151 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
ac27a0ec 152
bd9db175
ZL
153 if (ext4_has_inline_data(inode))
154 return 0;
155
ac27a0ec
DK
156 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
157}
158
ac27a0ec
DK
159/*
160 * Restart the transaction associated with *handle. This does a commit,
161 * so before we call here everything must be consistently dirtied against
162 * this transaction.
163 */
fa5d1113 164int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
487caeef 165 int nblocks)
ac27a0ec 166{
487caeef
JK
167 int ret;
168
169 /*
e35fd660 170 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
487caeef
JK
171 * moment, get_block can be called only for blocks inside i_size since
172 * page cache has been already dropped and writes are blocked by
173 * i_mutex. So we can safely drop the i_data_sem here.
174 */
0390131b 175 BUG_ON(EXT4_JOURNAL(inode) == NULL);
ac27a0ec 176 jbd_debug(2, "restarting handle %p\n", handle);
487caeef 177 up_write(&EXT4_I(inode)->i_data_sem);
8e8eaabe 178 ret = ext4_journal_restart(handle, nblocks);
487caeef 179 down_write(&EXT4_I(inode)->i_data_sem);
fa5d1113 180 ext4_discard_preallocations(inode);
487caeef
JK
181
182 return ret;
ac27a0ec
DK
183}
184
185/*
186 * Called at the last iput() if i_nlink is zero.
187 */
0930fcc1 188void ext4_evict_inode(struct inode *inode)
ac27a0ec
DK
189{
190 handle_t *handle;
bc965ab3 191 int err;
ac27a0ec 192
7ff9c073 193 trace_ext4_evict_inode(inode);
2581fdc8 194
0930fcc1 195 if (inode->i_nlink) {
2d859db3
JK
196 /*
197 * When journalling data dirty buffers are tracked only in the
198 * journal. So although mm thinks everything is clean and
199 * ready for reaping the inode might still have some pages to
200 * write in the running transaction or waiting to be
201 * checkpointed. Thus calling jbd2_journal_invalidatepage()
202 * (via truncate_inode_pages()) to discard these buffers can
203 * cause data loss. Also even if we did not discard these
204 * buffers, we would have no way to find them after the inode
205 * is reaped and thus user could see stale data if he tries to
206 * read them before the transaction is checkpointed. So be
207 * careful and force everything to disk here... We use
208 * ei->i_datasync_tid to store the newest transaction
209 * containing inode's data.
210 *
211 * Note that directories do not have this problem because they
212 * don't use page cache.
213 */
6a7fd522
VN
214 if (inode->i_ino != EXT4_JOURNAL_INO &&
215 ext4_should_journal_data(inode) &&
216 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
2d859db3
JK
217 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
218 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
219
d76a3a77 220 jbd2_complete_transaction(journal, commit_tid);
2d859db3
JK
221 filemap_write_and_wait(&inode->i_data);
222 }
91b0abe3 223 truncate_inode_pages_final(&inode->i_data);
5dc23bdd 224
0930fcc1
AV
225 goto no_delete;
226 }
227
e2bfb088
TT
228 if (is_bad_inode(inode))
229 goto no_delete;
230 dquot_initialize(inode);
907f4554 231
678aaf48
JK
232 if (ext4_should_order_data(inode))
233 ext4_begin_ordered_truncate(inode, 0);
91b0abe3 234 truncate_inode_pages_final(&inode->i_data);
ac27a0ec 235
8e8ad8a5
JK
236 /*
237 * Protect us against freezing - iput() caller didn't have to have any
238 * protection against it
239 */
240 sb_start_intwrite(inode->i_sb);
9924a92a
TT
241 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
242 ext4_blocks_for_truncate(inode)+3);
ac27a0ec 243 if (IS_ERR(handle)) {
bc965ab3 244 ext4_std_error(inode->i_sb, PTR_ERR(handle));
ac27a0ec
DK
245 /*
246 * If we're going to skip the normal cleanup, we still need to
247 * make sure that the in-core orphan linked list is properly
248 * cleaned up.
249 */
617ba13b 250 ext4_orphan_del(NULL, inode);
8e8ad8a5 251 sb_end_intwrite(inode->i_sb);
ac27a0ec
DK
252 goto no_delete;
253 }
254
255 if (IS_SYNC(inode))
0390131b 256 ext4_handle_sync(handle);
ac27a0ec 257 inode->i_size = 0;
bc965ab3
TT
258 err = ext4_mark_inode_dirty(handle, inode);
259 if (err) {
12062ddd 260 ext4_warning(inode->i_sb,
bc965ab3
TT
261 "couldn't mark inode dirty (err %d)", err);
262 goto stop_handle;
263 }
ac27a0ec 264 if (inode->i_blocks)
617ba13b 265 ext4_truncate(inode);
bc965ab3
TT
266
267 /*
268 * ext4_ext_truncate() doesn't reserve any slop when it
269 * restarts journal transactions; therefore there may not be
270 * enough credits left in the handle to remove the inode from
271 * the orphan list and set the dtime field.
272 */
0390131b 273 if (!ext4_handle_has_enough_credits(handle, 3)) {
bc965ab3
TT
274 err = ext4_journal_extend(handle, 3);
275 if (err > 0)
276 err = ext4_journal_restart(handle, 3);
277 if (err != 0) {
12062ddd 278 ext4_warning(inode->i_sb,
bc965ab3
TT
279 "couldn't extend journal (err %d)", err);
280 stop_handle:
281 ext4_journal_stop(handle);
45388219 282 ext4_orphan_del(NULL, inode);
8e8ad8a5 283 sb_end_intwrite(inode->i_sb);
bc965ab3
TT
284 goto no_delete;
285 }
286 }
287
ac27a0ec 288 /*
617ba13b 289 * Kill off the orphan record which ext4_truncate created.
ac27a0ec 290 * AKPM: I think this can be inside the above `if'.
617ba13b 291 * Note that ext4_orphan_del() has to be able to cope with the
ac27a0ec 292 * deletion of a non-existent orphan - this is because we don't
617ba13b 293 * know if ext4_truncate() actually created an orphan record.
ac27a0ec
DK
294 * (Well, we could do this if we need to, but heck - it works)
295 */
617ba13b
MC
296 ext4_orphan_del(handle, inode);
297 EXT4_I(inode)->i_dtime = get_seconds();
ac27a0ec
DK
298
299 /*
300 * One subtle ordering requirement: if anything has gone wrong
301 * (transaction abort, IO errors, whatever), then we can still
302 * do these next steps (the fs will already have been marked as
303 * having errors), but we can't free the inode if the mark_dirty
304 * fails.
305 */
617ba13b 306 if (ext4_mark_inode_dirty(handle, inode))
ac27a0ec 307 /* If that failed, just do the required in-core inode clear. */
0930fcc1 308 ext4_clear_inode(inode);
ac27a0ec 309 else
617ba13b
MC
310 ext4_free_inode(handle, inode);
311 ext4_journal_stop(handle);
8e8ad8a5 312 sb_end_intwrite(inode->i_sb);
ac27a0ec
DK
313 return;
314no_delete:
0930fcc1 315 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
ac27a0ec
DK
316}
317
a9e7f447
DM
318#ifdef CONFIG_QUOTA
319qsize_t *ext4_get_reserved_space(struct inode *inode)
60e58e0f 320{
a9e7f447 321 return &EXT4_I(inode)->i_reserved_quota;
60e58e0f 322}
a9e7f447 323#endif
9d0be502 324
0637c6f4
TT
325/*
326 * Called with i_data_sem down, which is important since we can call
327 * ext4_discard_preallocations() from here.
328 */
5f634d06
AK
329void ext4_da_update_reserve_space(struct inode *inode,
330 int used, int quota_claim)
12219aea
AK
331{
332 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0637c6f4 333 struct ext4_inode_info *ei = EXT4_I(inode);
0637c6f4
TT
334
335 spin_lock(&ei->i_block_reservation_lock);
d8990240 336 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
0637c6f4 337 if (unlikely(used > ei->i_reserved_data_blocks)) {
8de5c325 338 ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
1084f252 339 "with only %d reserved data blocks",
0637c6f4
TT
340 __func__, inode->i_ino, used,
341 ei->i_reserved_data_blocks);
342 WARN_ON(1);
343 used = ei->i_reserved_data_blocks;
344 }
12219aea 345
0637c6f4
TT
346 /* Update per-inode reservations */
347 ei->i_reserved_data_blocks -= used;
71d4f7d0 348 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
6bc6e63f 349
12219aea 350 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
60e58e0f 351
72b8ab9d
ES
352 /* Update quota subsystem for data blocks */
353 if (quota_claim)
7b415bf6 354 dquot_claim_block(inode, EXT4_C2B(sbi, used));
72b8ab9d 355 else {
5f634d06
AK
356 /*
357 * We did fallocate with an offset that is already delayed
358 * allocated. So on delayed allocated writeback we should
72b8ab9d 359 * not re-claim the quota for fallocated blocks.
5f634d06 360 */
7b415bf6 361 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
5f634d06 362 }
d6014301
AK
363
364 /*
365 * If we have done all the pending block allocations and if
366 * there aren't any writers on the inode, we can discard the
367 * inode's preallocations.
368 */
0637c6f4
TT
369 if ((ei->i_reserved_data_blocks == 0) &&
370 (atomic_read(&inode->i_writecount) == 0))
d6014301 371 ext4_discard_preallocations(inode);
12219aea
AK
372}
373
e29136f8 374static int __check_block_validity(struct inode *inode, const char *func,
c398eda0
TT
375 unsigned int line,
376 struct ext4_map_blocks *map)
6fd058f7 377{
24676da4
TT
378 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
379 map->m_len)) {
c398eda0
TT
380 ext4_error_inode(inode, func, line, map->m_pblk,
381 "lblock %lu mapped to illegal pblock "
382 "(length %d)", (unsigned long) map->m_lblk,
383 map->m_len);
6a797d27 384 return -EFSCORRUPTED;
6fd058f7
TT
385 }
386 return 0;
387}
388
53085fac
JK
389int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
390 ext4_lblk_t len)
391{
392 int ret;
393
394 if (ext4_encrypted_inode(inode))
a7550b30 395 return fscrypt_zeroout_range(inode, lblk, pblk, len);
53085fac
JK
396
397 ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
398 if (ret > 0)
399 ret = 0;
400
401 return ret;
402}
403
e29136f8 404#define check_block_validity(inode, map) \
c398eda0 405 __check_block_validity((inode), __func__, __LINE__, (map))
e29136f8 406
921f266b
DM
407#ifdef ES_AGGRESSIVE_TEST
408static void ext4_map_blocks_es_recheck(handle_t *handle,
409 struct inode *inode,
410 struct ext4_map_blocks *es_map,
411 struct ext4_map_blocks *map,
412 int flags)
413{
414 int retval;
415
416 map->m_flags = 0;
417 /*
418 * There is a race window that the result is not the same.
419 * e.g. xfstests #223 when dioread_nolock enables. The reason
420 * is that we lookup a block mapping in extent status tree with
421 * out taking i_data_sem. So at the time the unwritten extent
422 * could be converted.
423 */
2dcba478 424 down_read(&EXT4_I(inode)->i_data_sem);
921f266b
DM
425 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
426 retval = ext4_ext_map_blocks(handle, inode, map, flags &
427 EXT4_GET_BLOCKS_KEEP_SIZE);
428 } else {
429 retval = ext4_ind_map_blocks(handle, inode, map, flags &
430 EXT4_GET_BLOCKS_KEEP_SIZE);
431 }
2dcba478 432 up_read((&EXT4_I(inode)->i_data_sem));
921f266b
DM
433
434 /*
435 * We don't check m_len because extent will be collpased in status
436 * tree. So the m_len might not equal.
437 */
438 if (es_map->m_lblk != map->m_lblk ||
439 es_map->m_flags != map->m_flags ||
440 es_map->m_pblk != map->m_pblk) {
bdafe42a 441 printk("ES cache assertion failed for inode: %lu "
921f266b
DM
442 "es_cached ex [%d/%d/%llu/%x] != "
443 "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
444 inode->i_ino, es_map->m_lblk, es_map->m_len,
445 es_map->m_pblk, es_map->m_flags, map->m_lblk,
446 map->m_len, map->m_pblk, map->m_flags,
447 retval, flags);
448 }
449}
450#endif /* ES_AGGRESSIVE_TEST */
451
f5ab0d1f 452/*
e35fd660 453 * The ext4_map_blocks() function tries to look up the requested blocks,
2b2d6d01 454 * and returns if the blocks are already mapped.
f5ab0d1f 455 *
f5ab0d1f
MC
456 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
457 * and store the allocated blocks in the result buffer head and mark it
458 * mapped.
459 *
e35fd660
TT
460 * If file type is extents based, it will call ext4_ext_map_blocks(),
461 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
f5ab0d1f
MC
462 * based files
463 *
facab4d9
JK
464 * On success, it returns the number of blocks being mapped or allocated. if
465 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
466 * is marked as unwritten. If the create == 1, it will mark @map as mapped.
f5ab0d1f
MC
467 *
468 * It returns 0 if plain look up failed (blocks have not been allocated), in
facab4d9
JK
469 * that case, @map is returned as unmapped but we still do fill map->m_len to
470 * indicate the length of a hole starting at map->m_lblk.
f5ab0d1f
MC
471 *
472 * It returns the error in case of allocation failure.
473 */
e35fd660
TT
474int ext4_map_blocks(handle_t *handle, struct inode *inode,
475 struct ext4_map_blocks *map, int flags)
0e855ac8 476{
d100eef2 477 struct extent_status es;
0e855ac8 478 int retval;
b8a86845 479 int ret = 0;
921f266b
DM
480#ifdef ES_AGGRESSIVE_TEST
481 struct ext4_map_blocks orig_map;
482
483 memcpy(&orig_map, map, sizeof(*map));
484#endif
f5ab0d1f 485
e35fd660
TT
486 map->m_flags = 0;
487 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
488 "logical block %lu\n", inode->i_ino, flags, map->m_len,
489 (unsigned long) map->m_lblk);
d100eef2 490
e861b5e9
TT
491 /*
492 * ext4_map_blocks returns an int, and m_len is an unsigned int
493 */
494 if (unlikely(map->m_len > INT_MAX))
495 map->m_len = INT_MAX;
496
4adb6ab3
KM
497 /* We can handle the block number less than EXT_MAX_BLOCKS */
498 if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
6a797d27 499 return -EFSCORRUPTED;
4adb6ab3 500
d100eef2
ZL
501 /* Lookup extent status tree firstly */
502 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
503 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
504 map->m_pblk = ext4_es_pblock(&es) +
505 map->m_lblk - es.es_lblk;
506 map->m_flags |= ext4_es_is_written(&es) ?
507 EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
508 retval = es.es_len - (map->m_lblk - es.es_lblk);
509 if (retval > map->m_len)
510 retval = map->m_len;
511 map->m_len = retval;
512 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
facab4d9
JK
513 map->m_pblk = 0;
514 retval = es.es_len - (map->m_lblk - es.es_lblk);
515 if (retval > map->m_len)
516 retval = map->m_len;
517 map->m_len = retval;
d100eef2
ZL
518 retval = 0;
519 } else {
520 BUG_ON(1);
521 }
921f266b
DM
522#ifdef ES_AGGRESSIVE_TEST
523 ext4_map_blocks_es_recheck(handle, inode, map,
524 &orig_map, flags);
525#endif
d100eef2
ZL
526 goto found;
527 }
528
4df3d265 529 /*
b920c755
TT
530 * Try to see if we can get the block without requesting a new
531 * file system block.
4df3d265 532 */
2dcba478 533 down_read(&EXT4_I(inode)->i_data_sem);
12e9b892 534 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
a4e5d88b
DM
535 retval = ext4_ext_map_blocks(handle, inode, map, flags &
536 EXT4_GET_BLOCKS_KEEP_SIZE);
0e855ac8 537 } else {
a4e5d88b
DM
538 retval = ext4_ind_map_blocks(handle, inode, map, flags &
539 EXT4_GET_BLOCKS_KEEP_SIZE);
0e855ac8 540 }
f7fec032 541 if (retval > 0) {
3be78c73 542 unsigned int status;
f7fec032 543
44fb851d
ZL
544 if (unlikely(retval != map->m_len)) {
545 ext4_warning(inode->i_sb,
546 "ES len assertion failed for inode "
547 "%lu: retval %d != map->m_len %d",
548 inode->i_ino, retval, map->m_len);
549 WARN_ON(1);
921f266b 550 }
921f266b 551
f7fec032
ZL
552 status = map->m_flags & EXT4_MAP_UNWRITTEN ?
553 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
554 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
d2dc317d 555 !(status & EXTENT_STATUS_WRITTEN) &&
f7fec032
ZL
556 ext4_find_delalloc_range(inode, map->m_lblk,
557 map->m_lblk + map->m_len - 1))
558 status |= EXTENT_STATUS_DELAYED;
559 ret = ext4_es_insert_extent(inode, map->m_lblk,
560 map->m_len, map->m_pblk, status);
561 if (ret < 0)
562 retval = ret;
563 }
2dcba478 564 up_read((&EXT4_I(inode)->i_data_sem));
f5ab0d1f 565
d100eef2 566found:
e35fd660 567 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
b8a86845 568 ret = check_block_validity(inode, map);
6fd058f7
TT
569 if (ret != 0)
570 return ret;
571 }
572
f5ab0d1f 573 /* If it is only a block(s) look up */
c2177057 574 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
f5ab0d1f
MC
575 return retval;
576
577 /*
578 * Returns if the blocks have already allocated
579 *
580 * Note that if blocks have been preallocated
df3ab170 581 * ext4_ext_get_block() returns the create = 0
f5ab0d1f
MC
582 * with buffer head unmapped.
583 */
e35fd660 584 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
b8a86845
LC
585 /*
586 * If we need to convert extent to unwritten
587 * we continue and do the actual work in
588 * ext4_ext_map_blocks()
589 */
590 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
591 return retval;
4df3d265 592
2a8964d6 593 /*
a25a4e1a
ZL
594 * Here we clear m_flags because after allocating an new extent,
595 * it will be set again.
2a8964d6 596 */
a25a4e1a 597 map->m_flags &= ~EXT4_MAP_FLAGS;
2a8964d6 598
4df3d265 599 /*
556615dc 600 * New blocks allocate and/or writing to unwritten extent
f5ab0d1f 601 * will possibly result in updating i_data, so we take
d91bd2c1 602 * the write lock of i_data_sem, and call get_block()
f5ab0d1f 603 * with create == 1 flag.
4df3d265 604 */
c8b459f4 605 down_write(&EXT4_I(inode)->i_data_sem);
d2a17637 606
4df3d265
AK
607 /*
608 * We need to check for EXT4 here because migrate
609 * could have changed the inode type in between
610 */
12e9b892 611 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
e35fd660 612 retval = ext4_ext_map_blocks(handle, inode, map, flags);
0e855ac8 613 } else {
e35fd660 614 retval = ext4_ind_map_blocks(handle, inode, map, flags);
267e4db9 615
e35fd660 616 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
267e4db9
AK
617 /*
618 * We allocated new blocks which will result in
619 * i_data's format changing. Force the migrate
620 * to fail by clearing migrate flags
621 */
19f5fb7a 622 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
267e4db9 623 }
d2a17637 624
5f634d06
AK
625 /*
626 * Update reserved blocks/metadata blocks after successful
627 * block allocation which had been deferred till now. We don't
628 * support fallocate for non extent files. So we can update
629 * reserve space here.
630 */
631 if ((retval > 0) &&
1296cc85 632 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
5f634d06
AK
633 ext4_da_update_reserve_space(inode, retval, 1);
634 }
2ac3b6e0 635
f7fec032 636 if (retval > 0) {
3be78c73 637 unsigned int status;
f7fec032 638
44fb851d
ZL
639 if (unlikely(retval != map->m_len)) {
640 ext4_warning(inode->i_sb,
641 "ES len assertion failed for inode "
642 "%lu: retval %d != map->m_len %d",
643 inode->i_ino, retval, map->m_len);
644 WARN_ON(1);
921f266b 645 }
921f266b 646
c86d8db3
JK
647 /*
648 * We have to zeroout blocks before inserting them into extent
649 * status tree. Otherwise someone could look them up there and
9b623df6
JK
650 * use them before they are really zeroed. We also have to
651 * unmap metadata before zeroing as otherwise writeback can
652 * overwrite zeros with stale data from block device.
c86d8db3
JK
653 */
654 if (flags & EXT4_GET_BLOCKS_ZERO &&
655 map->m_flags & EXT4_MAP_MAPPED &&
656 map->m_flags & EXT4_MAP_NEW) {
64e1c57f
JK
657 clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
658 map->m_len);
c86d8db3
JK
659 ret = ext4_issue_zeroout(inode, map->m_lblk,
660 map->m_pblk, map->m_len);
661 if (ret) {
662 retval = ret;
663 goto out_sem;
664 }
665 }
666
adb23551
ZL
667 /*
668 * If the extent has been zeroed out, we don't need to update
669 * extent status tree.
670 */
671 if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
672 ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
673 if (ext4_es_is_written(&es))
c86d8db3 674 goto out_sem;
adb23551 675 }
f7fec032
ZL
676 status = map->m_flags & EXT4_MAP_UNWRITTEN ?
677 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
678 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
d2dc317d 679 !(status & EXTENT_STATUS_WRITTEN) &&
f7fec032
ZL
680 ext4_find_delalloc_range(inode, map->m_lblk,
681 map->m_lblk + map->m_len - 1))
682 status |= EXTENT_STATUS_DELAYED;
683 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
684 map->m_pblk, status);
c86d8db3 685 if (ret < 0) {
f7fec032 686 retval = ret;
c86d8db3
JK
687 goto out_sem;
688 }
5356f261
AK
689 }
690
c86d8db3 691out_sem:
4df3d265 692 up_write((&EXT4_I(inode)->i_data_sem));
e35fd660 693 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
b8a86845 694 ret = check_block_validity(inode, map);
6fd058f7
TT
695 if (ret != 0)
696 return ret;
06bd3c36
JK
697
698 /*
699 * Inodes with freshly allocated blocks where contents will be
700 * visible after transaction commit must be on transaction's
701 * ordered data list.
702 */
703 if (map->m_flags & EXT4_MAP_NEW &&
704 !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
705 !(flags & EXT4_GET_BLOCKS_ZERO) &&
706 !IS_NOQUOTA(inode) &&
707 ext4_should_order_data(inode)) {
ee0876bc
JK
708 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
709 ret = ext4_jbd2_inode_add_wait(handle, inode);
710 else
711 ret = ext4_jbd2_inode_add_write(handle, inode);
06bd3c36
JK
712 if (ret)
713 return ret;
714 }
6fd058f7 715 }
0e855ac8
AK
716 return retval;
717}
718
ed8ad838
JK
719/*
720 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
721 * we have to be careful as someone else may be manipulating b_state as well.
722 */
723static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
724{
725 unsigned long old_state;
726 unsigned long new_state;
727
728 flags &= EXT4_MAP_FLAGS;
729
730 /* Dummy buffer_head? Set non-atomically. */
731 if (!bh->b_page) {
732 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
733 return;
734 }
735 /*
736 * Someone else may be modifying b_state. Be careful! This is ugly but
737 * once we get rid of using bh as a container for mapping information
738 * to pass to / from get_block functions, this can go away.
739 */
740 do {
741 old_state = READ_ONCE(bh->b_state);
742 new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
743 } while (unlikely(
744 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
745}
746
2ed88685
TT
747static int _ext4_get_block(struct inode *inode, sector_t iblock,
748 struct buffer_head *bh, int flags)
ac27a0ec 749{
2ed88685 750 struct ext4_map_blocks map;
efe70c29 751 int ret = 0;
ac27a0ec 752
46c7f254
TM
753 if (ext4_has_inline_data(inode))
754 return -ERANGE;
755
2ed88685
TT
756 map.m_lblk = iblock;
757 map.m_len = bh->b_size >> inode->i_blkbits;
758
efe70c29
JK
759 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
760 flags);
7fb5409d 761 if (ret > 0) {
2ed88685 762 map_bh(bh, inode->i_sb, map.m_pblk);
ed8ad838 763 ext4_update_bh_state(bh, map.m_flags);
2ed88685 764 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
7fb5409d 765 ret = 0;
ac27a0ec
DK
766 }
767 return ret;
768}
769
2ed88685
TT
770int ext4_get_block(struct inode *inode, sector_t iblock,
771 struct buffer_head *bh, int create)
772{
773 return _ext4_get_block(inode, iblock, bh,
774 create ? EXT4_GET_BLOCKS_CREATE : 0);
775}
776
705965bd
JK
777/*
778 * Get block function used when preparing for buffered write if we require
779 * creating an unwritten extent if blocks haven't been allocated. The extent
780 * will be converted to written after the IO is complete.
781 */
782int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
783 struct buffer_head *bh_result, int create)
784{
785 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
786 inode->i_ino, create);
787 return _ext4_get_block(inode, iblock, bh_result,
788 EXT4_GET_BLOCKS_IO_CREATE_EXT);
789}
790
efe70c29
JK
791/* Maximum number of blocks we map for direct IO at once. */
792#define DIO_MAX_BLOCKS 4096
793
e84dfbe2
JK
794/*
795 * Get blocks function for the cases that need to start a transaction -
796 * generally difference cases of direct IO and DAX IO. It also handles retries
797 * in case of ENOSPC.
798 */
799static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
800 struct buffer_head *bh_result, int flags)
efe70c29
JK
801{
802 int dio_credits;
e84dfbe2
JK
803 handle_t *handle;
804 int retries = 0;
805 int ret;
efe70c29
JK
806
807 /* Trim mapping request to maximum we can map at once for DIO */
808 if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
809 bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
810 dio_credits = ext4_chunk_trans_blocks(inode,
811 bh_result->b_size >> inode->i_blkbits);
e84dfbe2
JK
812retry:
813 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
814 if (IS_ERR(handle))
815 return PTR_ERR(handle);
816
817 ret = _ext4_get_block(inode, iblock, bh_result, flags);
818 ext4_journal_stop(handle);
819
820 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
821 goto retry;
822 return ret;
efe70c29
JK
823}
824
705965bd
JK
825/* Get block function for DIO reads and writes to inodes without extents */
826int ext4_dio_get_block(struct inode *inode, sector_t iblock,
827 struct buffer_head *bh, int create)
828{
efe70c29
JK
829 /* We don't expect handle for direct IO */
830 WARN_ON_ONCE(ext4_journal_current_handle());
831
e84dfbe2
JK
832 if (!create)
833 return _ext4_get_block(inode, iblock, bh, 0);
834 return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
705965bd
JK
835}
836
837/*
109811c2 838 * Get block function for AIO DIO writes when we create unwritten extent if
705965bd
JK
839 * blocks are not allocated yet. The extent will be converted to written
840 * after IO is complete.
841 */
109811c2
JK
842static int ext4_dio_get_block_unwritten_async(struct inode *inode,
843 sector_t iblock, struct buffer_head *bh_result, int create)
705965bd 844{
efe70c29
JK
845 int ret;
846
efe70c29
JK
847 /* We don't expect handle for direct IO */
848 WARN_ON_ONCE(ext4_journal_current_handle());
849
e84dfbe2
JK
850 ret = ext4_get_block_trans(inode, iblock, bh_result,
851 EXT4_GET_BLOCKS_IO_CREATE_EXT);
efe70c29 852
109811c2
JK
853 /*
854 * When doing DIO using unwritten extents, we need io_end to convert
855 * unwritten extents to written on IO completion. We allocate io_end
856 * once we spot unwritten extent and store it in b_private. Generic
857 * DIO code keeps b_private set and furthermore passes the value to
858 * our completion callback in 'private' argument.
859 */
860 if (!ret && buffer_unwritten(bh_result)) {
861 if (!bh_result->b_private) {
862 ext4_io_end_t *io_end;
863
864 io_end = ext4_init_io_end(inode, GFP_KERNEL);
865 if (!io_end)
866 return -ENOMEM;
867 bh_result->b_private = io_end;
868 ext4_set_io_unwritten_flag(inode, io_end);
869 }
efe70c29 870 set_buffer_defer_completion(bh_result);
efe70c29
JK
871 }
872
873 return ret;
705965bd
JK
874}
875
109811c2
JK
876/*
877 * Get block function for non-AIO DIO writes when we create unwritten extent if
878 * blocks are not allocated yet. The extent will be converted to written
879 * after IO is complete from ext4_ext_direct_IO() function.
880 */
881static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
882 sector_t iblock, struct buffer_head *bh_result, int create)
883{
109811c2
JK
884 int ret;
885
886 /* We don't expect handle for direct IO */
887 WARN_ON_ONCE(ext4_journal_current_handle());
888
e84dfbe2
JK
889 ret = ext4_get_block_trans(inode, iblock, bh_result,
890 EXT4_GET_BLOCKS_IO_CREATE_EXT);
109811c2
JK
891
892 /*
893 * Mark inode as having pending DIO writes to unwritten extents.
894 * ext4_ext_direct_IO() checks this flag and converts extents to
895 * written.
896 */
897 if (!ret && buffer_unwritten(bh_result))
898 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
899
900 return ret;
901}
902
705965bd
JK
903static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
904 struct buffer_head *bh_result, int create)
905{
906 int ret;
907
908 ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
909 inode->i_ino, create);
efe70c29
JK
910 /* We don't expect handle for direct IO */
911 WARN_ON_ONCE(ext4_journal_current_handle());
912
705965bd
JK
913 ret = _ext4_get_block(inode, iblock, bh_result, 0);
914 /*
915 * Blocks should have been preallocated! ext4_file_write_iter() checks
916 * that.
917 */
efe70c29 918 WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
705965bd
JK
919
920 return ret;
921}
922
923
ac27a0ec
DK
924/*
925 * `handle' can be NULL if create is zero
926 */
617ba13b 927struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
c5e298ae 928 ext4_lblk_t block, int map_flags)
ac27a0ec 929{
2ed88685
TT
930 struct ext4_map_blocks map;
931 struct buffer_head *bh;
c5e298ae 932 int create = map_flags & EXT4_GET_BLOCKS_CREATE;
10560082 933 int err;
ac27a0ec
DK
934
935 J_ASSERT(handle != NULL || create == 0);
936
2ed88685
TT
937 map.m_lblk = block;
938 map.m_len = 1;
c5e298ae 939 err = ext4_map_blocks(handle, inode, &map, map_flags);
ac27a0ec 940
10560082
TT
941 if (err == 0)
942 return create ? ERR_PTR(-ENOSPC) : NULL;
2ed88685 943 if (err < 0)
10560082 944 return ERR_PTR(err);
2ed88685
TT
945
946 bh = sb_getblk(inode->i_sb, map.m_pblk);
10560082
TT
947 if (unlikely(!bh))
948 return ERR_PTR(-ENOMEM);
2ed88685
TT
949 if (map.m_flags & EXT4_MAP_NEW) {
950 J_ASSERT(create != 0);
951 J_ASSERT(handle != NULL);
ac27a0ec 952
2ed88685
TT
953 /*
954 * Now that we do not always journal data, we should
955 * keep in mind whether this should always journal the
956 * new buffer as metadata. For now, regular file
957 * writes use ext4_get_block instead, so it's not a
958 * problem.
959 */
960 lock_buffer(bh);
961 BUFFER_TRACE(bh, "call get_create_access");
10560082
TT
962 err = ext4_journal_get_create_access(handle, bh);
963 if (unlikely(err)) {
964 unlock_buffer(bh);
965 goto errout;
966 }
967 if (!buffer_uptodate(bh)) {
2ed88685
TT
968 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
969 set_buffer_uptodate(bh);
ac27a0ec 970 }
2ed88685
TT
971 unlock_buffer(bh);
972 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
973 err = ext4_handle_dirty_metadata(handle, inode, bh);
10560082
TT
974 if (unlikely(err))
975 goto errout;
976 } else
2ed88685 977 BUFFER_TRACE(bh, "not a new buffer");
2ed88685 978 return bh;
10560082
TT
979errout:
980 brelse(bh);
981 return ERR_PTR(err);
ac27a0ec
DK
982}
983
617ba13b 984struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
c5e298ae 985 ext4_lblk_t block, int map_flags)
ac27a0ec 986{
af5bc92d 987 struct buffer_head *bh;
ac27a0ec 988
c5e298ae 989 bh = ext4_getblk(handle, inode, block, map_flags);
1c215028 990 if (IS_ERR(bh))
ac27a0ec 991 return bh;
1c215028 992 if (!bh || buffer_uptodate(bh))
ac27a0ec 993 return bh;
dfec8a14 994 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
ac27a0ec
DK
995 wait_on_buffer(bh);
996 if (buffer_uptodate(bh))
997 return bh;
998 put_bh(bh);
1c215028 999 return ERR_PTR(-EIO);
ac27a0ec
DK
1000}
1001
f19d5870
TM
1002int ext4_walk_page_buffers(handle_t *handle,
1003 struct buffer_head *head,
1004 unsigned from,
1005 unsigned to,
1006 int *partial,
1007 int (*fn)(handle_t *handle,
1008 struct buffer_head *bh))
ac27a0ec
DK
1009{
1010 struct buffer_head *bh;
1011 unsigned block_start, block_end;
1012 unsigned blocksize = head->b_size;
1013 int err, ret = 0;
1014 struct buffer_head *next;
1015
af5bc92d
TT
1016 for (bh = head, block_start = 0;
1017 ret == 0 && (bh != head || !block_start);
de9a55b8 1018 block_start = block_end, bh = next) {
ac27a0ec
DK
1019 next = bh->b_this_page;
1020 block_end = block_start + blocksize;
1021 if (block_end <= from || block_start >= to) {
1022 if (partial && !buffer_uptodate(bh))
1023 *partial = 1;
1024 continue;
1025 }
1026 err = (*fn)(handle, bh);
1027 if (!ret)
1028 ret = err;
1029 }
1030 return ret;
1031}
1032
1033/*
1034 * To preserve ordering, it is essential that the hole instantiation and
1035 * the data write be encapsulated in a single transaction. We cannot
617ba13b 1036 * close off a transaction and start a new one between the ext4_get_block()
dab291af 1037 * and the commit_write(). So doing the jbd2_journal_start at the start of
ac27a0ec
DK
1038 * prepare_write() is the right place.
1039 *
36ade451
JK
1040 * Also, this function can nest inside ext4_writepage(). In that case, we
1041 * *know* that ext4_writepage() has generated enough buffer credits to do the
1042 * whole page. So we won't block on the journal in that case, which is good,
1043 * because the caller may be PF_MEMALLOC.
ac27a0ec 1044 *
617ba13b 1045 * By accident, ext4 can be reentered when a transaction is open via
ac27a0ec
DK
1046 * quota file writes. If we were to commit the transaction while thus
1047 * reentered, there can be a deadlock - we would be holding a quota
1048 * lock, and the commit would never complete if another thread had a
1049 * transaction open and was blocking on the quota lock - a ranking
1050 * violation.
1051 *
dab291af 1052 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
ac27a0ec
DK
1053 * will _not_ run commit under these circumstances because handle->h_ref
1054 * is elevated. We'll still have enough credits for the tiny quotafile
1055 * write.
1056 */
f19d5870
TM
1057int do_journal_get_write_access(handle_t *handle,
1058 struct buffer_head *bh)
ac27a0ec 1059{
56d35a4c
JK
1060 int dirty = buffer_dirty(bh);
1061 int ret;
1062
ac27a0ec
DK
1063 if (!buffer_mapped(bh) || buffer_freed(bh))
1064 return 0;
56d35a4c 1065 /*
ebdec241 1066 * __block_write_begin() could have dirtied some buffers. Clean
56d35a4c
JK
1067 * the dirty bit as jbd2_journal_get_write_access() could complain
1068 * otherwise about fs integrity issues. Setting of the dirty bit
ebdec241 1069 * by __block_write_begin() isn't a real problem here as we clear
56d35a4c
JK
1070 * the bit before releasing a page lock and thus writeback cannot
1071 * ever write the buffer.
1072 */
1073 if (dirty)
1074 clear_buffer_dirty(bh);
5d601255 1075 BUFFER_TRACE(bh, "get write access");
56d35a4c
JK
1076 ret = ext4_journal_get_write_access(handle, bh);
1077 if (!ret && dirty)
1078 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1079 return ret;
ac27a0ec
DK
1080}
1081
2058f83a
MH
1082#ifdef CONFIG_EXT4_FS_ENCRYPTION
1083static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
1084 get_block_t *get_block)
1085{
09cbfeaf 1086 unsigned from = pos & (PAGE_SIZE - 1);
2058f83a
MH
1087 unsigned to = from + len;
1088 struct inode *inode = page->mapping->host;
1089 unsigned block_start, block_end;
1090 sector_t block;
1091 int err = 0;
1092 unsigned blocksize = inode->i_sb->s_blocksize;
1093 unsigned bbits;
1094 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1095 bool decrypt = false;
1096
1097 BUG_ON(!PageLocked(page));
09cbfeaf
KS
1098 BUG_ON(from > PAGE_SIZE);
1099 BUG_ON(to > PAGE_SIZE);
2058f83a
MH
1100 BUG_ON(from > to);
1101
1102 if (!page_has_buffers(page))
1103 create_empty_buffers(page, blocksize, 0);
1104 head = page_buffers(page);
1105 bbits = ilog2(blocksize);
09cbfeaf 1106 block = (sector_t)page->index << (PAGE_SHIFT - bbits);
2058f83a
MH
1107
1108 for (bh = head, block_start = 0; bh != head || !block_start;
1109 block++, block_start = block_end, bh = bh->b_this_page) {
1110 block_end = block_start + blocksize;
1111 if (block_end <= from || block_start >= to) {
1112 if (PageUptodate(page)) {
1113 if (!buffer_uptodate(bh))
1114 set_buffer_uptodate(bh);
1115 }
1116 continue;
1117 }
1118 if (buffer_new(bh))
1119 clear_buffer_new(bh);
1120 if (!buffer_mapped(bh)) {
1121 WARN_ON(bh->b_size != blocksize);
1122 err = get_block(inode, block, bh, 1);
1123 if (err)
1124 break;
1125 if (buffer_new(bh)) {
1126 unmap_underlying_metadata(bh->b_bdev,
1127 bh->b_blocknr);
1128 if (PageUptodate(page)) {
1129 clear_buffer_new(bh);
1130 set_buffer_uptodate(bh);
1131 mark_buffer_dirty(bh);
1132 continue;
1133 }
1134 if (block_end > to || block_start < from)
1135 zero_user_segments(page, to, block_end,
1136 block_start, from);
1137 continue;
1138 }
1139 }
1140 if (PageUptodate(page)) {
1141 if (!buffer_uptodate(bh))
1142 set_buffer_uptodate(bh);
1143 continue;
1144 }
1145 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1146 !buffer_unwritten(bh) &&
1147 (block_start < from || block_end > to)) {
dfec8a14 1148 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2058f83a
MH
1149 *wait_bh++ = bh;
1150 decrypt = ext4_encrypted_inode(inode) &&
1151 S_ISREG(inode->i_mode);
1152 }
1153 }
1154 /*
1155 * If we issued read requests, let them complete.
1156 */
1157 while (wait_bh > wait) {
1158 wait_on_buffer(*--wait_bh);
1159 if (!buffer_uptodate(*wait_bh))
1160 err = -EIO;
1161 }
1162 if (unlikely(err))
1163 page_zero_new_buffers(page, from, to);
1164 else if (decrypt)
a7550b30 1165 err = fscrypt_decrypt_page(page);
2058f83a
MH
1166 return err;
1167}
1168#endif
1169
bfc1af65 1170static int ext4_write_begin(struct file *file, struct address_space *mapping,
de9a55b8
TT
1171 loff_t pos, unsigned len, unsigned flags,
1172 struct page **pagep, void **fsdata)
ac27a0ec 1173{
af5bc92d 1174 struct inode *inode = mapping->host;
1938a150 1175 int ret, needed_blocks;
ac27a0ec
DK
1176 handle_t *handle;
1177 int retries = 0;
af5bc92d 1178 struct page *page;
de9a55b8 1179 pgoff_t index;
af5bc92d 1180 unsigned from, to;
bfc1af65 1181
9bffad1e 1182 trace_ext4_write_begin(inode, pos, len, flags);
1938a150
AK
1183 /*
1184 * Reserve one block more for addition to orphan list in case
1185 * we allocate blocks but write fails for some reason
1186 */
1187 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
09cbfeaf
KS
1188 index = pos >> PAGE_SHIFT;
1189 from = pos & (PAGE_SIZE - 1);
af5bc92d 1190 to = from + len;
ac27a0ec 1191
f19d5870
TM
1192 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
1193 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
1194 flags, pagep);
1195 if (ret < 0)
47564bfb
TT
1196 return ret;
1197 if (ret == 1)
1198 return 0;
f19d5870
TM
1199 }
1200
47564bfb
TT
1201 /*
1202 * grab_cache_page_write_begin() can take a long time if the
1203 * system is thrashing due to memory pressure, or if the page
1204 * is being written back. So grab it first before we start
1205 * the transaction handle. This also allows us to allocate
1206 * the page (if needed) without using GFP_NOFS.
1207 */
1208retry_grab:
1209 page = grab_cache_page_write_begin(mapping, index, flags);
1210 if (!page)
1211 return -ENOMEM;
1212 unlock_page(page);
1213
1214retry_journal:
9924a92a 1215 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
af5bc92d 1216 if (IS_ERR(handle)) {
09cbfeaf 1217 put_page(page);
47564bfb 1218 return PTR_ERR(handle);
7479d2b9 1219 }
ac27a0ec 1220
47564bfb
TT
1221 lock_page(page);
1222 if (page->mapping != mapping) {
1223 /* The page got truncated from under us */
1224 unlock_page(page);
09cbfeaf 1225 put_page(page);
cf108bca 1226 ext4_journal_stop(handle);
47564bfb 1227 goto retry_grab;
cf108bca 1228 }
7afe5aa5
DM
1229 /* In case writeback began while the page was unlocked */
1230 wait_for_stable_page(page);
cf108bca 1231
2058f83a
MH
1232#ifdef CONFIG_EXT4_FS_ENCRYPTION
1233 if (ext4_should_dioread_nolock(inode))
1234 ret = ext4_block_write_begin(page, pos, len,
705965bd 1235 ext4_get_block_unwritten);
2058f83a
MH
1236 else
1237 ret = ext4_block_write_begin(page, pos, len,
1238 ext4_get_block);
1239#else
744692dc 1240 if (ext4_should_dioread_nolock(inode))
705965bd
JK
1241 ret = __block_write_begin(page, pos, len,
1242 ext4_get_block_unwritten);
744692dc 1243 else
6e1db88d 1244 ret = __block_write_begin(page, pos, len, ext4_get_block);
2058f83a 1245#endif
bfc1af65 1246 if (!ret && ext4_should_journal_data(inode)) {
f19d5870
TM
1247 ret = ext4_walk_page_buffers(handle, page_buffers(page),
1248 from, to, NULL,
1249 do_journal_get_write_access);
ac27a0ec 1250 }
bfc1af65
NP
1251
1252 if (ret) {
af5bc92d 1253 unlock_page(page);
ae4d5372 1254 /*
6e1db88d 1255 * __block_write_begin may have instantiated a few blocks
ae4d5372
AK
1256 * outside i_size. Trim these off again. Don't need
1257 * i_size_read because we hold i_mutex.
1938a150
AK
1258 *
1259 * Add inode to orphan list in case we crash before
1260 * truncate finishes
ae4d5372 1261 */
ffacfa7a 1262 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1938a150
AK
1263 ext4_orphan_add(handle, inode);
1264
1265 ext4_journal_stop(handle);
1266 if (pos + len > inode->i_size) {
b9a4207d 1267 ext4_truncate_failed_write(inode);
de9a55b8 1268 /*
ffacfa7a 1269 * If truncate failed early the inode might
1938a150
AK
1270 * still be on the orphan list; we need to
1271 * make sure the inode is removed from the
1272 * orphan list in that case.
1273 */
1274 if (inode->i_nlink)
1275 ext4_orphan_del(NULL, inode);
1276 }
bfc1af65 1277
47564bfb
TT
1278 if (ret == -ENOSPC &&
1279 ext4_should_retry_alloc(inode->i_sb, &retries))
1280 goto retry_journal;
09cbfeaf 1281 put_page(page);
47564bfb
TT
1282 return ret;
1283 }
1284 *pagep = page;
ac27a0ec
DK
1285 return ret;
1286}
1287
bfc1af65
NP
1288/* For write_end() in data=journal mode */
1289static int write_end_fn(handle_t *handle, struct buffer_head *bh)
ac27a0ec 1290{
13fca323 1291 int ret;
ac27a0ec
DK
1292 if (!buffer_mapped(bh) || buffer_freed(bh))
1293 return 0;
1294 set_buffer_uptodate(bh);
13fca323
TT
1295 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1296 clear_buffer_meta(bh);
1297 clear_buffer_prio(bh);
1298 return ret;
ac27a0ec
DK
1299}
1300
eed4333f
ZL
1301/*
1302 * We need to pick up the new inode size which generic_commit_write gave us
1303 * `file' can be NULL - eg, when called from page_symlink().
1304 *
1305 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1306 * buffers are managed internally.
1307 */
1308static int ext4_write_end(struct file *file,
1309 struct address_space *mapping,
1310 loff_t pos, unsigned len, unsigned copied,
1311 struct page *page, void *fsdata)
f8514083 1312{
f8514083 1313 handle_t *handle = ext4_journal_current_handle();
eed4333f 1314 struct inode *inode = mapping->host;
0572639f 1315 loff_t old_size = inode->i_size;
eed4333f
ZL
1316 int ret = 0, ret2;
1317 int i_size_changed = 0;
1318
1319 trace_ext4_write_end(inode, pos, len, copied);
42c832de
TT
1320 if (ext4_has_inline_data(inode)) {
1321 ret = ext4_write_inline_data_end(inode, pos, len,
1322 copied, page);
1323 if (ret < 0)
1324 goto errout;
1325 copied = ret;
1326 } else
f19d5870
TM
1327 copied = block_write_end(file, mapping, pos,
1328 len, copied, page, fsdata);
f8514083 1329 /*
4631dbf6 1330 * it's important to update i_size while still holding page lock:
f8514083
AK
1331 * page writeout could otherwise come in and zero beyond i_size.
1332 */
4631dbf6 1333 i_size_changed = ext4_update_inode_size(inode, pos + copied);
f8514083 1334 unlock_page(page);
09cbfeaf 1335 put_page(page);
f8514083 1336
0572639f
XW
1337 if (old_size < pos)
1338 pagecache_isize_extended(inode, old_size, pos);
f8514083
AK
1339 /*
1340 * Don't mark the inode dirty under page lock. First, it unnecessarily
1341 * makes the holding time of page lock longer. Second, it forces lock
1342 * ordering of page lock and transaction start for journaling
1343 * filesystems.
1344 */
1345 if (i_size_changed)
1346 ext4_mark_inode_dirty(handle, inode);
1347
ffacfa7a 1348 if (pos + len > inode->i_size && ext4_can_truncate(inode))
f8514083
AK
1349 /* if we have allocated more blocks and copied
1350 * less. We will have blocks allocated outside
1351 * inode->i_size. So truncate them
1352 */
1353 ext4_orphan_add(handle, inode);
74d553aa 1354errout:
617ba13b 1355 ret2 = ext4_journal_stop(handle);
ac27a0ec
DK
1356 if (!ret)
1357 ret = ret2;
bfc1af65 1358
f8514083 1359 if (pos + len > inode->i_size) {
b9a4207d 1360 ext4_truncate_failed_write(inode);
de9a55b8 1361 /*
ffacfa7a 1362 * If truncate failed early the inode might still be
f8514083
AK
1363 * on the orphan list; we need to make sure the inode
1364 * is removed from the orphan list in that case.
1365 */
1366 if (inode->i_nlink)
1367 ext4_orphan_del(NULL, inode);
1368 }
1369
bfc1af65 1370 return ret ? ret : copied;
ac27a0ec
DK
1371}
1372
b90197b6
TT
1373/*
1374 * This is a private version of page_zero_new_buffers() which doesn't
1375 * set the buffer to be dirty, since in data=journalled mode we need
1376 * to call ext4_handle_dirty_metadata() instead.
1377 */
1378static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
1379{
1380 unsigned int block_start = 0, block_end;
1381 struct buffer_head *head, *bh;
1382
1383 bh = head = page_buffers(page);
1384 do {
1385 block_end = block_start + bh->b_size;
1386 if (buffer_new(bh)) {
1387 if (block_end > from && block_start < to) {
1388 if (!PageUptodate(page)) {
1389 unsigned start, size;
1390
1391 start = max(from, block_start);
1392 size = min(to, block_end) - start;
1393
1394 zero_user(page, start, size);
1395 set_buffer_uptodate(bh);
1396 }
1397 clear_buffer_new(bh);
1398 }
1399 }
1400 block_start = block_end;
1401 bh = bh->b_this_page;
1402 } while (bh != head);
1403}
1404
bfc1af65 1405static int ext4_journalled_write_end(struct file *file,
de9a55b8
TT
1406 struct address_space *mapping,
1407 loff_t pos, unsigned len, unsigned copied,
1408 struct page *page, void *fsdata)
ac27a0ec 1409{
617ba13b 1410 handle_t *handle = ext4_journal_current_handle();
bfc1af65 1411 struct inode *inode = mapping->host;
0572639f 1412 loff_t old_size = inode->i_size;
ac27a0ec
DK
1413 int ret = 0, ret2;
1414 int partial = 0;
bfc1af65 1415 unsigned from, to;
4631dbf6 1416 int size_changed = 0;
ac27a0ec 1417
9bffad1e 1418 trace_ext4_journalled_write_end(inode, pos, len, copied);
09cbfeaf 1419 from = pos & (PAGE_SIZE - 1);
bfc1af65
NP
1420 to = from + len;
1421
441c8508
CW
1422 BUG_ON(!ext4_handle_valid(handle));
1423
3fdcfb66
TM
1424 if (ext4_has_inline_data(inode))
1425 copied = ext4_write_inline_data_end(inode, pos, len,
1426 copied, page);
1427 else {
1428 if (copied < len) {
1429 if (!PageUptodate(page))
1430 copied = 0;
b90197b6 1431 zero_new_buffers(page, from+copied, to);
3fdcfb66 1432 }
ac27a0ec 1433
3fdcfb66
TM
1434 ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
1435 to, &partial, write_end_fn);
1436 if (!partial)
1437 SetPageUptodate(page);
1438 }
4631dbf6 1439 size_changed = ext4_update_inode_size(inode, pos + copied);
19f5fb7a 1440 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2d859db3 1441 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
4631dbf6 1442 unlock_page(page);
09cbfeaf 1443 put_page(page);
4631dbf6 1444
0572639f
XW
1445 if (old_size < pos)
1446 pagecache_isize_extended(inode, old_size, pos);
1447
4631dbf6 1448 if (size_changed) {
617ba13b 1449 ret2 = ext4_mark_inode_dirty(handle, inode);
ac27a0ec
DK
1450 if (!ret)
1451 ret = ret2;
1452 }
bfc1af65 1453
ffacfa7a 1454 if (pos + len > inode->i_size && ext4_can_truncate(inode))
f8514083
AK
1455 /* if we have allocated more blocks and copied
1456 * less. We will have blocks allocated outside
1457 * inode->i_size. So truncate them
1458 */
1459 ext4_orphan_add(handle, inode);
1460
617ba13b 1461 ret2 = ext4_journal_stop(handle);
ac27a0ec
DK
1462 if (!ret)
1463 ret = ret2;
f8514083 1464 if (pos + len > inode->i_size) {
b9a4207d 1465 ext4_truncate_failed_write(inode);
de9a55b8 1466 /*
ffacfa7a 1467 * If truncate failed early the inode might still be
f8514083
AK
1468 * on the orphan list; we need to make sure the inode
1469 * is removed from the orphan list in that case.
1470 */
1471 if (inode->i_nlink)
1472 ext4_orphan_del(NULL, inode);
1473 }
bfc1af65
NP
1474
1475 return ret ? ret : copied;
ac27a0ec 1476}
d2a17637 1477
9d0be502 1478/*
c27e43a1 1479 * Reserve space for a single cluster
9d0be502 1480 */
c27e43a1 1481static int ext4_da_reserve_space(struct inode *inode)
d2a17637 1482{
60e58e0f 1483 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0637c6f4 1484 struct ext4_inode_info *ei = EXT4_I(inode);
5dd4056d 1485 int ret;
03179fe9
TT
1486
1487 /*
1488 * We will charge metadata quota at writeout time; this saves
1489 * us from metadata over-estimation, though we may go over by
1490 * a small amount in the end. Here we just reserve for data.
1491 */
1492 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1493 if (ret)
1494 return ret;
d2a17637 1495
0637c6f4 1496 spin_lock(&ei->i_block_reservation_lock);
71d4f7d0 1497 if (ext4_claim_free_clusters(sbi, 1, 0)) {
03179fe9 1498 spin_unlock(&ei->i_block_reservation_lock);
03179fe9 1499 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
d2a17637
MC
1500 return -ENOSPC;
1501 }
9d0be502 1502 ei->i_reserved_data_blocks++;
c27e43a1 1503 trace_ext4_da_reserve_space(inode);
0637c6f4 1504 spin_unlock(&ei->i_block_reservation_lock);
39bc680a 1505
d2a17637
MC
1506 return 0; /* success */
1507}
1508
12219aea 1509static void ext4_da_release_space(struct inode *inode, int to_free)
d2a17637
MC
1510{
1511 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
0637c6f4 1512 struct ext4_inode_info *ei = EXT4_I(inode);
d2a17637 1513
cd213226
MC
1514 if (!to_free)
1515 return; /* Nothing to release, exit */
1516
d2a17637 1517 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cd213226 1518
5a58ec87 1519 trace_ext4_da_release_space(inode, to_free);
0637c6f4 1520 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
cd213226 1521 /*
0637c6f4
TT
1522 * if there aren't enough reserved blocks, then the
1523 * counter is messed up somewhere. Since this
1524 * function is called from invalidate page, it's
1525 * harmless to return without any action.
cd213226 1526 */
8de5c325 1527 ext4_warning(inode->i_sb, "ext4_da_release_space: "
0637c6f4 1528 "ino %lu, to_free %d with only %d reserved "
1084f252 1529 "data blocks", inode->i_ino, to_free,
0637c6f4
TT
1530 ei->i_reserved_data_blocks);
1531 WARN_ON(1);
1532 to_free = ei->i_reserved_data_blocks;
cd213226 1533 }
0637c6f4 1534 ei->i_reserved_data_blocks -= to_free;
cd213226 1535
72b8ab9d 1536 /* update fs dirty data blocks counter */
57042651 1537 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
d2a17637 1538
d2a17637 1539 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
60e58e0f 1540
7b415bf6 1541 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
d2a17637
MC
1542}
1543
1544static void ext4_da_page_release_reservation(struct page *page,
ca99fdd2
LC
1545 unsigned int offset,
1546 unsigned int length)
d2a17637 1547{
9705acd6 1548 int to_release = 0, contiguous_blks = 0;
d2a17637
MC
1549 struct buffer_head *head, *bh;
1550 unsigned int curr_off = 0;
7b415bf6
AK
1551 struct inode *inode = page->mapping->host;
1552 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ca99fdd2 1553 unsigned int stop = offset + length;
7b415bf6 1554 int num_clusters;
51865fda 1555 ext4_fsblk_t lblk;
d2a17637 1556
09cbfeaf 1557 BUG_ON(stop > PAGE_SIZE || stop < length);
ca99fdd2 1558
d2a17637
MC
1559 head = page_buffers(page);
1560 bh = head;
1561 do {
1562 unsigned int next_off = curr_off + bh->b_size;
1563
ca99fdd2
LC
1564 if (next_off > stop)
1565 break;
1566
d2a17637
MC
1567 if ((offset <= curr_off) && (buffer_delay(bh))) {
1568 to_release++;
9705acd6 1569 contiguous_blks++;
d2a17637 1570 clear_buffer_delay(bh);
9705acd6
LC
1571 } else if (contiguous_blks) {
1572 lblk = page->index <<
09cbfeaf 1573 (PAGE_SHIFT - inode->i_blkbits);
9705acd6
LC
1574 lblk += (curr_off >> inode->i_blkbits) -
1575 contiguous_blks;
1576 ext4_es_remove_extent(inode, lblk, contiguous_blks);
1577 contiguous_blks = 0;
d2a17637
MC
1578 }
1579 curr_off = next_off;
1580 } while ((bh = bh->b_this_page) != head);
7b415bf6 1581
9705acd6 1582 if (contiguous_blks) {
09cbfeaf 1583 lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
9705acd6
LC
1584 lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
1585 ext4_es_remove_extent(inode, lblk, contiguous_blks);
51865fda
ZL
1586 }
1587
7b415bf6
AK
1588 /* If we have released all the blocks belonging to a cluster, then we
1589 * need to release the reserved space for that cluster. */
1590 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1591 while (num_clusters > 0) {
09cbfeaf 1592 lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
7b415bf6
AK
1593 ((num_clusters - 1) << sbi->s_cluster_bits);
1594 if (sbi->s_cluster_ratio == 1 ||
7d1b1fbc 1595 !ext4_find_delalloc_cluster(inode, lblk))
7b415bf6
AK
1596 ext4_da_release_space(inode, 1);
1597
1598 num_clusters--;
1599 }
d2a17637 1600}
ac27a0ec 1601
64769240
AT
1602/*
1603 * Delayed allocation stuff
1604 */
1605
4e7ea81d
JK
1606struct mpage_da_data {
1607 struct inode *inode;
1608 struct writeback_control *wbc;
6b523df4 1609
4e7ea81d
JK
1610 pgoff_t first_page; /* The first page to write */
1611 pgoff_t next_page; /* Current page to examine */
1612 pgoff_t last_page; /* Last page to examine */
791b7f08 1613 /*
4e7ea81d
JK
1614 * Extent to map - this can be after first_page because that can be
1615 * fully mapped. We somewhat abuse m_flags to store whether the extent
1616 * is delalloc or unwritten.
791b7f08 1617 */
4e7ea81d
JK
1618 struct ext4_map_blocks map;
1619 struct ext4_io_submit io_submit; /* IO submission data */
1620};
64769240 1621
4e7ea81d
JK
1622static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1623 bool invalidate)
c4a0c46e
AK
1624{
1625 int nr_pages, i;
1626 pgoff_t index, end;
1627 struct pagevec pvec;
1628 struct inode *inode = mpd->inode;
1629 struct address_space *mapping = inode->i_mapping;
4e7ea81d
JK
1630
1631 /* This is necessary when next_page == 0. */
1632 if (mpd->first_page >= mpd->next_page)
1633 return;
c4a0c46e 1634
c7f5938a
CW
1635 index = mpd->first_page;
1636 end = mpd->next_page - 1;
4e7ea81d
JK
1637 if (invalidate) {
1638 ext4_lblk_t start, last;
09cbfeaf
KS
1639 start = index << (PAGE_SHIFT - inode->i_blkbits);
1640 last = end << (PAGE_SHIFT - inode->i_blkbits);
4e7ea81d
JK
1641 ext4_es_remove_extent(inode, start, last - start + 1);
1642 }
51865fda 1643
66bea92c 1644 pagevec_init(&pvec, 0);
c4a0c46e
AK
1645 while (index <= end) {
1646 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1647 if (nr_pages == 0)
1648 break;
1649 for (i = 0; i < nr_pages; i++) {
1650 struct page *page = pvec.pages[i];
9b1d0998 1651 if (page->index > end)
c4a0c46e 1652 break;
c4a0c46e
AK
1653 BUG_ON(!PageLocked(page));
1654 BUG_ON(PageWriteback(page));
4e7ea81d 1655 if (invalidate) {
4e800c03 1656 if (page_mapped(page))
1657 clear_page_dirty_for_io(page);
09cbfeaf 1658 block_invalidatepage(page, 0, PAGE_SIZE);
4e7ea81d
JK
1659 ClearPageUptodate(page);
1660 }
c4a0c46e
AK
1661 unlock_page(page);
1662 }
9b1d0998
JK
1663 index = pvec.pages[nr_pages - 1]->index + 1;
1664 pagevec_release(&pvec);
c4a0c46e 1665 }
c4a0c46e
AK
1666}
1667
df22291f
AK
1668static void ext4_print_free_blocks(struct inode *inode)
1669{
1670 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
92b97816 1671 struct super_block *sb = inode->i_sb;
f78ee70d 1672 struct ext4_inode_info *ei = EXT4_I(inode);
92b97816
TT
1673
1674 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
5dee5437 1675 EXT4_C2B(EXT4_SB(inode->i_sb),
f78ee70d 1676 ext4_count_free_clusters(sb)));
92b97816
TT
1677 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1678 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
f78ee70d 1679 (long long) EXT4_C2B(EXT4_SB(sb),
57042651 1680 percpu_counter_sum(&sbi->s_freeclusters_counter)));
92b97816 1681 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
f78ee70d 1682 (long long) EXT4_C2B(EXT4_SB(sb),
7b415bf6 1683 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
92b97816
TT
1684 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1685 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
f78ee70d 1686 ei->i_reserved_data_blocks);
df22291f
AK
1687 return;
1688}
1689
c364b22c 1690static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
29fa89d0 1691{
c364b22c 1692 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
29fa89d0
AK
1693}
1694
5356f261
AK
1695/*
1696 * This function is grabs code from the very beginning of
1697 * ext4_map_blocks, but assumes that the caller is from delayed write
1698 * time. This function looks up the requested blocks and sets the
1699 * buffer delay bit under the protection of i_data_sem.
1700 */
1701static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1702 struct ext4_map_blocks *map,
1703 struct buffer_head *bh)
1704{
d100eef2 1705 struct extent_status es;
5356f261
AK
1706 int retval;
1707 sector_t invalid_block = ~((sector_t) 0xffff);
921f266b
DM
1708#ifdef ES_AGGRESSIVE_TEST
1709 struct ext4_map_blocks orig_map;
1710
1711 memcpy(&orig_map, map, sizeof(*map));
1712#endif
5356f261
AK
1713
1714 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1715 invalid_block = ~0;
1716
1717 map->m_flags = 0;
1718 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1719 "logical block %lu\n", inode->i_ino, map->m_len,
1720 (unsigned long) map->m_lblk);
d100eef2
ZL
1721
1722 /* Lookup extent status tree firstly */
1723 if (ext4_es_lookup_extent(inode, iblock, &es)) {
d100eef2
ZL
1724 if (ext4_es_is_hole(&es)) {
1725 retval = 0;
c8b459f4 1726 down_read(&EXT4_I(inode)->i_data_sem);
d100eef2
ZL
1727 goto add_delayed;
1728 }
1729
1730 /*
1731 * Delayed extent could be allocated by fallocate.
1732 * So we need to check it.
1733 */
1734 if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
1735 map_bh(bh, inode->i_sb, invalid_block);
1736 set_buffer_new(bh);
1737 set_buffer_delay(bh);
1738 return 0;
1739 }
1740
1741 map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
1742 retval = es.es_len - (iblock - es.es_lblk);
1743 if (retval > map->m_len)
1744 retval = map->m_len;
1745 map->m_len = retval;
1746 if (ext4_es_is_written(&es))
1747 map->m_flags |= EXT4_MAP_MAPPED;
1748 else if (ext4_es_is_unwritten(&es))
1749 map->m_flags |= EXT4_MAP_UNWRITTEN;
1750 else
1751 BUG_ON(1);
1752
921f266b
DM
1753#ifdef ES_AGGRESSIVE_TEST
1754 ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
1755#endif
d100eef2
ZL
1756 return retval;
1757 }
1758
5356f261
AK
1759 /*
1760 * Try to see if we can get the block without requesting a new
1761 * file system block.
1762 */
c8b459f4 1763 down_read(&EXT4_I(inode)->i_data_sem);
cbd7584e 1764 if (ext4_has_inline_data(inode))
9c3569b5 1765 retval = 0;
cbd7584e 1766 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2f8e0a7c 1767 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
5356f261 1768 else
2f8e0a7c 1769 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
5356f261 1770
d100eef2 1771add_delayed:
5356f261 1772 if (retval == 0) {
f7fec032 1773 int ret;
5356f261
AK
1774 /*
1775 * XXX: __block_prepare_write() unmaps passed block,
1776 * is it OK?
1777 */
386ad67c
LC
1778 /*
1779 * If the block was allocated from previously allocated cluster,
1780 * then we don't need to reserve it again. However we still need
1781 * to reserve metadata for every block we're going to write.
1782 */
c27e43a1 1783 if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
cbd7584e 1784 !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
c27e43a1 1785 ret = ext4_da_reserve_space(inode);
f7fec032 1786 if (ret) {
5356f261 1787 /* not enough space to reserve */
f7fec032 1788 retval = ret;
5356f261 1789 goto out_unlock;
f7fec032 1790 }
5356f261
AK
1791 }
1792
f7fec032
ZL
1793 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
1794 ~0, EXTENT_STATUS_DELAYED);
1795 if (ret) {
1796 retval = ret;
51865fda 1797 goto out_unlock;
f7fec032 1798 }
51865fda 1799
5356f261
AK
1800 map_bh(bh, inode->i_sb, invalid_block);
1801 set_buffer_new(bh);
1802 set_buffer_delay(bh);
f7fec032
ZL
1803 } else if (retval > 0) {
1804 int ret;
3be78c73 1805 unsigned int status;
f7fec032 1806
44fb851d
ZL
1807 if (unlikely(retval != map->m_len)) {
1808 ext4_warning(inode->i_sb,
1809 "ES len assertion failed for inode "
1810 "%lu: retval %d != map->m_len %d",
1811 inode->i_ino, retval, map->m_len);
1812 WARN_ON(1);
921f266b 1813 }
921f266b 1814
f7fec032
ZL
1815 status = map->m_flags & EXT4_MAP_UNWRITTEN ?
1816 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
1817 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
1818 map->m_pblk, status);
1819 if (ret != 0)
1820 retval = ret;
5356f261
AK
1821 }
1822
1823out_unlock:
1824 up_read((&EXT4_I(inode)->i_data_sem));
1825
1826 return retval;
1827}
1828
64769240 1829/*
d91bd2c1 1830 * This is a special get_block_t callback which is used by
b920c755
TT
1831 * ext4_da_write_begin(). It will either return mapped block or
1832 * reserve space for a single block.
29fa89d0
AK
1833 *
1834 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1835 * We also have b_blocknr = -1 and b_bdev initialized properly
1836 *
1837 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1838 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1839 * initialized properly.
64769240 1840 */
9c3569b5
TM
1841int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1842 struct buffer_head *bh, int create)
64769240 1843{
2ed88685 1844 struct ext4_map_blocks map;
64769240
AT
1845 int ret = 0;
1846
1847 BUG_ON(create == 0);
2ed88685
TT
1848 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1849
1850 map.m_lblk = iblock;
1851 map.m_len = 1;
64769240
AT
1852
1853 /*
1854 * first, we need to know whether the block is allocated already
1855 * preallocated blocks are unmapped but should treated
1856 * the same as allocated blocks.
1857 */
5356f261
AK
1858 ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1859 if (ret <= 0)
2ed88685 1860 return ret;
64769240 1861
2ed88685 1862 map_bh(bh, inode->i_sb, map.m_pblk);
ed8ad838 1863 ext4_update_bh_state(bh, map.m_flags);
2ed88685
TT
1864
1865 if (buffer_unwritten(bh)) {
1866 /* A delayed write to unwritten bh should be marked
1867 * new and mapped. Mapped ensures that we don't do
1868 * get_block multiple times when we write to the same
1869 * offset and new ensures that we do proper zero out
1870 * for partial write.
1871 */
1872 set_buffer_new(bh);
c8205636 1873 set_buffer_mapped(bh);
2ed88685
TT
1874 }
1875 return 0;
64769240 1876}
61628a3f 1877
62e086be
AK
1878static int bget_one(handle_t *handle, struct buffer_head *bh)
1879{
1880 get_bh(bh);
1881 return 0;
1882}
1883
1884static int bput_one(handle_t *handle, struct buffer_head *bh)
1885{
1886 put_bh(bh);
1887 return 0;
1888}
1889
1890static int __ext4_journalled_writepage(struct page *page,
62e086be
AK
1891 unsigned int len)
1892{
1893 struct address_space *mapping = page->mapping;
1894 struct inode *inode = mapping->host;
3fdcfb66 1895 struct buffer_head *page_bufs = NULL;
62e086be 1896 handle_t *handle = NULL;
3fdcfb66
TM
1897 int ret = 0, err = 0;
1898 int inline_data = ext4_has_inline_data(inode);
1899 struct buffer_head *inode_bh = NULL;
62e086be 1900
cb20d518 1901 ClearPageChecked(page);
3fdcfb66
TM
1902
1903 if (inline_data) {
1904 BUG_ON(page->index != 0);
1905 BUG_ON(len > ext4_get_max_inline_size(inode));
1906 inode_bh = ext4_journalled_write_inline_data(inode, len, page);
1907 if (inode_bh == NULL)
1908 goto out;
1909 } else {
1910 page_bufs = page_buffers(page);
1911 if (!page_bufs) {
1912 BUG();
1913 goto out;
1914 }
1915 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1916 NULL, bget_one);
1917 }
bdf96838
TT
1918 /*
1919 * We need to release the page lock before we start the
1920 * journal, so grab a reference so the page won't disappear
1921 * out from under us.
1922 */
1923 get_page(page);
62e086be
AK
1924 unlock_page(page);
1925
9924a92a
TT
1926 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
1927 ext4_writepage_trans_blocks(inode));
62e086be
AK
1928 if (IS_ERR(handle)) {
1929 ret = PTR_ERR(handle);
bdf96838
TT
1930 put_page(page);
1931 goto out_no_pagelock;
62e086be 1932 }
441c8508
CW
1933 BUG_ON(!ext4_handle_valid(handle));
1934
bdf96838
TT
1935 lock_page(page);
1936 put_page(page);
1937 if (page->mapping != mapping) {
1938 /* The page got truncated from under us */
1939 ext4_journal_stop(handle);
1940 ret = 0;
1941 goto out;
1942 }
1943
3fdcfb66 1944 if (inline_data) {
5d601255 1945 BUFFER_TRACE(inode_bh, "get write access");
3fdcfb66 1946 ret = ext4_journal_get_write_access(handle, inode_bh);
62e086be 1947
3fdcfb66
TM
1948 err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
1949
1950 } else {
1951 ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1952 do_journal_get_write_access);
1953
1954 err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1955 write_end_fn);
1956 }
62e086be
AK
1957 if (ret == 0)
1958 ret = err;
2d859db3 1959 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
62e086be
AK
1960 err = ext4_journal_stop(handle);
1961 if (!ret)
1962 ret = err;
1963
3fdcfb66 1964 if (!ext4_has_inline_data(inode))
8c9367fd 1965 ext4_walk_page_buffers(NULL, page_bufs, 0, len,
3fdcfb66 1966 NULL, bput_one);
19f5fb7a 1967 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
62e086be 1968out:
bdf96838
TT
1969 unlock_page(page);
1970out_no_pagelock:
3fdcfb66 1971 brelse(inode_bh);
62e086be
AK
1972 return ret;
1973}
1974
61628a3f 1975/*
43ce1d23
AK
1976 * Note that we don't need to start a transaction unless we're journaling data
1977 * because we should have holes filled from ext4_page_mkwrite(). We even don't
1978 * need to file the inode to the transaction's list in ordered mode because if
1979 * we are writing back data added by write(), the inode is already there and if
25985edc 1980 * we are writing back data modified via mmap(), no one guarantees in which
43ce1d23
AK
1981 * transaction the data will hit the disk. In case we are journaling data, we
1982 * cannot start transaction directly because transaction start ranks above page
1983 * lock so we have to do some magic.
1984 *
b920c755 1985 * This function can get called via...
20970ba6 1986 * - ext4_writepages after taking page lock (have journal handle)
b920c755 1987 * - journal_submit_inode_data_buffers (no journal handle)
f6463b0d 1988 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
b920c755 1989 * - grab_page_cache when doing write_begin (have journal handle)
43ce1d23
AK
1990 *
1991 * We don't do any block allocation in this function. If we have page with
1992 * multiple blocks we need to write those buffer_heads that are mapped. This
1993 * is important for mmaped based write. So if we do with blocksize 1K
1994 * truncate(f, 1024);
1995 * a = mmap(f, 0, 4096);
1996 * a[0] = 'a';
1997 * truncate(f, 4096);
1998 * we have in the page first buffer_head mapped via page_mkwrite call back
90802ed9 1999 * but other buffer_heads would be unmapped but dirty (dirty done via the
43ce1d23
AK
2000 * do_wp_page). So writepage should write the first block. If we modify
2001 * the mmap area beyond 1024 we will again get a page_fault and the
2002 * page_mkwrite callback will do the block allocation and mark the
2003 * buffer_heads mapped.
2004 *
2005 * We redirty the page if we have any buffer_heads that is either delay or
2006 * unwritten in the page.
2007 *
2008 * We can get recursively called as show below.
2009 *
2010 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2011 * ext4_writepage()
2012 *
2013 * But since we don't do any block allocation we should not deadlock.
2014 * Page also have the dirty flag cleared so we don't get recurive page_lock.
61628a3f 2015 */
43ce1d23 2016static int ext4_writepage(struct page *page,
62e086be 2017 struct writeback_control *wbc)
64769240 2018{
f8bec370 2019 int ret = 0;
61628a3f 2020 loff_t size;
498e5f24 2021 unsigned int len;
744692dc 2022 struct buffer_head *page_bufs = NULL;
61628a3f 2023 struct inode *inode = page->mapping->host;
36ade451 2024 struct ext4_io_submit io_submit;
1c8349a1 2025 bool keep_towrite = false;
61628a3f 2026
a9c667f8 2027 trace_ext4_writepage(page);
f0e6c985 2028 size = i_size_read(inode);
09cbfeaf
KS
2029 if (page->index == size >> PAGE_SHIFT)
2030 len = size & ~PAGE_MASK;
f0e6c985 2031 else
09cbfeaf 2032 len = PAGE_SIZE;
64769240 2033
a42afc5f 2034 page_bufs = page_buffers(page);
a42afc5f 2035 /*
fe386132
JK
2036 * We cannot do block allocation or other extent handling in this
2037 * function. If there are buffers needing that, we have to redirty
2038 * the page. But we may reach here when we do a journal commit via
2039 * journal_submit_inode_data_buffers() and in that case we must write
2040 * allocated buffers to achieve data=ordered mode guarantees.
cccd147a
TT
2041 *
2042 * Also, if there is only one buffer per page (the fs block
2043 * size == the page size), if one buffer needs block
2044 * allocation or needs to modify the extent tree to clear the
2045 * unwritten flag, we know that the page can't be written at
2046 * all, so we might as well refuse the write immediately.
2047 * Unfortunately if the block size != page size, we can't as
2048 * easily detect this case using ext4_walk_page_buffers(), but
2049 * for the extremely common case, this is an optimization that
2050 * skips a useless round trip through ext4_bio_write_page().
a42afc5f 2051 */
f19d5870
TM
2052 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2053 ext4_bh_delay_or_unwritten)) {
f8bec370 2054 redirty_page_for_writepage(wbc, page);
cccd147a 2055 if ((current->flags & PF_MEMALLOC) ||
09cbfeaf 2056 (inode->i_sb->s_blocksize == PAGE_SIZE)) {
fe386132
JK
2057 /*
2058 * For memory cleaning there's no point in writing only
2059 * some buffers. So just bail out. Warn if we came here
2060 * from direct reclaim.
2061 */
2062 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
2063 == PF_MEMALLOC);
f0e6c985
AK
2064 unlock_page(page);
2065 return 0;
2066 }
1c8349a1 2067 keep_towrite = true;
a42afc5f 2068 }
64769240 2069
cb20d518 2070 if (PageChecked(page) && ext4_should_journal_data(inode))
43ce1d23
AK
2071 /*
2072 * It's mmapped pagecache. Add buffers and journal it. There
2073 * doesn't seem much point in redirtying the page here.
2074 */
3f0ca309 2075 return __ext4_journalled_writepage(page, len);
43ce1d23 2076
97a851ed
JK
2077 ext4_io_submit_init(&io_submit, wbc);
2078 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
2079 if (!io_submit.io_end) {
2080 redirty_page_for_writepage(wbc, page);
2081 unlock_page(page);
2082 return -ENOMEM;
2083 }
1c8349a1 2084 ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
36ade451 2085 ext4_io_submit(&io_submit);
97a851ed
JK
2086 /* Drop io_end reference we got from init */
2087 ext4_put_io_end_defer(io_submit.io_end);
64769240
AT
2088 return ret;
2089}
2090
5f1132b2
JK
2091static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
2092{
2093 int len;
2094 loff_t size = i_size_read(mpd->inode);
2095 int err;
2096
2097 BUG_ON(page->index != mpd->first_page);
09cbfeaf
KS
2098 if (page->index == size >> PAGE_SHIFT)
2099 len = size & ~PAGE_MASK;
5f1132b2 2100 else
09cbfeaf 2101 len = PAGE_SIZE;
5f1132b2 2102 clear_page_dirty_for_io(page);
1c8349a1 2103 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
5f1132b2
JK
2104 if (!err)
2105 mpd->wbc->nr_to_write--;
2106 mpd->first_page++;
2107
2108 return err;
2109}
2110
4e7ea81d
JK
2111#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
2112
61628a3f 2113/*
fffb2739
JK
2114 * mballoc gives us at most this number of blocks...
2115 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
70261f56 2116 * The rest of mballoc seems to handle chunks up to full group size.
61628a3f 2117 */
fffb2739 2118#define MAX_WRITEPAGES_EXTENT_LEN 2048
525f4ed8 2119
4e7ea81d
JK
2120/*
2121 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
2122 *
2123 * @mpd - extent of blocks
2124 * @lblk - logical number of the block in the file
09930042 2125 * @bh - buffer head we want to add to the extent
4e7ea81d 2126 *
09930042
JK
2127 * The function is used to collect contig. blocks in the same state. If the
2128 * buffer doesn't require mapping for writeback and we haven't started the
2129 * extent of buffers to map yet, the function returns 'true' immediately - the
2130 * caller can write the buffer right away. Otherwise the function returns true
2131 * if the block has been added to the extent, false if the block couldn't be
2132 * added.
4e7ea81d 2133 */
09930042
JK
2134static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
2135 struct buffer_head *bh)
4e7ea81d
JK
2136{
2137 struct ext4_map_blocks *map = &mpd->map;
2138
09930042
JK
2139 /* Buffer that doesn't need mapping for writeback? */
2140 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
2141 (!buffer_delay(bh) && !buffer_unwritten(bh))) {
2142 /* So far no extent to map => we write the buffer right away */
2143 if (map->m_len == 0)
2144 return true;
2145 return false;
2146 }
4e7ea81d
JK
2147
2148 /* First block in the extent? */
2149 if (map->m_len == 0) {
2150 map->m_lblk = lblk;
2151 map->m_len = 1;
09930042
JK
2152 map->m_flags = bh->b_state & BH_FLAGS;
2153 return true;
4e7ea81d
JK
2154 }
2155
09930042
JK
2156 /* Don't go larger than mballoc is willing to allocate */
2157 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
2158 return false;
2159
4e7ea81d
JK
2160 /* Can we merge the block to our big extent? */
2161 if (lblk == map->m_lblk + map->m_len &&
09930042 2162 (bh->b_state & BH_FLAGS) == map->m_flags) {
4e7ea81d 2163 map->m_len++;
09930042 2164 return true;
4e7ea81d 2165 }
09930042 2166 return false;
4e7ea81d
JK
2167}
2168
5f1132b2
JK
2169/*
2170 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
2171 *
2172 * @mpd - extent of blocks for mapping
2173 * @head - the first buffer in the page
2174 * @bh - buffer we should start processing from
2175 * @lblk - logical number of the block in the file corresponding to @bh
2176 *
2177 * Walk through page buffers from @bh upto @head (exclusive) and either submit
2178 * the page for IO if all buffers in this page were mapped and there's no
2179 * accumulated extent of buffers to map or add buffers in the page to the
2180 * extent of buffers to map. The function returns 1 if the caller can continue
2181 * by processing the next page, 0 if it should stop adding buffers to the
2182 * extent to map because we cannot extend it anymore. It can also return value
2183 * < 0 in case of error during IO submission.
2184 */
2185static int mpage_process_page_bufs(struct mpage_da_data *mpd,
2186 struct buffer_head *head,
2187 struct buffer_head *bh,
2188 ext4_lblk_t lblk)
4e7ea81d
JK
2189{
2190 struct inode *inode = mpd->inode;
5f1132b2 2191 int err;
4e7ea81d
JK
2192 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2193 >> inode->i_blkbits;
2194
2195 do {
2196 BUG_ON(buffer_locked(bh));
2197
09930042 2198 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
4e7ea81d
JK
2199 /* Found extent to map? */
2200 if (mpd->map.m_len)
5f1132b2 2201 return 0;
09930042 2202 /* Everything mapped so far and we hit EOF */
5f1132b2 2203 break;
4e7ea81d 2204 }
4e7ea81d 2205 } while (lblk++, (bh = bh->b_this_page) != head);
5f1132b2
JK
2206 /* So far everything mapped? Submit the page for IO. */
2207 if (mpd->map.m_len == 0) {
2208 err = mpage_submit_page(mpd, head->b_page);
2209 if (err < 0)
2210 return err;
2211 }
2212 return lblk < blocks;
4e7ea81d
JK
2213}
2214
2215/*
2216 * mpage_map_buffers - update buffers corresponding to changed extent and
2217 * submit fully mapped pages for IO
2218 *
2219 * @mpd - description of extent to map, on return next extent to map
2220 *
2221 * Scan buffers corresponding to changed extent (we expect corresponding pages
2222 * to be already locked) and update buffer state according to new extent state.
2223 * We map delalloc buffers to their physical location, clear unwritten bits,
556615dc 2224 * and mark buffers as uninit when we perform writes to unwritten extents
4e7ea81d
JK
2225 * and do extent conversion after IO is finished. If the last page is not fully
2226 * mapped, we update @map to the next extent in the last page that needs
2227 * mapping. Otherwise we submit the page for IO.
2228 */
2229static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2230{
2231 struct pagevec pvec;
2232 int nr_pages, i;
2233 struct inode *inode = mpd->inode;
2234 struct buffer_head *head, *bh;
09cbfeaf 2235 int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
4e7ea81d
JK
2236 pgoff_t start, end;
2237 ext4_lblk_t lblk;
2238 sector_t pblock;
2239 int err;
2240
2241 start = mpd->map.m_lblk >> bpp_bits;
2242 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2243 lblk = start << bpp_bits;
2244 pblock = mpd->map.m_pblk;
2245
2246 pagevec_init(&pvec, 0);
2247 while (start <= end) {
2248 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2249 PAGEVEC_SIZE);
2250 if (nr_pages == 0)
2251 break;
2252 for (i = 0; i < nr_pages; i++) {
2253 struct page *page = pvec.pages[i];
2254
2255 if (page->index > end)
2256 break;
70261f56 2257 /* Up to 'end' pages must be contiguous */
4e7ea81d
JK
2258 BUG_ON(page->index != start);
2259 bh = head = page_buffers(page);
2260 do {
2261 if (lblk < mpd->map.m_lblk)
2262 continue;
2263 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2264 /*
2265 * Buffer after end of mapped extent.
2266 * Find next buffer in the page to map.
2267 */
2268 mpd->map.m_len = 0;
2269 mpd->map.m_flags = 0;
5f1132b2
JK
2270 /*
2271 * FIXME: If dioread_nolock supports
2272 * blocksize < pagesize, we need to make
2273 * sure we add size mapped so far to
2274 * io_end->size as the following call
2275 * can submit the page for IO.
2276 */
2277 err = mpage_process_page_bufs(mpd, head,
2278 bh, lblk);
4e7ea81d 2279 pagevec_release(&pvec);
5f1132b2
JK
2280 if (err > 0)
2281 err = 0;
2282 return err;
4e7ea81d
JK
2283 }
2284 if (buffer_delay(bh)) {
2285 clear_buffer_delay(bh);
2286 bh->b_blocknr = pblock++;
2287 }
4e7ea81d 2288 clear_buffer_unwritten(bh);
5f1132b2 2289 } while (lblk++, (bh = bh->b_this_page) != head);
4e7ea81d
JK
2290
2291 /*
2292 * FIXME: This is going to break if dioread_nolock
2293 * supports blocksize < pagesize as we will try to
2294 * convert potentially unmapped parts of inode.
2295 */
09cbfeaf 2296 mpd->io_submit.io_end->size += PAGE_SIZE;
4e7ea81d
JK
2297 /* Page fully mapped - let IO run! */
2298 err = mpage_submit_page(mpd, page);
2299 if (err < 0) {
2300 pagevec_release(&pvec);
2301 return err;
2302 }
2303 start++;
2304 }
2305 pagevec_release(&pvec);
2306 }
2307 /* Extent fully mapped and matches with page boundary. We are done. */
2308 mpd->map.m_len = 0;
2309 mpd->map.m_flags = 0;
2310 return 0;
2311}
2312
2313static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2314{
2315 struct inode *inode = mpd->inode;
2316 struct ext4_map_blocks *map = &mpd->map;
2317 int get_blocks_flags;
090f32ee 2318 int err, dioread_nolock;
4e7ea81d
JK
2319
2320 trace_ext4_da_write_pages_extent(inode, map);
2321 /*
2322 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
556615dc 2323 * to convert an unwritten extent to be initialized (in the case
4e7ea81d
JK
2324 * where we have written into one or more preallocated blocks). It is
2325 * possible that we're going to need more metadata blocks than
2326 * previously reserved. However we must not fail because we're in
2327 * writeback and there is nothing we can do about it so it might result
2328 * in data loss. So use reserved blocks to allocate metadata if
2329 * possible.
2330 *
754cfed6
TT
2331 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
2332 * the blocks in question are delalloc blocks. This indicates
2333 * that the blocks and quotas has already been checked when
2334 * the data was copied into the page cache.
4e7ea81d
JK
2335 */
2336 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
ee0876bc
JK
2337 EXT4_GET_BLOCKS_METADATA_NOFAIL |
2338 EXT4_GET_BLOCKS_IO_SUBMIT;
090f32ee
LC
2339 dioread_nolock = ext4_should_dioread_nolock(inode);
2340 if (dioread_nolock)
4e7ea81d
JK
2341 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2342 if (map->m_flags & (1 << BH_Delay))
2343 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2344
2345 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2346 if (err < 0)
2347 return err;
090f32ee 2348 if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
6b523df4
JK
2349 if (!mpd->io_submit.io_end->handle &&
2350 ext4_handle_valid(handle)) {
2351 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2352 handle->h_rsv_handle = NULL;
2353 }
3613d228 2354 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
6b523df4 2355 }
4e7ea81d
JK
2356
2357 BUG_ON(map->m_len == 0);
2358 if (map->m_flags & EXT4_MAP_NEW) {
64e1c57f
JK
2359 clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
2360 map->m_len);
4e7ea81d
JK
2361 }
2362 return 0;
2363}
2364
2365/*
2366 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2367 * mpd->len and submit pages underlying it for IO
2368 *
2369 * @handle - handle for journal operations
2370 * @mpd - extent to map
7534e854
JK
2371 * @give_up_on_write - we set this to true iff there is a fatal error and there
2372 * is no hope of writing the data. The caller should discard
2373 * dirty pages to avoid infinite loops.
4e7ea81d
JK
2374 *
2375 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2376 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2377 * them to initialized or split the described range from larger unwritten
2378 * extent. Note that we need not map all the described range since allocation
2379 * can return less blocks or the range is covered by more unwritten extents. We
2380 * cannot map more because we are limited by reserved transaction credits. On
2381 * the other hand we always make sure that the last touched page is fully
2382 * mapped so that it can be written out (and thus forward progress is
2383 * guaranteed). After mapping we submit all mapped pages for IO.
2384 */
2385static int mpage_map_and_submit_extent(handle_t *handle,
cb530541
TT
2386 struct mpage_da_data *mpd,
2387 bool *give_up_on_write)
4e7ea81d
JK
2388{
2389 struct inode *inode = mpd->inode;
2390 struct ext4_map_blocks *map = &mpd->map;
2391 int err;
2392 loff_t disksize;
6603120e 2393 int progress = 0;
4e7ea81d
JK
2394
2395 mpd->io_submit.io_end->offset =
2396 ((loff_t)map->m_lblk) << inode->i_blkbits;
27d7c4ed 2397 do {
4e7ea81d
JK
2398 err = mpage_map_one_extent(handle, mpd);
2399 if (err < 0) {
2400 struct super_block *sb = inode->i_sb;
2401
cb530541
TT
2402 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2403 goto invalidate_dirty_pages;
4e7ea81d 2404 /*
cb530541
TT
2405 * Let the uper layers retry transient errors.
2406 * In the case of ENOSPC, if ext4_count_free_blocks()
2407 * is non-zero, a commit should free up blocks.
4e7ea81d 2408 */
cb530541 2409 if ((err == -ENOMEM) ||
6603120e
DM
2410 (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2411 if (progress)
2412 goto update_disksize;
cb530541 2413 return err;
6603120e 2414 }
cb530541
TT
2415 ext4_msg(sb, KERN_CRIT,
2416 "Delayed block allocation failed for "
2417 "inode %lu at logical offset %llu with"
2418 " max blocks %u with error %d",
2419 inode->i_ino,
2420 (unsigned long long)map->m_lblk,
2421 (unsigned)map->m_len, -err);
2422 ext4_msg(sb, KERN_CRIT,
2423 "This should not happen!! Data will "
2424 "be lost\n");
2425 if (err == -ENOSPC)
2426 ext4_print_free_blocks(inode);
2427 invalidate_dirty_pages:
2428 *give_up_on_write = true;
4e7ea81d
JK
2429 return err;
2430 }
6603120e 2431 progress = 1;
4e7ea81d
JK
2432 /*
2433 * Update buffer state, submit mapped pages, and get us new
2434 * extent to map
2435 */
2436 err = mpage_map_and_submit_buffers(mpd);
2437 if (err < 0)
6603120e 2438 goto update_disksize;
27d7c4ed 2439 } while (map->m_len);
4e7ea81d 2440
6603120e 2441update_disksize:
622cad13
TT
2442 /*
2443 * Update on-disk size after IO is submitted. Races with
2444 * truncate are avoided by checking i_size under i_data_sem.
2445 */
09cbfeaf 2446 disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
4e7ea81d
JK
2447 if (disksize > EXT4_I(inode)->i_disksize) {
2448 int err2;
622cad13
TT
2449 loff_t i_size;
2450
2451 down_write(&EXT4_I(inode)->i_data_sem);
2452 i_size = i_size_read(inode);
2453 if (disksize > i_size)
2454 disksize = i_size;
2455 if (disksize > EXT4_I(inode)->i_disksize)
2456 EXT4_I(inode)->i_disksize = disksize;
4e7ea81d 2457 err2 = ext4_mark_inode_dirty(handle, inode);
622cad13 2458 up_write(&EXT4_I(inode)->i_data_sem);
4e7ea81d
JK
2459 if (err2)
2460 ext4_error(inode->i_sb,
2461 "Failed to mark inode %lu dirty",
2462 inode->i_ino);
2463 if (!err)
2464 err = err2;
2465 }
2466 return err;
2467}
2468
fffb2739
JK
2469/*
2470 * Calculate the total number of credits to reserve for one writepages
20970ba6 2471 * iteration. This is called from ext4_writepages(). We map an extent of
70261f56 2472 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
fffb2739
JK
2473 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2474 * bpp - 1 blocks in bpp different extents.
2475 */
525f4ed8
MC
2476static int ext4_da_writepages_trans_blocks(struct inode *inode)
2477{
fffb2739 2478 int bpp = ext4_journal_blocks_per_page(inode);
525f4ed8 2479
fffb2739
JK
2480 return ext4_meta_trans_blocks(inode,
2481 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
525f4ed8 2482}
61628a3f 2483
8e48dcfb 2484/*
4e7ea81d
JK
2485 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2486 * and underlying extent to map
2487 *
2488 * @mpd - where to look for pages
2489 *
2490 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2491 * IO immediately. When we find a page which isn't mapped we start accumulating
2492 * extent of buffers underlying these pages that needs mapping (formed by
2493 * either delayed or unwritten buffers). We also lock the pages containing
2494 * these buffers. The extent found is returned in @mpd structure (starting at
2495 * mpd->lblk with length mpd->len blocks).
2496 *
2497 * Note that this function can attach bios to one io_end structure which are
2498 * neither logically nor physically contiguous. Although it may seem as an
2499 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2500 * case as we need to track IO to all buffers underlying a page in one io_end.
8e48dcfb 2501 */
4e7ea81d 2502static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
8e48dcfb 2503{
4e7ea81d
JK
2504 struct address_space *mapping = mpd->inode->i_mapping;
2505 struct pagevec pvec;
2506 unsigned int nr_pages;
aeac589a 2507 long left = mpd->wbc->nr_to_write;
4e7ea81d
JK
2508 pgoff_t index = mpd->first_page;
2509 pgoff_t end = mpd->last_page;
2510 int tag;
2511 int i, err = 0;
2512 int blkbits = mpd->inode->i_blkbits;
2513 ext4_lblk_t lblk;
2514 struct buffer_head *head;
8e48dcfb 2515
4e7ea81d 2516 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
5b41d924
ES
2517 tag = PAGECACHE_TAG_TOWRITE;
2518 else
2519 tag = PAGECACHE_TAG_DIRTY;
2520
4e7ea81d
JK
2521 pagevec_init(&pvec, 0);
2522 mpd->map.m_len = 0;
2523 mpd->next_page = index;
4f01b02c 2524 while (index <= end) {
5b41d924 2525 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
8e48dcfb
TT
2526 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2527 if (nr_pages == 0)
4e7ea81d 2528 goto out;
8e48dcfb
TT
2529
2530 for (i = 0; i < nr_pages; i++) {
2531 struct page *page = pvec.pages[i];
2532
2533 /*
2534 * At this point, the page may be truncated or
2535 * invalidated (changing page->mapping to NULL), or
2536 * even swizzled back from swapper_space to tmpfs file
2537 * mapping. However, page->index will not change
2538 * because we have a reference on the page.
2539 */
4f01b02c
TT
2540 if (page->index > end)
2541 goto out;
8e48dcfb 2542
aeac589a
ML
2543 /*
2544 * Accumulated enough dirty pages? This doesn't apply
2545 * to WB_SYNC_ALL mode. For integrity sync we have to
2546 * keep going because someone may be concurrently
2547 * dirtying pages, and we might have synced a lot of
2548 * newly appeared dirty pages, but have not synced all
2549 * of the old dirty pages.
2550 */
2551 if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
2552 goto out;
2553
4e7ea81d
JK
2554 /* If we can't merge this page, we are done. */
2555 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2556 goto out;
78aaced3 2557
8e48dcfb 2558 lock_page(page);
8e48dcfb 2559 /*
4e7ea81d
JK
2560 * If the page is no longer dirty, or its mapping no
2561 * longer corresponds to inode we are writing (which
2562 * means it has been truncated or invalidated), or the
2563 * page is already under writeback and we are not doing
2564 * a data integrity writeback, skip the page
8e48dcfb 2565 */
4f01b02c
TT
2566 if (!PageDirty(page) ||
2567 (PageWriteback(page) &&
4e7ea81d 2568 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
4f01b02c 2569 unlikely(page->mapping != mapping)) {
8e48dcfb
TT
2570 unlock_page(page);
2571 continue;
2572 }
2573
7cb1a535 2574 wait_on_page_writeback(page);
8e48dcfb 2575 BUG_ON(PageWriteback(page));
8e48dcfb 2576
4e7ea81d 2577 if (mpd->map.m_len == 0)
8eb9e5ce 2578 mpd->first_page = page->index;
8eb9e5ce 2579 mpd->next_page = page->index + 1;
f8bec370 2580 /* Add all dirty buffers to mpd */
4e7ea81d 2581 lblk = ((ext4_lblk_t)page->index) <<
09cbfeaf 2582 (PAGE_SHIFT - blkbits);
f8bec370 2583 head = page_buffers(page);
5f1132b2
JK
2584 err = mpage_process_page_bufs(mpd, head, head, lblk);
2585 if (err <= 0)
4e7ea81d 2586 goto out;
5f1132b2 2587 err = 0;
aeac589a 2588 left--;
8e48dcfb
TT
2589 }
2590 pagevec_release(&pvec);
2591 cond_resched();
2592 }
4f01b02c 2593 return 0;
8eb9e5ce
TT
2594out:
2595 pagevec_release(&pvec);
4e7ea81d 2596 return err;
8e48dcfb
TT
2597}
2598
20970ba6
TT
2599static int __writepage(struct page *page, struct writeback_control *wbc,
2600 void *data)
2601{
2602 struct address_space *mapping = data;
2603 int ret = ext4_writepage(page, wbc);
2604 mapping_set_error(mapping, ret);
2605 return ret;
2606}
2607
2608static int ext4_writepages(struct address_space *mapping,
2609 struct writeback_control *wbc)
64769240 2610{
4e7ea81d
JK
2611 pgoff_t writeback_index = 0;
2612 long nr_to_write = wbc->nr_to_write;
22208ded 2613 int range_whole = 0;
4e7ea81d 2614 int cycled = 1;
61628a3f 2615 handle_t *handle = NULL;
df22291f 2616 struct mpage_da_data mpd;
5e745b04 2617 struct inode *inode = mapping->host;
6b523df4 2618 int needed_blocks, rsv_blocks = 0, ret = 0;
5e745b04 2619 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
4e7ea81d 2620 bool done;
1bce63d1 2621 struct blk_plug plug;
cb530541 2622 bool give_up_on_write = false;
61628a3f 2623
c8585c6f 2624 percpu_down_read(&sbi->s_journal_flag_rwsem);
20970ba6 2625 trace_ext4_writepages(inode, wbc);
ba80b101 2626
c8585c6f
DJ
2627 if (dax_mapping(mapping)) {
2628 ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
2629 wbc);
2630 goto out_writepages;
2631 }
7f6d5b52 2632
61628a3f
MC
2633 /*
2634 * No pages to write? This is mainly a kludge to avoid starting
2635 * a transaction for special inodes like journal inode on last iput()
2636 * because that could violate lock ordering on umount
2637 */
a1d6cc56 2638 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
bbf023c7 2639 goto out_writepages;
2a21e37e 2640
20970ba6
TT
2641 if (ext4_should_journal_data(inode)) {
2642 struct blk_plug plug;
20970ba6
TT
2643
2644 blk_start_plug(&plug);
2645 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2646 blk_finish_plug(&plug);
bbf023c7 2647 goto out_writepages;
20970ba6
TT
2648 }
2649
2a21e37e
TT
2650 /*
2651 * If the filesystem has aborted, it is read-only, so return
2652 * right away instead of dumping stack traces later on that
2653 * will obscure the real source of the problem. We test
4ab2f15b 2654 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2a21e37e 2655 * the latter could be true if the filesystem is mounted
20970ba6 2656 * read-only, and in that case, ext4_writepages should
2a21e37e
TT
2657 * *never* be called, so if that ever happens, we would want
2658 * the stack trace.
2659 */
bbf023c7
ML
2660 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2661 ret = -EROFS;
2662 goto out_writepages;
2663 }
2a21e37e 2664
6b523df4
JK
2665 if (ext4_should_dioread_nolock(inode)) {
2666 /*
70261f56 2667 * We may need to convert up to one extent per block in
6b523df4
JK
2668 * the page and we may dirty the inode.
2669 */
09cbfeaf 2670 rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
6b523df4
JK
2671 }
2672
4e7ea81d
JK
2673 /*
2674 * If we have inline data and arrive here, it means that
2675 * we will soon create the block for the 1st page, so
2676 * we'd better clear the inline data here.
2677 */
2678 if (ext4_has_inline_data(inode)) {
2679 /* Just inode will be modified... */
2680 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2681 if (IS_ERR(handle)) {
2682 ret = PTR_ERR(handle);
2683 goto out_writepages;
2684 }
2685 BUG_ON(ext4_test_inode_state(inode,
2686 EXT4_STATE_MAY_INLINE_DATA));
2687 ext4_destroy_inline_data(handle, inode);
2688 ext4_journal_stop(handle);
2689 }
2690
22208ded
AK
2691 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2692 range_whole = 1;
61628a3f 2693
2acf2c26 2694 if (wbc->range_cyclic) {
4e7ea81d
JK
2695 writeback_index = mapping->writeback_index;
2696 if (writeback_index)
2acf2c26 2697 cycled = 0;
4e7ea81d
JK
2698 mpd.first_page = writeback_index;
2699 mpd.last_page = -1;
5b41d924 2700 } else {
09cbfeaf
KS
2701 mpd.first_page = wbc->range_start >> PAGE_SHIFT;
2702 mpd.last_page = wbc->range_end >> PAGE_SHIFT;
5b41d924 2703 }
a1d6cc56 2704
4e7ea81d
JK
2705 mpd.inode = inode;
2706 mpd.wbc = wbc;
2707 ext4_io_submit_init(&mpd.io_submit, wbc);
2acf2c26 2708retry:
6e6938b6 2709 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4e7ea81d
JK
2710 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2711 done = false;
1bce63d1 2712 blk_start_plug(&plug);
4e7ea81d
JK
2713 while (!done && mpd.first_page <= mpd.last_page) {
2714 /* For each extent of pages we use new io_end */
2715 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2716 if (!mpd.io_submit.io_end) {
2717 ret = -ENOMEM;
2718 break;
2719 }
a1d6cc56
AK
2720
2721 /*
4e7ea81d
JK
2722 * We have two constraints: We find one extent to map and we
2723 * must always write out whole page (makes a difference when
2724 * blocksize < pagesize) so that we don't block on IO when we
2725 * try to write out the rest of the page. Journalled mode is
2726 * not supported by delalloc.
a1d6cc56
AK
2727 */
2728 BUG_ON(ext4_should_journal_data(inode));
525f4ed8 2729 needed_blocks = ext4_da_writepages_trans_blocks(inode);
a1d6cc56 2730
4e7ea81d 2731 /* start a new transaction */
6b523df4
JK
2732 handle = ext4_journal_start_with_reserve(inode,
2733 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
61628a3f
MC
2734 if (IS_ERR(handle)) {
2735 ret = PTR_ERR(handle);
1693918e 2736 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
fbe845dd 2737 "%ld pages, ino %lu; err %d", __func__,
a1d6cc56 2738 wbc->nr_to_write, inode->i_ino, ret);
4e7ea81d
JK
2739 /* Release allocated io_end */
2740 ext4_put_io_end(mpd.io_submit.io_end);
2741 break;
61628a3f 2742 }
f63e6005 2743
4e7ea81d
JK
2744 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2745 ret = mpage_prepare_extent_to_map(&mpd);
2746 if (!ret) {
2747 if (mpd.map.m_len)
cb530541
TT
2748 ret = mpage_map_and_submit_extent(handle, &mpd,
2749 &give_up_on_write);
4e7ea81d
JK
2750 else {
2751 /*
2752 * We scanned the whole range (or exhausted
2753 * nr_to_write), submitted what was mapped and
2754 * didn't find anything needing mapping. We are
2755 * done.
2756 */
2757 done = true;
2758 }
f63e6005 2759 }
646caa9c
JK
2760 /*
2761 * Caution: If the handle is synchronous,
2762 * ext4_journal_stop() can wait for transaction commit
2763 * to finish which may depend on writeback of pages to
2764 * complete or on page lock to be released. In that
2765 * case, we have to wait until after after we have
2766 * submitted all the IO, released page locks we hold,
2767 * and dropped io_end reference (for extent conversion
2768 * to be able to complete) before stopping the handle.
2769 */
2770 if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
2771 ext4_journal_stop(handle);
2772 handle = NULL;
2773 }
4e7ea81d
JK
2774 /* Submit prepared bio */
2775 ext4_io_submit(&mpd.io_submit);
2776 /* Unlock pages we didn't use */
cb530541 2777 mpage_release_unused_pages(&mpd, give_up_on_write);
646caa9c
JK
2778 /*
2779 * Drop our io_end reference we got from init. We have
2780 * to be careful and use deferred io_end finishing if
2781 * we are still holding the transaction as we can
2782 * release the last reference to io_end which may end
2783 * up doing unwritten extent conversion.
2784 */
2785 if (handle) {
2786 ext4_put_io_end_defer(mpd.io_submit.io_end);
2787 ext4_journal_stop(handle);
2788 } else
2789 ext4_put_io_end(mpd.io_submit.io_end);
4e7ea81d
JK
2790
2791 if (ret == -ENOSPC && sbi->s_journal) {
2792 /*
2793 * Commit the transaction which would
22208ded
AK
2794 * free blocks released in the transaction
2795 * and try again
2796 */
df22291f 2797 jbd2_journal_force_commit_nested(sbi->s_journal);
22208ded 2798 ret = 0;
4e7ea81d
JK
2799 continue;
2800 }
2801 /* Fatal error - ENOMEM, EIO... */
2802 if (ret)
61628a3f 2803 break;
a1d6cc56 2804 }
1bce63d1 2805 blk_finish_plug(&plug);
9c12a831 2806 if (!ret && !cycled && wbc->nr_to_write > 0) {
2acf2c26 2807 cycled = 1;
4e7ea81d
JK
2808 mpd.last_page = writeback_index - 1;
2809 mpd.first_page = 0;
2acf2c26
AK
2810 goto retry;
2811 }
22208ded
AK
2812
2813 /* Update index */
22208ded
AK
2814 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2815 /*
4e7ea81d 2816 * Set the writeback_index so that range_cyclic
22208ded
AK
2817 * mode will write it back later
2818 */
4e7ea81d 2819 mapping->writeback_index = mpd.first_page;
a1d6cc56 2820
61628a3f 2821out_writepages:
20970ba6
TT
2822 trace_ext4_writepages_result(inode, wbc, ret,
2823 nr_to_write - wbc->nr_to_write);
c8585c6f 2824 percpu_up_read(&sbi->s_journal_flag_rwsem);
61628a3f 2825 return ret;
64769240
AT
2826}
2827
79f0be8d
AK
2828static int ext4_nonda_switch(struct super_block *sb)
2829{
5c1ff336 2830 s64 free_clusters, dirty_clusters;
79f0be8d
AK
2831 struct ext4_sb_info *sbi = EXT4_SB(sb);
2832
2833 /*
2834 * switch to non delalloc mode if we are running low
2835 * on free block. The free block accounting via percpu
179f7ebf 2836 * counters can get slightly wrong with percpu_counter_batch getting
79f0be8d
AK
2837 * accumulated on each CPU without updating global counters
2838 * Delalloc need an accurate free block accounting. So switch
2839 * to non delalloc when we are near to error range.
2840 */
5c1ff336
EW
2841 free_clusters =
2842 percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2843 dirty_clusters =
2844 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
00d4e736
TT
2845 /*
2846 * Start pushing delalloc when 1/2 of free blocks are dirty.
2847 */
5c1ff336 2848 if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
10ee27a0 2849 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
00d4e736 2850
5c1ff336
EW
2851 if (2 * free_clusters < 3 * dirty_clusters ||
2852 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
79f0be8d 2853 /*
c8afb446
ES
2854 * free block count is less than 150% of dirty blocks
2855 * or free blocks is less than watermark
79f0be8d
AK
2856 */
2857 return 1;
2858 }
2859 return 0;
2860}
2861
0ff8947f
ES
2862/* We always reserve for an inode update; the superblock could be there too */
2863static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
2864{
e2b911c5 2865 if (likely(ext4_has_feature_large_file(inode->i_sb)))
0ff8947f
ES
2866 return 1;
2867
2868 if (pos + len <= 0x7fffffffULL)
2869 return 1;
2870
2871 /* We might need to update the superblock to set LARGE_FILE */
2872 return 2;
2873}
2874
64769240 2875static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
de9a55b8
TT
2876 loff_t pos, unsigned len, unsigned flags,
2877 struct page **pagep, void **fsdata)
64769240 2878{
72b8ab9d 2879 int ret, retries = 0;
64769240
AT
2880 struct page *page;
2881 pgoff_t index;
64769240
AT
2882 struct inode *inode = mapping->host;
2883 handle_t *handle;
2884
09cbfeaf 2885 index = pos >> PAGE_SHIFT;
79f0be8d
AK
2886
2887 if (ext4_nonda_switch(inode->i_sb)) {
2888 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2889 return ext4_write_begin(file, mapping, pos,
2890 len, flags, pagep, fsdata);
2891 }
2892 *fsdata = (void *)0;
9bffad1e 2893 trace_ext4_da_write_begin(inode, pos, len, flags);
9c3569b5
TM
2894
2895 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2896 ret = ext4_da_write_inline_data_begin(mapping, inode,
2897 pos, len, flags,
2898 pagep, fsdata);
2899 if (ret < 0)
47564bfb
TT
2900 return ret;
2901 if (ret == 1)
2902 return 0;
9c3569b5
TM
2903 }
2904
47564bfb
TT
2905 /*
2906 * grab_cache_page_write_begin() can take a long time if the
2907 * system is thrashing due to memory pressure, or if the page
2908 * is being written back. So grab it first before we start
2909 * the transaction handle. This also allows us to allocate
2910 * the page (if needed) without using GFP_NOFS.
2911 */
2912retry_grab:
2913 page = grab_cache_page_write_begin(mapping, index, flags);
2914 if (!page)
2915 return -ENOMEM;
2916 unlock_page(page);
2917
64769240
AT
2918 /*
2919 * With delayed allocation, we don't log the i_disksize update
2920 * if there is delayed block allocation. But we still need
2921 * to journalling the i_disksize update if writes to the end
2922 * of file which has an already mapped buffer.
2923 */
47564bfb 2924retry_journal:
0ff8947f
ES
2925 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
2926 ext4_da_write_credits(inode, pos, len));
64769240 2927 if (IS_ERR(handle)) {
09cbfeaf 2928 put_page(page);
47564bfb 2929 return PTR_ERR(handle);
64769240
AT
2930 }
2931
47564bfb
TT
2932 lock_page(page);
2933 if (page->mapping != mapping) {
2934 /* The page got truncated from under us */
2935 unlock_page(page);
09cbfeaf 2936 put_page(page);
d5a0d4f7 2937 ext4_journal_stop(handle);
47564bfb 2938 goto retry_grab;
d5a0d4f7 2939 }
47564bfb 2940 /* In case writeback began while the page was unlocked */
7afe5aa5 2941 wait_for_stable_page(page);
64769240 2942
2058f83a
MH
2943#ifdef CONFIG_EXT4_FS_ENCRYPTION
2944 ret = ext4_block_write_begin(page, pos, len,
2945 ext4_da_get_block_prep);
2946#else
6e1db88d 2947 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2058f83a 2948#endif
64769240
AT
2949 if (ret < 0) {
2950 unlock_page(page);
2951 ext4_journal_stop(handle);
ae4d5372
AK
2952 /*
2953 * block_write_begin may have instantiated a few blocks
2954 * outside i_size. Trim these off again. Don't need
2955 * i_size_read because we hold i_mutex.
2956 */
2957 if (pos + len > inode->i_size)
b9a4207d 2958 ext4_truncate_failed_write(inode);
47564bfb
TT
2959
2960 if (ret == -ENOSPC &&
2961 ext4_should_retry_alloc(inode->i_sb, &retries))
2962 goto retry_journal;
2963
09cbfeaf 2964 put_page(page);
47564bfb 2965 return ret;
64769240
AT
2966 }
2967
47564bfb 2968 *pagep = page;
64769240
AT
2969 return ret;
2970}
2971
632eaeab
MC
2972/*
2973 * Check if we should update i_disksize
2974 * when write to the end of file but not require block allocation
2975 */
2976static int ext4_da_should_update_i_disksize(struct page *page,
de9a55b8 2977 unsigned long offset)
632eaeab
MC
2978{
2979 struct buffer_head *bh;
2980 struct inode *inode = page->mapping->host;
2981 unsigned int idx;
2982 int i;
2983
2984 bh = page_buffers(page);
2985 idx = offset >> inode->i_blkbits;
2986
af5bc92d 2987 for (i = 0; i < idx; i++)
632eaeab
MC
2988 bh = bh->b_this_page;
2989
29fa89d0 2990 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
632eaeab
MC
2991 return 0;
2992 return 1;
2993}
2994
64769240 2995static int ext4_da_write_end(struct file *file,
de9a55b8
TT
2996 struct address_space *mapping,
2997 loff_t pos, unsigned len, unsigned copied,
2998 struct page *page, void *fsdata)
64769240
AT
2999{
3000 struct inode *inode = mapping->host;
3001 int ret = 0, ret2;
3002 handle_t *handle = ext4_journal_current_handle();
3003 loff_t new_i_size;
632eaeab 3004 unsigned long start, end;
79f0be8d
AK
3005 int write_mode = (int)(unsigned long)fsdata;
3006
74d553aa
TT
3007 if (write_mode == FALL_BACK_TO_NONDELALLOC)
3008 return ext4_write_end(file, mapping, pos,
3009 len, copied, page, fsdata);
632eaeab 3010
9bffad1e 3011 trace_ext4_da_write_end(inode, pos, len, copied);
09cbfeaf 3012 start = pos & (PAGE_SIZE - 1);
af5bc92d 3013 end = start + copied - 1;
64769240
AT
3014
3015 /*
3016 * generic_write_end() will run mark_inode_dirty() if i_size
3017 * changes. So let's piggyback the i_disksize mark_inode_dirty
3018 * into that.
3019 */
64769240 3020 new_i_size = pos + copied;
ea51d132 3021 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
9c3569b5
TM
3022 if (ext4_has_inline_data(inode) ||
3023 ext4_da_should_update_i_disksize(page, end)) {
ee124d27 3024 ext4_update_i_disksize(inode, new_i_size);
cf17fea6
AK
3025 /* We need to mark inode dirty even if
3026 * new_i_size is less that inode->i_size
3027 * bu greater than i_disksize.(hint delalloc)
3028 */
3029 ext4_mark_inode_dirty(handle, inode);
64769240 3030 }
632eaeab 3031 }
9c3569b5
TM
3032
3033 if (write_mode != CONVERT_INLINE_DATA &&
3034 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
3035 ext4_has_inline_data(inode))
3036 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
3037 page);
3038 else
3039 ret2 = generic_write_end(file, mapping, pos, len, copied,
64769240 3040 page, fsdata);
9c3569b5 3041
64769240
AT
3042 copied = ret2;
3043 if (ret2 < 0)
3044 ret = ret2;
3045 ret2 = ext4_journal_stop(handle);
3046 if (!ret)
3047 ret = ret2;
3048
3049 return ret ? ret : copied;
3050}
3051
d47992f8
LC
3052static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
3053 unsigned int length)
64769240 3054{
64769240
AT
3055 /*
3056 * Drop reserved blocks
3057 */
3058 BUG_ON(!PageLocked(page));
3059 if (!page_has_buffers(page))
3060 goto out;
3061
ca99fdd2 3062 ext4_da_page_release_reservation(page, offset, length);
64769240
AT
3063
3064out:
d47992f8 3065 ext4_invalidatepage(page, offset, length);
64769240
AT
3066
3067 return;
3068}
3069
ccd2506b
TT
3070/*
3071 * Force all delayed allocation blocks to be allocated for a given inode.
3072 */
3073int ext4_alloc_da_blocks(struct inode *inode)
3074{
fb40ba0d
TT
3075 trace_ext4_alloc_da_blocks(inode);
3076
71d4f7d0 3077 if (!EXT4_I(inode)->i_reserved_data_blocks)
ccd2506b
TT
3078 return 0;
3079
3080 /*
3081 * We do something simple for now. The filemap_flush() will
3082 * also start triggering a write of the data blocks, which is
3083 * not strictly speaking necessary (and for users of
3084 * laptop_mode, not even desirable). However, to do otherwise
3085 * would require replicating code paths in:
de9a55b8 3086 *
20970ba6 3087 * ext4_writepages() ->
ccd2506b
TT
3088 * write_cache_pages() ---> (via passed in callback function)
3089 * __mpage_da_writepage() -->
3090 * mpage_add_bh_to_extent()
3091 * mpage_da_map_blocks()
3092 *
3093 * The problem is that write_cache_pages(), located in
3094 * mm/page-writeback.c, marks pages clean in preparation for
3095 * doing I/O, which is not desirable if we're not planning on
3096 * doing I/O at all.
3097 *
3098 * We could call write_cache_pages(), and then redirty all of
380cf090 3099 * the pages by calling redirty_page_for_writepage() but that
ccd2506b
TT
3100 * would be ugly in the extreme. So instead we would need to
3101 * replicate parts of the code in the above functions,
25985edc 3102 * simplifying them because we wouldn't actually intend to
ccd2506b
TT
3103 * write out the pages, but rather only collect contiguous
3104 * logical block extents, call the multi-block allocator, and
3105 * then update the buffer heads with the block allocations.
de9a55b8 3106 *
ccd2506b
TT
3107 * For now, though, we'll cheat by calling filemap_flush(),
3108 * which will map the blocks, and start the I/O, but not
3109 * actually wait for the I/O to complete.
3110 */
3111 return filemap_flush(inode->i_mapping);
3112}
64769240 3113
ac27a0ec
DK
3114/*
3115 * bmap() is special. It gets used by applications such as lilo and by
3116 * the swapper to find the on-disk block of a specific piece of data.
3117 *
3118 * Naturally, this is dangerous if the block concerned is still in the
617ba13b 3119 * journal. If somebody makes a swapfile on an ext4 data-journaling
ac27a0ec
DK
3120 * filesystem and enables swap, then they may get a nasty shock when the
3121 * data getting swapped to that swapfile suddenly gets overwritten by
3122 * the original zero's written out previously to the journal and
3123 * awaiting writeback in the kernel's buffer cache.
3124 *
3125 * So, if we see any bmap calls here on a modified, data-journaled file,
3126 * take extra steps to flush any blocks which might be in the cache.
3127 */
617ba13b 3128static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
ac27a0ec
DK
3129{
3130 struct inode *inode = mapping->host;
3131 journal_t *journal;
3132 int err;
3133
46c7f254
TM
3134 /*
3135 * We can get here for an inline file via the FIBMAP ioctl
3136 */
3137 if (ext4_has_inline_data(inode))
3138 return 0;
3139
64769240
AT
3140 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3141 test_opt(inode->i_sb, DELALLOC)) {
3142 /*
3143 * With delalloc we want to sync the file
3144 * so that we can make sure we allocate
3145 * blocks for file
3146 */
3147 filemap_write_and_wait(mapping);
3148 }
3149
19f5fb7a
TT
3150 if (EXT4_JOURNAL(inode) &&
3151 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
ac27a0ec
DK
3152 /*
3153 * This is a REALLY heavyweight approach, but the use of
3154 * bmap on dirty files is expected to be extremely rare:
3155 * only if we run lilo or swapon on a freshly made file
3156 * do we expect this to happen.
3157 *
3158 * (bmap requires CAP_SYS_RAWIO so this does not
3159 * represent an unprivileged user DOS attack --- we'd be
3160 * in trouble if mortal users could trigger this path at
3161 * will.)
3162 *
617ba13b 3163 * NB. EXT4_STATE_JDATA is not set on files other than
ac27a0ec
DK
3164 * regular files. If somebody wants to bmap a directory
3165 * or symlink and gets confused because the buffer
3166 * hasn't yet been flushed to disk, they deserve
3167 * everything they get.
3168 */
3169
19f5fb7a 3170 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
617ba13b 3171 journal = EXT4_JOURNAL(inode);
dab291af
MC
3172 jbd2_journal_lock_updates(journal);
3173 err = jbd2_journal_flush(journal);
3174 jbd2_journal_unlock_updates(journal);
ac27a0ec
DK
3175
3176 if (err)
3177 return 0;
3178 }
3179
af5bc92d 3180 return generic_block_bmap(mapping, block, ext4_get_block);
ac27a0ec
DK
3181}
3182
617ba13b 3183static int ext4_readpage(struct file *file, struct page *page)
ac27a0ec 3184{
46c7f254
TM
3185 int ret = -EAGAIN;
3186 struct inode *inode = page->mapping->host;
3187
0562e0ba 3188 trace_ext4_readpage(page);
46c7f254
TM
3189
3190 if (ext4_has_inline_data(inode))
3191 ret = ext4_readpage_inline(inode, page);
3192
3193 if (ret == -EAGAIN)
f64e02fe 3194 return ext4_mpage_readpages(page->mapping, NULL, page, 1);
46c7f254
TM
3195
3196 return ret;
ac27a0ec
DK
3197}
3198
3199static int
617ba13b 3200ext4_readpages(struct file *file, struct address_space *mapping,
ac27a0ec
DK
3201 struct list_head *pages, unsigned nr_pages)
3202{
46c7f254
TM
3203 struct inode *inode = mapping->host;
3204
3205 /* If the file has inline data, no need to do readpages. */
3206 if (ext4_has_inline_data(inode))
3207 return 0;
3208
f64e02fe 3209 return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
ac27a0ec
DK
3210}
3211
d47992f8
LC
3212static void ext4_invalidatepage(struct page *page, unsigned int offset,
3213 unsigned int length)
ac27a0ec 3214{
ca99fdd2 3215 trace_ext4_invalidatepage(page, offset, length);
0562e0ba 3216
4520fb3c
JK
3217 /* No journalling happens on data buffers when this function is used */
3218 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
3219
ca99fdd2 3220 block_invalidatepage(page, offset, length);
4520fb3c
JK
3221}
3222
53e87268 3223static int __ext4_journalled_invalidatepage(struct page *page,
ca99fdd2
LC
3224 unsigned int offset,
3225 unsigned int length)
4520fb3c
JK
3226{
3227 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3228
ca99fdd2 3229 trace_ext4_journalled_invalidatepage(page, offset, length);
4520fb3c 3230
ac27a0ec
DK
3231 /*
3232 * If it's a full truncate we just forget about the pending dirtying
3233 */
09cbfeaf 3234 if (offset == 0 && length == PAGE_SIZE)
ac27a0ec
DK
3235 ClearPageChecked(page);
3236
ca99fdd2 3237 return jbd2_journal_invalidatepage(journal, page, offset, length);
53e87268
JK
3238}
3239
3240/* Wrapper for aops... */
3241static void ext4_journalled_invalidatepage(struct page *page,
d47992f8
LC
3242 unsigned int offset,
3243 unsigned int length)
53e87268 3244{
ca99fdd2 3245 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
ac27a0ec
DK
3246}
3247
617ba13b 3248static int ext4_releasepage(struct page *page, gfp_t wait)
ac27a0ec 3249{
617ba13b 3250 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
ac27a0ec 3251
0562e0ba
JZ
3252 trace_ext4_releasepage(page);
3253
e1c36595
JK
3254 /* Page has dirty journalled data -> cannot release */
3255 if (PageChecked(page))
ac27a0ec 3256 return 0;
0390131b
FM
3257 if (journal)
3258 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3259 else
3260 return try_to_free_buffers(page);
ac27a0ec
DK
3261}
3262
ba5843f5 3263#ifdef CONFIG_FS_DAX
12735f88
JK
3264/*
3265 * Get block function for DAX IO and mmap faults. It takes care of converting
3266 * unwritten extents to written ones and initializes new / converted blocks
3267 * to zeros.
3268 */
3269int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3270 struct buffer_head *bh_result, int create)
ed923b57 3271{
7cb476f8 3272 int ret;
ba5843f5 3273
12735f88 3274 ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
7cb476f8
JK
3275 if (!create)
3276 return _ext4_get_block(inode, iblock, bh_result, 0);
ba5843f5 3277
7cb476f8
JK
3278 ret = ext4_get_block_trans(inode, iblock, bh_result,
3279 EXT4_GET_BLOCKS_PRE_IO |
3280 EXT4_GET_BLOCKS_CREATE_ZERO);
3281 if (ret < 0)
3282 return ret;
ba5843f5 3283
7cb476f8 3284 if (buffer_unwritten(bh_result)) {
ba5843f5 3285 /*
12735f88
JK
3286 * We are protected by i_mmap_sem or i_mutex so we know block
3287 * cannot go away from under us even though we dropped
3288 * i_data_sem. Convert extent to written and write zeros there.
ba5843f5 3289 */
7cb476f8
JK
3290 ret = ext4_get_block_trans(inode, iblock, bh_result,
3291 EXT4_GET_BLOCKS_CONVERT |
3292 EXT4_GET_BLOCKS_CREATE_ZERO);
3293 if (ret < 0)
3294 return ret;
ba5843f5 3295 }
7cb476f8
JK
3296 /*
3297 * At least for now we have to clear BH_New so that DAX code
3298 * doesn't attempt to zero blocks again in a racy way.
3299 */
3300 clear_buffer_new(bh_result);
3301 return 0;
ed923b57 3302}
12735f88
JK
3303#else
3304/* Just define empty function, it will never get called. */
3305int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3306 struct buffer_head *bh_result, int create)
3307{
3308 BUG();
3309 return 0;
ed923b57 3310}
ba5843f5 3311#endif
ed923b57 3312
187372a3 3313static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
7b7a8665 3314 ssize_t size, void *private)
4c0425ff 3315{
109811c2 3316 ext4_io_end_t *io_end = private;
4c0425ff 3317
97a851ed 3318 /* if not async direct IO just return */
7b7a8665 3319 if (!io_end)
187372a3 3320 return 0;
4b70df18 3321
88635ca2 3322 ext_debug("ext4_end_io_dio(): io_end 0x%p "
ace36ad4 3323 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
109811c2 3324 io_end, io_end->inode->i_ino, iocb, offset, size);
8d5d02e6 3325
74c66bcb
JK
3326 /*
3327 * Error during AIO DIO. We cannot convert unwritten extents as the
3328 * data was not written. Just clear the unwritten flag and drop io_end.
3329 */
3330 if (size <= 0) {
3331 ext4_clear_io_unwritten_flag(io_end);
3332 size = 0;
3333 }
4c0425ff
MC
3334 io_end->offset = offset;
3335 io_end->size = size;
7b7a8665 3336 ext4_put_io_end(io_end);
187372a3
CH
3337
3338 return 0;
4c0425ff 3339}
c7064ef1 3340
4c0425ff 3341/*
914f82a3
JK
3342 * Handling of direct IO writes.
3343 *
3344 * For ext4 extent files, ext4 will do direct-io write even to holes,
4c0425ff
MC
3345 * preallocated extents, and those write extend the file, no need to
3346 * fall back to buffered IO.
3347 *
556615dc 3348 * For holes, we fallocate those blocks, mark them as unwritten
69c499d1 3349 * If those blocks were preallocated, we mark sure they are split, but
556615dc 3350 * still keep the range to write as unwritten.
4c0425ff 3351 *
69c499d1 3352 * The unwritten extents will be converted to written when DIO is completed.
8d5d02e6 3353 * For async direct IO, since the IO may still pending when return, we
25985edc 3354 * set up an end_io call back function, which will do the conversion
8d5d02e6 3355 * when async direct IO completed.
4c0425ff
MC
3356 *
3357 * If the O_DIRECT write will extend the file then add this inode to the
3358 * orphan list. So recovery will truncate it back to the original size
3359 * if the machine crashes during the write.
3360 *
3361 */
0e01df10 3362static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
4c0425ff
MC
3363{
3364 struct file *file = iocb->ki_filp;
3365 struct inode *inode = file->f_mapping->host;
914f82a3 3366 struct ext4_inode_info *ei = EXT4_I(inode);
4c0425ff 3367 ssize_t ret;
c8b8e32d 3368 loff_t offset = iocb->ki_pos;
a6cbcd4a 3369 size_t count = iov_iter_count(iter);
69c499d1
TT
3370 int overwrite = 0;
3371 get_block_t *get_block_func = NULL;
3372 int dio_flags = 0;
4c0425ff 3373 loff_t final_size = offset + count;
914f82a3
JK
3374 int orphan = 0;
3375 handle_t *handle;
729f52c6 3376
914f82a3
JK
3377 if (final_size > inode->i_size) {
3378 /* Credits for sb + inode write */
3379 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3380 if (IS_ERR(handle)) {
3381 ret = PTR_ERR(handle);
3382 goto out;
3383 }
3384 ret = ext4_orphan_add(handle, inode);
3385 if (ret) {
3386 ext4_journal_stop(handle);
3387 goto out;
3388 }
3389 orphan = 1;
3390 ei->i_disksize = inode->i_size;
3391 ext4_journal_stop(handle);
3392 }
4bd809db 3393
69c499d1 3394 BUG_ON(iocb->private == NULL);
4bd809db 3395
e8340395
JK
3396 /*
3397 * Make all waiters for direct IO properly wait also for extent
3398 * conversion. This also disallows race between truncate() and
3399 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3400 */
914f82a3 3401 inode_dio_begin(inode);
e8340395 3402
69c499d1
TT
3403 /* If we do a overwrite dio, i_mutex locking can be released */
3404 overwrite = *((int *)iocb->private);
4bd809db 3405
2dcba478 3406 if (overwrite)
5955102c 3407 inode_unlock(inode);
8d5d02e6 3408
69c499d1 3409 /*
914f82a3 3410 * For extent mapped files we could direct write to holes and fallocate.
69c499d1 3411 *
109811c2
JK
3412 * Allocated blocks to fill the hole are marked as unwritten to prevent
3413 * parallel buffered read to expose the stale data before DIO complete
3414 * the data IO.
69c499d1 3415 *
109811c2
JK
3416 * As to previously fallocated extents, ext4 get_block will just simply
3417 * mark the buffer mapped but still keep the extents unwritten.
69c499d1 3418 *
109811c2
JK
3419 * For non AIO case, we will convert those unwritten extents to written
3420 * after return back from blockdev_direct_IO. That way we save us from
3421 * allocating io_end structure and also the overhead of offloading
3422 * the extent convertion to a workqueue.
69c499d1
TT
3423 *
3424 * For async DIO, the conversion needs to be deferred when the
3425 * IO is completed. The ext4 end_io callback function will be
3426 * called to take care of the conversion work. Here for async
3427 * case, we allocate an io_end structure to hook to the iocb.
3428 */
3429 iocb->private = NULL;
109811c2 3430 if (overwrite)
705965bd 3431 get_block_func = ext4_dio_get_block_overwrite;
12735f88
JK
3432 else if (IS_DAX(inode)) {
3433 /*
3434 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
3435 * writes need zeroing either because they can race with page
3436 * faults or because they use partial blocks.
3437 */
3438 if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
3439 ext4_aligned_io(inode, offset, count))
3440 get_block_func = ext4_dio_get_block;
3441 else
3442 get_block_func = ext4_dax_get_block;
3443 dio_flags = DIO_LOCKING;
3444 } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3445 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
914f82a3
JK
3446 get_block_func = ext4_dio_get_block;
3447 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3448 } else if (is_sync_kiocb(iocb)) {
109811c2
JK
3449 get_block_func = ext4_dio_get_block_unwritten_sync;
3450 dio_flags = DIO_LOCKING;
69c499d1 3451 } else {
109811c2 3452 get_block_func = ext4_dio_get_block_unwritten_async;
69c499d1
TT
3453 dio_flags = DIO_LOCKING;
3454 }
2058f83a
MH
3455#ifdef CONFIG_EXT4_FS_ENCRYPTION
3456 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3457#endif
914f82a3 3458 if (IS_DAX(inode)) {
c8b8e32d 3459 ret = dax_do_io(iocb, inode, iter, get_block_func,
923ae0ff 3460 ext4_end_io_dio, dio_flags);
914f82a3 3461 } else
17f8c842 3462 ret = __blockdev_direct_IO(iocb, inode,
c8b8e32d 3463 inode->i_sb->s_bdev, iter,
923ae0ff
RZ
3464 get_block_func,
3465 ext4_end_io_dio, NULL, dio_flags);
69c499d1 3466
97a851ed 3467 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
69c499d1
TT
3468 EXT4_STATE_DIO_UNWRITTEN)) {
3469 int err;
3470 /*
3471 * for non AIO case, since the IO is already
3472 * completed, we could do the conversion right here
3473 */
6b523df4 3474 err = ext4_convert_unwritten_extents(NULL, inode,
69c499d1
TT
3475 offset, ret);
3476 if (err < 0)
3477 ret = err;
3478 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3479 }
4bd809db 3480
914f82a3 3481 inode_dio_end(inode);
69c499d1 3482 /* take i_mutex locking again if we do a ovewrite dio */
2dcba478 3483 if (overwrite)
5955102c 3484 inode_lock(inode);
8d5d02e6 3485
914f82a3
JK
3486 if (ret < 0 && final_size > inode->i_size)
3487 ext4_truncate_failed_write(inode);
3488
3489 /* Handle extending of i_size after direct IO write */
3490 if (orphan) {
3491 int err;
3492
3493 /* Credits for sb + inode write */
3494 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3495 if (IS_ERR(handle)) {
3496 /* This is really bad luck. We've written the data
3497 * but cannot extend i_size. Bail out and pretend
3498 * the write failed... */
3499 ret = PTR_ERR(handle);
3500 if (inode->i_nlink)
3501 ext4_orphan_del(NULL, inode);
3502
3503 goto out;
3504 }
3505 if (inode->i_nlink)
3506 ext4_orphan_del(handle, inode);
3507 if (ret > 0) {
3508 loff_t end = offset + ret;
3509 if (end > inode->i_size) {
3510 ei->i_disksize = end;
3511 i_size_write(inode, end);
3512 /*
3513 * We're going to return a positive `ret'
3514 * here due to non-zero-length I/O, so there's
3515 * no way of reporting error returns from
3516 * ext4_mark_inode_dirty() to userspace. So
3517 * ignore it.
3518 */
3519 ext4_mark_inode_dirty(handle, inode);
3520 }
3521 }
3522 err = ext4_journal_stop(handle);
3523 if (ret == 0)
3524 ret = err;
3525 }
3526out:
3527 return ret;
3528}
3529
0e01df10 3530static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
914f82a3 3531{
16c54688
JK
3532 struct address_space *mapping = iocb->ki_filp->f_mapping;
3533 struct inode *inode = mapping->host;
914f82a3
JK
3534 ssize_t ret;
3535
16c54688
JK
3536 /*
3537 * Shared inode_lock is enough for us - it protects against concurrent
3538 * writes & truncates and since we take care of writing back page cache,
3539 * we are protected against page writeback as well.
3540 */
3541 inode_lock_shared(inode);
914f82a3 3542 if (IS_DAX(inode)) {
16c54688 3543 ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
914f82a3 3544 } else {
16c54688
JK
3545 size_t count = iov_iter_count(iter);
3546
3547 ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
3548 iocb->ki_pos + count);
3549 if (ret)
3550 goto out_unlock;
914f82a3 3551 ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
0e01df10 3552 iter, ext4_dio_get_block,
16c54688 3553 NULL, NULL, 0);
914f82a3 3554 }
16c54688
JK
3555out_unlock:
3556 inode_unlock_shared(inode);
69c499d1 3557 return ret;
4c0425ff
MC
3558}
3559
c8b8e32d 3560static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
4c0425ff
MC
3561{
3562 struct file *file = iocb->ki_filp;
3563 struct inode *inode = file->f_mapping->host;
a6cbcd4a 3564 size_t count = iov_iter_count(iter);
c8b8e32d 3565 loff_t offset = iocb->ki_pos;
0562e0ba 3566 ssize_t ret;
4c0425ff 3567
2058f83a
MH
3568#ifdef CONFIG_EXT4_FS_ENCRYPTION
3569 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
3570 return 0;
3571#endif
3572
84ebd795
TT
3573 /*
3574 * If we are doing data journalling we don't support O_DIRECT
3575 */
3576 if (ext4_should_journal_data(inode))
3577 return 0;
3578
46c7f254
TM
3579 /* Let buffer I/O handle the inline data case. */
3580 if (ext4_has_inline_data(inode))
3581 return 0;
3582
6f673763 3583 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
914f82a3 3584 if (iov_iter_rw(iter) == READ)
0e01df10 3585 ret = ext4_direct_IO_read(iocb, iter);
0562e0ba 3586 else
0e01df10 3587 ret = ext4_direct_IO_write(iocb, iter);
6f673763 3588 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
0562e0ba 3589 return ret;
4c0425ff
MC
3590}
3591
ac27a0ec 3592/*
617ba13b 3593 * Pages can be marked dirty completely asynchronously from ext4's journalling
ac27a0ec
DK
3594 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
3595 * much here because ->set_page_dirty is called under VFS locks. The page is
3596 * not necessarily locked.
3597 *
3598 * We cannot just dirty the page and leave attached buffers clean, because the
3599 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
3600 * or jbddirty because all the journalling code will explode.
3601 *
3602 * So what we do is to mark the page "pending dirty" and next time writepage
3603 * is called, propagate that into the buffers appropriately.
3604 */
617ba13b 3605static int ext4_journalled_set_page_dirty(struct page *page)
ac27a0ec
DK
3606{
3607 SetPageChecked(page);
3608 return __set_page_dirty_nobuffers(page);
3609}
3610
74d553aa 3611static const struct address_space_operations ext4_aops = {
8ab22b9a
HH
3612 .readpage = ext4_readpage,
3613 .readpages = ext4_readpages,
43ce1d23 3614 .writepage = ext4_writepage,
20970ba6 3615 .writepages = ext4_writepages,
8ab22b9a 3616 .write_begin = ext4_write_begin,
74d553aa 3617 .write_end = ext4_write_end,
8ab22b9a
HH
3618 .bmap = ext4_bmap,
3619 .invalidatepage = ext4_invalidatepage,
3620 .releasepage = ext4_releasepage,
3621 .direct_IO = ext4_direct_IO,
3622 .migratepage = buffer_migrate_page,
3623 .is_partially_uptodate = block_is_partially_uptodate,
aa261f54 3624 .error_remove_page = generic_error_remove_page,
ac27a0ec
DK
3625};
3626
617ba13b 3627static const struct address_space_operations ext4_journalled_aops = {
8ab22b9a
HH
3628 .readpage = ext4_readpage,
3629 .readpages = ext4_readpages,
43ce1d23 3630 .writepage = ext4_writepage,
20970ba6 3631 .writepages = ext4_writepages,
8ab22b9a
HH
3632 .write_begin = ext4_write_begin,
3633 .write_end = ext4_journalled_write_end,
3634 .set_page_dirty = ext4_journalled_set_page_dirty,
3635 .bmap = ext4_bmap,
4520fb3c 3636 .invalidatepage = ext4_journalled_invalidatepage,
8ab22b9a 3637 .releasepage = ext4_releasepage,
84ebd795 3638 .direct_IO = ext4_direct_IO,
8ab22b9a 3639 .is_partially_uptodate = block_is_partially_uptodate,
aa261f54 3640 .error_remove_page = generic_error_remove_page,
ac27a0ec
DK
3641};
3642
64769240 3643static const struct address_space_operations ext4_da_aops = {
8ab22b9a
HH
3644 .readpage = ext4_readpage,
3645 .readpages = ext4_readpages,
43ce1d23 3646 .writepage = ext4_writepage,
20970ba6 3647 .writepages = ext4_writepages,
8ab22b9a
HH
3648 .write_begin = ext4_da_write_begin,
3649 .write_end = ext4_da_write_end,
3650 .bmap = ext4_bmap,
3651 .invalidatepage = ext4_da_invalidatepage,
3652 .releasepage = ext4_releasepage,
3653 .direct_IO = ext4_direct_IO,
3654 .migratepage = buffer_migrate_page,
3655 .is_partially_uptodate = block_is_partially_uptodate,
aa261f54 3656 .error_remove_page = generic_error_remove_page,
64769240
AT
3657};
3658
617ba13b 3659void ext4_set_aops(struct inode *inode)
ac27a0ec 3660{
3d2b1582
LC
3661 switch (ext4_inode_journal_mode(inode)) {
3662 case EXT4_INODE_ORDERED_DATA_MODE:
3d2b1582 3663 case EXT4_INODE_WRITEBACK_DATA_MODE:
3d2b1582
LC
3664 break;
3665 case EXT4_INODE_JOURNAL_DATA_MODE:
617ba13b 3666 inode->i_mapping->a_ops = &ext4_journalled_aops;
74d553aa 3667 return;
3d2b1582
LC
3668 default:
3669 BUG();
3670 }
74d553aa
TT
3671 if (test_opt(inode->i_sb, DELALLOC))
3672 inode->i_mapping->a_ops = &ext4_da_aops;
3673 else
3674 inode->i_mapping->a_ops = &ext4_aops;
ac27a0ec
DK
3675}
3676
923ae0ff 3677static int __ext4_block_zero_page_range(handle_t *handle,
d863dc36
LC
3678 struct address_space *mapping, loff_t from, loff_t length)
3679{
09cbfeaf
KS
3680 ext4_fsblk_t index = from >> PAGE_SHIFT;
3681 unsigned offset = from & (PAGE_SIZE-1);
923ae0ff 3682 unsigned blocksize, pos;
d863dc36
LC
3683 ext4_lblk_t iblock;
3684 struct inode *inode = mapping->host;
3685 struct buffer_head *bh;
3686 struct page *page;
3687 int err = 0;
3688
09cbfeaf 3689 page = find_or_create_page(mapping, from >> PAGE_SHIFT,
c62d2555 3690 mapping_gfp_constraint(mapping, ~__GFP_FS));
d863dc36
LC
3691 if (!page)
3692 return -ENOMEM;
3693
3694 blocksize = inode->i_sb->s_blocksize;
d863dc36 3695
09cbfeaf 3696 iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
d863dc36
LC
3697
3698 if (!page_has_buffers(page))
3699 create_empty_buffers(page, blocksize, 0);
3700
3701 /* Find the buffer that contains "offset" */
3702 bh = page_buffers(page);
3703 pos = blocksize;
3704 while (offset >= pos) {
3705 bh = bh->b_this_page;
3706 iblock++;
3707 pos += blocksize;
3708 }
d863dc36
LC
3709 if (buffer_freed(bh)) {
3710 BUFFER_TRACE(bh, "freed: skip");
3711 goto unlock;
3712 }
d863dc36
LC
3713 if (!buffer_mapped(bh)) {
3714 BUFFER_TRACE(bh, "unmapped");
3715 ext4_get_block(inode, iblock, bh, 0);
3716 /* unmapped? It's a hole - nothing to do */
3717 if (!buffer_mapped(bh)) {
3718 BUFFER_TRACE(bh, "still unmapped");
3719 goto unlock;
3720 }
3721 }
3722
3723 /* Ok, it's mapped. Make sure it's up-to-date */
3724 if (PageUptodate(page))
3725 set_buffer_uptodate(bh);
3726
3727 if (!buffer_uptodate(bh)) {
3728 err = -EIO;
dfec8a14 3729 ll_rw_block(REQ_OP_READ, 0, 1, &bh);
d863dc36
LC
3730 wait_on_buffer(bh);
3731 /* Uhhuh. Read error. Complain and punt. */
3732 if (!buffer_uptodate(bh))
3733 goto unlock;
c9c7429c
MH
3734 if (S_ISREG(inode->i_mode) &&
3735 ext4_encrypted_inode(inode)) {
3736 /* We expect the key to be set. */
a7550b30 3737 BUG_ON(!fscrypt_has_encryption_key(inode));
09cbfeaf 3738 BUG_ON(blocksize != PAGE_SIZE);
a7550b30 3739 WARN_ON_ONCE(fscrypt_decrypt_page(page));
c9c7429c 3740 }
d863dc36 3741 }
d863dc36
LC
3742 if (ext4_should_journal_data(inode)) {
3743 BUFFER_TRACE(bh, "get write access");
3744 err = ext4_journal_get_write_access(handle, bh);
3745 if (err)
3746 goto unlock;
3747 }
d863dc36 3748 zero_user(page, offset, length);
d863dc36
LC
3749 BUFFER_TRACE(bh, "zeroed end of block");
3750
d863dc36
LC
3751 if (ext4_should_journal_data(inode)) {
3752 err = ext4_handle_dirty_metadata(handle, inode, bh);
0713ed0c 3753 } else {
353eefd3 3754 err = 0;
d863dc36 3755 mark_buffer_dirty(bh);
3957ef53 3756 if (ext4_should_order_data(inode))
ee0876bc 3757 err = ext4_jbd2_inode_add_write(handle, inode);
0713ed0c 3758 }
d863dc36
LC
3759
3760unlock:
3761 unlock_page(page);
09cbfeaf 3762 put_page(page);
d863dc36
LC
3763 return err;
3764}
3765
923ae0ff
RZ
3766/*
3767 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3768 * starting from file offset 'from'. The range to be zero'd must
3769 * be contained with in one block. If the specified range exceeds
3770 * the end of the block it will be shortened to end of the block
3771 * that cooresponds to 'from'
3772 */
3773static int ext4_block_zero_page_range(handle_t *handle,
3774 struct address_space *mapping, loff_t from, loff_t length)
3775{
3776 struct inode *inode = mapping->host;
09cbfeaf 3777 unsigned offset = from & (PAGE_SIZE-1);
923ae0ff
RZ
3778 unsigned blocksize = inode->i_sb->s_blocksize;
3779 unsigned max = blocksize - (offset & (blocksize - 1));
3780
3781 /*
3782 * correct length if it does not fall between
3783 * 'from' and the end of the block
3784 */
3785 if (length > max || length < 0)
3786 length = max;
3787
3788 if (IS_DAX(inode))
3789 return dax_zero_page_range(inode, from, length, ext4_get_block);
3790 return __ext4_block_zero_page_range(handle, mapping, from, length);
3791}
3792
94350ab5
MW
3793/*
3794 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3795 * up to the end of the block which corresponds to `from'.
3796 * This required during truncate. We need to physically zero the tail end
3797 * of that block so it doesn't yield old data if the file is later grown.
3798 */
c197855e 3799static int ext4_block_truncate_page(handle_t *handle,
94350ab5
MW
3800 struct address_space *mapping, loff_t from)
3801{
09cbfeaf 3802 unsigned offset = from & (PAGE_SIZE-1);
94350ab5
MW
3803 unsigned length;
3804 unsigned blocksize;
3805 struct inode *inode = mapping->host;
3806
3807 blocksize = inode->i_sb->s_blocksize;
3808 length = blocksize - (offset & (blocksize - 1));
3809
3810 return ext4_block_zero_page_range(handle, mapping, from, length);
3811}
3812
a87dd18c
LC
3813int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3814 loff_t lstart, loff_t length)
3815{
3816 struct super_block *sb = inode->i_sb;
3817 struct address_space *mapping = inode->i_mapping;
e1be3a92 3818 unsigned partial_start, partial_end;
a87dd18c
LC
3819 ext4_fsblk_t start, end;
3820 loff_t byte_end = (lstart + length - 1);
3821 int err = 0;
3822
e1be3a92
LC
3823 partial_start = lstart & (sb->s_blocksize - 1);
3824 partial_end = byte_end & (sb->s_blocksize - 1);
3825
a87dd18c
LC
3826 start = lstart >> sb->s_blocksize_bits;
3827 end = byte_end >> sb->s_blocksize_bits;
3828
3829 /* Handle partial zero within the single block */
e1be3a92
LC
3830 if (start == end &&
3831 (partial_start || (partial_end != sb->s_blocksize - 1))) {
a87dd18c
LC
3832 err = ext4_block_zero_page_range(handle, mapping,
3833 lstart, length);
3834 return err;
3835 }
3836 /* Handle partial zero out on the start of the range */
e1be3a92 3837 if (partial_start) {
a87dd18c
LC
3838 err = ext4_block_zero_page_range(handle, mapping,
3839 lstart, sb->s_blocksize);
3840 if (err)
3841 return err;
3842 }
3843 /* Handle partial zero out on the end of the range */
e1be3a92 3844 if (partial_end != sb->s_blocksize - 1)
a87dd18c 3845 err = ext4_block_zero_page_range(handle, mapping,
e1be3a92
LC
3846 byte_end - partial_end,
3847 partial_end + 1);
a87dd18c
LC
3848 return err;
3849}
3850
91ef4caf
DG
3851int ext4_can_truncate(struct inode *inode)
3852{
91ef4caf
DG
3853 if (S_ISREG(inode->i_mode))
3854 return 1;
3855 if (S_ISDIR(inode->i_mode))
3856 return 1;
3857 if (S_ISLNK(inode->i_mode))
3858 return !ext4_inode_is_fast_symlink(inode);
3859 return 0;
3860}
3861
01127848
JK
3862/*
3863 * We have to make sure i_disksize gets properly updated before we truncate
3864 * page cache due to hole punching or zero range. Otherwise i_disksize update
3865 * can get lost as it may have been postponed to submission of writeback but
3866 * that will never happen after we truncate page cache.
3867 */
3868int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
3869 loff_t len)
3870{
3871 handle_t *handle;
3872 loff_t size = i_size_read(inode);
3873
5955102c 3874 WARN_ON(!inode_is_locked(inode));
01127848
JK
3875 if (offset > size || offset + len < size)
3876 return 0;
3877
3878 if (EXT4_I(inode)->i_disksize >= size)
3879 return 0;
3880
3881 handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
3882 if (IS_ERR(handle))
3883 return PTR_ERR(handle);
3884 ext4_update_i_disksize(inode, size);
3885 ext4_mark_inode_dirty(handle, inode);
3886 ext4_journal_stop(handle);
3887
3888 return 0;
3889}
3890
a4bb6b64 3891/*
cca32b7e 3892 * ext4_punch_hole: punches a hole in a file by releasing the blocks
a4bb6b64
AH
3893 * associated with the given offset and length
3894 *
3895 * @inode: File inode
3896 * @offset: The offset where the hole will begin
3897 * @len: The length of the hole
3898 *
4907cb7b 3899 * Returns: 0 on success or negative on failure
a4bb6b64
AH
3900 */
3901
aeb2817a 3902int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
a4bb6b64 3903{
26a4c0c6
TT
3904 struct super_block *sb = inode->i_sb;
3905 ext4_lblk_t first_block, stop_block;
3906 struct address_space *mapping = inode->i_mapping;
a87dd18c 3907 loff_t first_block_offset, last_block_offset;
26a4c0c6
TT
3908 handle_t *handle;
3909 unsigned int credits;
3910 int ret = 0;
3911
a4bb6b64 3912 if (!S_ISREG(inode->i_mode))
73355192 3913 return -EOPNOTSUPP;
a4bb6b64 3914
b8a86845 3915 trace_ext4_punch_hole(inode, offset, length, 0);
aaddea81 3916
26a4c0c6
TT
3917 /*
3918 * Write out all dirty pages to avoid race conditions
3919 * Then release them.
3920 */
cca32b7e 3921 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
26a4c0c6
TT
3922 ret = filemap_write_and_wait_range(mapping, offset,
3923 offset + length - 1);
3924 if (ret)
3925 return ret;
3926 }
3927
5955102c 3928 inode_lock(inode);
9ef06cec 3929
26a4c0c6
TT
3930 /* No need to punch hole beyond i_size */
3931 if (offset >= inode->i_size)
3932 goto out_mutex;
3933
3934 /*
3935 * If the hole extends beyond i_size, set the hole
3936 * to end after the page that contains i_size
3937 */
3938 if (offset + length > inode->i_size) {
3939 length = inode->i_size +
09cbfeaf 3940 PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
26a4c0c6
TT
3941 offset;
3942 }
3943
a361293f
JK
3944 if (offset & (sb->s_blocksize - 1) ||
3945 (offset + length) & (sb->s_blocksize - 1)) {
3946 /*
3947 * Attach jinode to inode for jbd2 if we do any zeroing of
3948 * partial block
3949 */
3950 ret = ext4_inode_attach_jinode(inode);
3951 if (ret < 0)
3952 goto out_mutex;
3953
3954 }
3955
ea3d7209
JK
3956 /* Wait all existing dio workers, newcomers will block on i_mutex */
3957 ext4_inode_block_unlocked_dio(inode);
3958 inode_dio_wait(inode);
3959
3960 /*
3961 * Prevent page faults from reinstantiating pages we have released from
3962 * page cache.
3963 */
3964 down_write(&EXT4_I(inode)->i_mmap_sem);
a87dd18c
LC
3965 first_block_offset = round_up(offset, sb->s_blocksize);
3966 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
26a4c0c6 3967
a87dd18c 3968 /* Now release the pages and zero block aligned part of pages*/
01127848
JK
3969 if (last_block_offset > first_block_offset) {
3970 ret = ext4_update_disksize_before_punch(inode, offset, length);
3971 if (ret)
3972 goto out_dio;
a87dd18c
LC
3973 truncate_pagecache_range(inode, first_block_offset,
3974 last_block_offset);
01127848 3975 }
26a4c0c6
TT
3976
3977 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3978 credits = ext4_writepage_trans_blocks(inode);
3979 else
3980 credits = ext4_blocks_for_truncate(inode);
3981 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
3982 if (IS_ERR(handle)) {
3983 ret = PTR_ERR(handle);
3984 ext4_std_error(sb, ret);
3985 goto out_dio;
3986 }
3987
a87dd18c
LC
3988 ret = ext4_zero_partial_blocks(handle, inode, offset,
3989 length);
3990 if (ret)
3991 goto out_stop;
26a4c0c6
TT
3992
3993 first_block = (offset + sb->s_blocksize - 1) >>
3994 EXT4_BLOCK_SIZE_BITS(sb);
3995 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
3996
3997 /* If there are no blocks to remove, return now */
3998 if (first_block >= stop_block)
3999 goto out_stop;
4000
4001 down_write(&EXT4_I(inode)->i_data_sem);
4002 ext4_discard_preallocations(inode);
4003
4004 ret = ext4_es_remove_extent(inode, first_block,
4005 stop_block - first_block);
4006 if (ret) {
4007 up_write(&EXT4_I(inode)->i_data_sem);
4008 goto out_stop;
4009 }
4010
4011 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4012 ret = ext4_ext_remove_space(inode, first_block,
4013 stop_block - 1);
4014 else
4f579ae7 4015 ret = ext4_ind_remove_space(handle, inode, first_block,
26a4c0c6
TT
4016 stop_block);
4017
819c4920 4018 up_write(&EXT4_I(inode)->i_data_sem);
26a4c0c6
TT
4019 if (IS_SYNC(inode))
4020 ext4_handle_sync(handle);
e251f9bc 4021
26a4c0c6
TT
4022 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4023 ext4_mark_inode_dirty(handle, inode);
4024out_stop:
4025 ext4_journal_stop(handle);
4026out_dio:
ea3d7209 4027 up_write(&EXT4_I(inode)->i_mmap_sem);
26a4c0c6
TT
4028 ext4_inode_resume_unlocked_dio(inode);
4029out_mutex:
5955102c 4030 inode_unlock(inode);
26a4c0c6 4031 return ret;
a4bb6b64
AH
4032}
4033
a361293f
JK
4034int ext4_inode_attach_jinode(struct inode *inode)
4035{
4036 struct ext4_inode_info *ei = EXT4_I(inode);
4037 struct jbd2_inode *jinode;
4038
4039 if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
4040 return 0;
4041
4042 jinode = jbd2_alloc_inode(GFP_KERNEL);
4043 spin_lock(&inode->i_lock);
4044 if (!ei->jinode) {
4045 if (!jinode) {
4046 spin_unlock(&inode->i_lock);
4047 return -ENOMEM;
4048 }
4049 ei->jinode = jinode;
4050 jbd2_journal_init_jbd_inode(ei->jinode, inode);
4051 jinode = NULL;
4052 }
4053 spin_unlock(&inode->i_lock);
4054 if (unlikely(jinode != NULL))
4055 jbd2_free_inode(jinode);
4056 return 0;
4057}
4058
ac27a0ec 4059/*
617ba13b 4060 * ext4_truncate()
ac27a0ec 4061 *
617ba13b
MC
4062 * We block out ext4_get_block() block instantiations across the entire
4063 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
ac27a0ec
DK
4064 * simultaneously on behalf of the same inode.
4065 *
42b2aa86 4066 * As we work through the truncate and commit bits of it to the journal there
ac27a0ec
DK
4067 * is one core, guiding principle: the file's tree must always be consistent on
4068 * disk. We must be able to restart the truncate after a crash.
4069 *
4070 * The file's tree may be transiently inconsistent in memory (although it
4071 * probably isn't), but whenever we close off and commit a journal transaction,
4072 * the contents of (the filesystem + the journal) must be consistent and
4073 * restartable. It's pretty simple, really: bottom up, right to left (although
4074 * left-to-right works OK too).
4075 *
4076 * Note that at recovery time, journal replay occurs *before* the restart of
4077 * truncate against the orphan inode list.
4078 *
4079 * The committed inode has the new, desired i_size (which is the same as
617ba13b 4080 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
ac27a0ec 4081 * that this inode's truncate did not complete and it will again call
617ba13b
MC
4082 * ext4_truncate() to have another go. So there will be instantiated blocks
4083 * to the right of the truncation point in a crashed ext4 filesystem. But
ac27a0ec 4084 * that's fine - as long as they are linked from the inode, the post-crash
617ba13b 4085 * ext4_truncate() run will find them and release them.
ac27a0ec 4086 */
617ba13b 4087void ext4_truncate(struct inode *inode)
ac27a0ec 4088{
819c4920
TT
4089 struct ext4_inode_info *ei = EXT4_I(inode);
4090 unsigned int credits;
4091 handle_t *handle;
4092 struct address_space *mapping = inode->i_mapping;
819c4920 4093
19b5ef61
TT
4094 /*
4095 * There is a possibility that we're either freeing the inode
e04027e8 4096 * or it's a completely new inode. In those cases we might not
19b5ef61
TT
4097 * have i_mutex locked because it's not necessary.
4098 */
4099 if (!(inode->i_state & (I_NEW|I_FREEING)))
5955102c 4100 WARN_ON(!inode_is_locked(inode));
0562e0ba
JZ
4101 trace_ext4_truncate_enter(inode);
4102
91ef4caf 4103 if (!ext4_can_truncate(inode))
ac27a0ec
DK
4104 return;
4105
12e9b892 4106 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
c8d46e41 4107
5534fb5b 4108 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
19f5fb7a 4109 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
7d8f9f7d 4110
aef1c851
TM
4111 if (ext4_has_inline_data(inode)) {
4112 int has_inline = 1;
4113
4114 ext4_inline_data_truncate(inode, &has_inline);
4115 if (has_inline)
4116 return;
4117 }
4118
a361293f
JK
4119 /* If we zero-out tail of the page, we have to create jinode for jbd2 */
4120 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
4121 if (ext4_inode_attach_jinode(inode) < 0)
4122 return;
4123 }
4124
819c4920
TT
4125 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4126 credits = ext4_writepage_trans_blocks(inode);
4127 else
4128 credits = ext4_blocks_for_truncate(inode);
4129
4130 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
4131 if (IS_ERR(handle)) {
4132 ext4_std_error(inode->i_sb, PTR_ERR(handle));
4133 return;
4134 }
4135
eb3544c6
LC
4136 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
4137 ext4_block_truncate_page(handle, mapping, inode->i_size);
819c4920
TT
4138
4139 /*
4140 * We add the inode to the orphan list, so that if this
4141 * truncate spans multiple transactions, and we crash, we will
4142 * resume the truncate when the filesystem recovers. It also
4143 * marks the inode dirty, to catch the new size.
4144 *
4145 * Implication: the file must always be in a sane, consistent
4146 * truncatable state while each transaction commits.
4147 */
4148 if (ext4_orphan_add(handle, inode))
4149 goto out_stop;
4150
4151 down_write(&EXT4_I(inode)->i_data_sem);
4152
4153 ext4_discard_preallocations(inode);
4154
ff9893dc 4155 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
819c4920 4156 ext4_ext_truncate(handle, inode);
ff9893dc 4157 else
819c4920
TT
4158 ext4_ind_truncate(handle, inode);
4159
4160 up_write(&ei->i_data_sem);
4161
4162 if (IS_SYNC(inode))
4163 ext4_handle_sync(handle);
4164
4165out_stop:
4166 /*
4167 * If this was a simple ftruncate() and the file will remain alive,
4168 * then we need to clear up the orphan record which we created above.
4169 * However, if this was a real unlink then we were called by
58d86a50 4170 * ext4_evict_inode(), and we allow that function to clean up the
819c4920
TT
4171 * orphan info for us.
4172 */
4173 if (inode->i_nlink)
4174 ext4_orphan_del(handle, inode);
4175
4176 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4177 ext4_mark_inode_dirty(handle, inode);
4178 ext4_journal_stop(handle);
ac27a0ec 4179
0562e0ba 4180 trace_ext4_truncate_exit(inode);
ac27a0ec
DK
4181}
4182
ac27a0ec 4183/*
617ba13b 4184 * ext4_get_inode_loc returns with an extra refcount against the inode's
ac27a0ec
DK
4185 * underlying buffer_head on success. If 'in_mem' is true, we have all
4186 * data in memory that is needed to recreate the on-disk version of this
4187 * inode.
4188 */
617ba13b
MC
4189static int __ext4_get_inode_loc(struct inode *inode,
4190 struct ext4_iloc *iloc, int in_mem)
ac27a0ec 4191{
240799cd
TT
4192 struct ext4_group_desc *gdp;
4193 struct buffer_head *bh;
4194 struct super_block *sb = inode->i_sb;
4195 ext4_fsblk_t block;
4196 int inodes_per_block, inode_offset;
4197
3a06d778 4198 iloc->bh = NULL;
240799cd 4199 if (!ext4_valid_inum(sb, inode->i_ino))
6a797d27 4200 return -EFSCORRUPTED;
ac27a0ec 4201
240799cd
TT
4202 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4203 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4204 if (!gdp)
ac27a0ec
DK
4205 return -EIO;
4206
240799cd
TT
4207 /*
4208 * Figure out the offset within the block group inode table
4209 */
00d09882 4210 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
240799cd
TT
4211 inode_offset = ((inode->i_ino - 1) %
4212 EXT4_INODES_PER_GROUP(sb));
4213 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4214 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4215
4216 bh = sb_getblk(sb, block);
aebf0243 4217 if (unlikely(!bh))
860d21e2 4218 return -ENOMEM;
ac27a0ec
DK
4219 if (!buffer_uptodate(bh)) {
4220 lock_buffer(bh);
9c83a923
HK
4221
4222 /*
4223 * If the buffer has the write error flag, we have failed
4224 * to write out another inode in the same block. In this
4225 * case, we don't have to read the block because we may
4226 * read the old inode data successfully.
4227 */
4228 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4229 set_buffer_uptodate(bh);
4230
ac27a0ec
DK
4231 if (buffer_uptodate(bh)) {
4232 /* someone brought it uptodate while we waited */
4233 unlock_buffer(bh);
4234 goto has_buffer;
4235 }
4236
4237 /*
4238 * If we have all information of the inode in memory and this
4239 * is the only valid inode in the block, we need not read the
4240 * block.
4241 */
4242 if (in_mem) {
4243 struct buffer_head *bitmap_bh;
240799cd 4244 int i, start;
ac27a0ec 4245
240799cd 4246 start = inode_offset & ~(inodes_per_block - 1);
ac27a0ec 4247
240799cd
TT
4248 /* Is the inode bitmap in cache? */
4249 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
aebf0243 4250 if (unlikely(!bitmap_bh))
ac27a0ec
DK
4251 goto make_io;
4252
4253 /*
4254 * If the inode bitmap isn't in cache then the
4255 * optimisation may end up performing two reads instead
4256 * of one, so skip it.
4257 */
4258 if (!buffer_uptodate(bitmap_bh)) {
4259 brelse(bitmap_bh);
4260 goto make_io;
4261 }
240799cd 4262 for (i = start; i < start + inodes_per_block; i++) {
ac27a0ec
DK
4263 if (i == inode_offset)
4264 continue;
617ba13b 4265 if (ext4_test_bit(i, bitmap_bh->b_data))
ac27a0ec
DK
4266 break;
4267 }
4268 brelse(bitmap_bh);
240799cd 4269 if (i == start + inodes_per_block) {
ac27a0ec
DK
4270 /* all other inodes are free, so skip I/O */
4271 memset(bh->b_data, 0, bh->b_size);
4272 set_buffer_uptodate(bh);
4273 unlock_buffer(bh);
4274 goto has_buffer;
4275 }
4276 }
4277
4278make_io:
240799cd
TT
4279 /*
4280 * If we need to do any I/O, try to pre-readahead extra
4281 * blocks from the inode table.
4282 */
4283 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4284 ext4_fsblk_t b, end, table;
4285 unsigned num;
0d606e2c 4286 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
240799cd
TT
4287
4288 table = ext4_inode_table(sb, gdp);
b713a5ec 4289 /* s_inode_readahead_blks is always a power of 2 */
0d606e2c 4290 b = block & ~((ext4_fsblk_t) ra_blks - 1);
240799cd
TT
4291 if (table > b)
4292 b = table;
0d606e2c 4293 end = b + ra_blks;
240799cd 4294 num = EXT4_INODES_PER_GROUP(sb);
feb0ab32 4295 if (ext4_has_group_desc_csum(sb))
560671a0 4296 num -= ext4_itable_unused_count(sb, gdp);
240799cd
TT
4297 table += num / inodes_per_block;
4298 if (end > table)
4299 end = table;
4300 while (b <= end)
4301 sb_breadahead(sb, b++);
4302 }
4303
ac27a0ec
DK
4304 /*
4305 * There are other valid inodes in the buffer, this inode
4306 * has in-inode xattrs, or we don't have this inode in memory.
4307 * Read the block from disk.
4308 */
0562e0ba 4309 trace_ext4_load_inode(inode);
ac27a0ec
DK
4310 get_bh(bh);
4311 bh->b_end_io = end_buffer_read_sync;
2a222ca9 4312 submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
ac27a0ec
DK
4313 wait_on_buffer(bh);
4314 if (!buffer_uptodate(bh)) {
c398eda0
TT
4315 EXT4_ERROR_INODE_BLOCK(inode, block,
4316 "unable to read itable block");
ac27a0ec
DK
4317 brelse(bh);
4318 return -EIO;
4319 }
4320 }
4321has_buffer:
4322 iloc->bh = bh;
4323 return 0;
4324}
4325
617ba13b 4326int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
ac27a0ec
DK
4327{
4328 /* We have all inode data except xattrs in memory here. */
617ba13b 4329 return __ext4_get_inode_loc(inode, iloc,
19f5fb7a 4330 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
ac27a0ec
DK
4331}
4332
617ba13b 4333void ext4_set_inode_flags(struct inode *inode)
ac27a0ec 4334{
617ba13b 4335 unsigned int flags = EXT4_I(inode)->i_flags;
00a1a053 4336 unsigned int new_fl = 0;
ac27a0ec 4337
617ba13b 4338 if (flags & EXT4_SYNC_FL)
00a1a053 4339 new_fl |= S_SYNC;
617ba13b 4340 if (flags & EXT4_APPEND_FL)
00a1a053 4341 new_fl |= S_APPEND;
617ba13b 4342 if (flags & EXT4_IMMUTABLE_FL)
00a1a053 4343 new_fl |= S_IMMUTABLE;
617ba13b 4344 if (flags & EXT4_NOATIME_FL)
00a1a053 4345 new_fl |= S_NOATIME;
617ba13b 4346 if (flags & EXT4_DIRSYNC_FL)
00a1a053 4347 new_fl |= S_DIRSYNC;
0a6cf913 4348 if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
923ae0ff 4349 new_fl |= S_DAX;
5f16f322 4350 inode_set_flags(inode, new_fl,
923ae0ff 4351 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
ac27a0ec
DK
4352}
4353
ff9ddf7e
JK
4354/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4355void ext4_get_inode_flags(struct ext4_inode_info *ei)
4356{
84a8dce2
DM
4357 unsigned int vfs_fl;
4358 unsigned long old_fl, new_fl;
4359
4360 do {
4361 vfs_fl = ei->vfs_inode.i_flags;
4362 old_fl = ei->i_flags;
4363 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4364 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4365 EXT4_DIRSYNC_FL);
4366 if (vfs_fl & S_SYNC)
4367 new_fl |= EXT4_SYNC_FL;
4368 if (vfs_fl & S_APPEND)
4369 new_fl |= EXT4_APPEND_FL;
4370 if (vfs_fl & S_IMMUTABLE)
4371 new_fl |= EXT4_IMMUTABLE_FL;
4372 if (vfs_fl & S_NOATIME)
4373 new_fl |= EXT4_NOATIME_FL;
4374 if (vfs_fl & S_DIRSYNC)
4375 new_fl |= EXT4_DIRSYNC_FL;
4376 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
ff9ddf7e 4377}
de9a55b8 4378
0fc1b451 4379static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
de9a55b8 4380 struct ext4_inode_info *ei)
0fc1b451
AK
4381{
4382 blkcnt_t i_blocks ;
8180a562
AK
4383 struct inode *inode = &(ei->vfs_inode);
4384 struct super_block *sb = inode->i_sb;
0fc1b451 4385
e2b911c5 4386 if (ext4_has_feature_huge_file(sb)) {
0fc1b451
AK
4387 /* we are using combined 48 bit field */
4388 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4389 le32_to_cpu(raw_inode->i_blocks_lo);
07a03824 4390 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
8180a562
AK
4391 /* i_blocks represent file system block size */
4392 return i_blocks << (inode->i_blkbits - 9);
4393 } else {
4394 return i_blocks;
4395 }
0fc1b451
AK
4396 } else {
4397 return le32_to_cpu(raw_inode->i_blocks_lo);
4398 }
4399}
ff9ddf7e 4400
152a7b0a
TM
4401static inline void ext4_iget_extra_inode(struct inode *inode,
4402 struct ext4_inode *raw_inode,
4403 struct ext4_inode_info *ei)
4404{
4405 __le32 *magic = (void *)raw_inode +
4406 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
67cf5b09 4407 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
152a7b0a 4408 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
67cf5b09 4409 ext4_find_inline_data_nolock(inode);
f19d5870
TM
4410 } else
4411 EXT4_I(inode)->i_inline_off = 0;
152a7b0a
TM
4412}
4413
040cb378
LX
4414int ext4_get_projid(struct inode *inode, kprojid_t *projid)
4415{
0b7b7779 4416 if (!ext4_has_feature_project(inode->i_sb))
040cb378
LX
4417 return -EOPNOTSUPP;
4418 *projid = EXT4_I(inode)->i_projid;
4419 return 0;
4420}
4421
1d1fe1ee 4422struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ac27a0ec 4423{
617ba13b
MC
4424 struct ext4_iloc iloc;
4425 struct ext4_inode *raw_inode;
1d1fe1ee 4426 struct ext4_inode_info *ei;
1d1fe1ee 4427 struct inode *inode;
b436b9be 4428 journal_t *journal = EXT4_SB(sb)->s_journal;
1d1fe1ee 4429 long ret;
ac27a0ec 4430 int block;
08cefc7a
EB
4431 uid_t i_uid;
4432 gid_t i_gid;
040cb378 4433 projid_t i_projid;
ac27a0ec 4434
1d1fe1ee
DH
4435 inode = iget_locked(sb, ino);
4436 if (!inode)
4437 return ERR_PTR(-ENOMEM);
4438 if (!(inode->i_state & I_NEW))
4439 return inode;
4440
4441 ei = EXT4_I(inode);
7dc57615 4442 iloc.bh = NULL;
ac27a0ec 4443
1d1fe1ee
DH
4444 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4445 if (ret < 0)
ac27a0ec 4446 goto bad_inode;
617ba13b 4447 raw_inode = ext4_raw_inode(&iloc);
814525f4
DW
4448
4449 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4450 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4451 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4452 EXT4_INODE_SIZE(inode->i_sb)) {
4453 EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
4454 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
4455 EXT4_INODE_SIZE(inode->i_sb));
6a797d27 4456 ret = -EFSCORRUPTED;
814525f4
DW
4457 goto bad_inode;
4458 }
4459 } else
4460 ei->i_extra_isize = 0;
4461
4462 /* Precompute checksum seed for inode metadata */
9aa5d32b 4463 if (ext4_has_metadata_csum(sb)) {
814525f4
DW
4464 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4465 __u32 csum;
4466 __le32 inum = cpu_to_le32(inode->i_ino);
4467 __le32 gen = raw_inode->i_generation;
4468 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
4469 sizeof(inum));
4470 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
4471 sizeof(gen));
4472 }
4473
4474 if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
4475 EXT4_ERROR_INODE(inode, "checksum invalid");
6a797d27 4476 ret = -EFSBADCRC;
814525f4
DW
4477 goto bad_inode;
4478 }
4479
ac27a0ec 4480 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
08cefc7a
EB
4481 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4482 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
0b7b7779 4483 if (ext4_has_feature_project(sb) &&
040cb378
LX
4484 EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4485 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4486 i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
4487 else
4488 i_projid = EXT4_DEF_PROJID;
4489
af5bc92d 4490 if (!(test_opt(inode->i_sb, NO_UID32))) {
08cefc7a
EB
4491 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4492 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
ac27a0ec 4493 }
08cefc7a
EB
4494 i_uid_write(inode, i_uid);
4495 i_gid_write(inode, i_gid);
040cb378 4496 ei->i_projid = make_kprojid(&init_user_ns, i_projid);
bfe86848 4497 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ac27a0ec 4498
353eb83c 4499 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
67cf5b09 4500 ei->i_inline_off = 0;
ac27a0ec
DK
4501 ei->i_dir_start_lookup = 0;
4502 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4503 /* We now have enough fields to check if the inode was active or not.
4504 * This is needed because nfsd might try to access dead inodes
4505 * the test is that same one that e2fsck uses
4506 * NeilBrown 1999oct15
4507 */
4508 if (inode->i_nlink == 0) {
393d1d1d
DTB
4509 if ((inode->i_mode == 0 ||
4510 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
4511 ino != EXT4_BOOT_LOADER_INO) {
ac27a0ec 4512 /* this inode is deleted */
1d1fe1ee 4513 ret = -ESTALE;
ac27a0ec
DK
4514 goto bad_inode;
4515 }
4516 /* The only unlinked inodes we let through here have
4517 * valid i_mode and are being read by the orphan
4518 * recovery code: that's fine, we're about to complete
393d1d1d
DTB
4519 * the process of deleting those.
4520 * OR it is the EXT4_BOOT_LOADER_INO which is
4521 * not initialized on a new filesystem. */
ac27a0ec 4522 }
ac27a0ec 4523 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
0fc1b451 4524 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
7973c0c1 4525 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
e2b911c5 4526 if (ext4_has_feature_64bit(sb))
a1ddeb7e
BP
4527 ei->i_file_acl |=
4528 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
a48380f7 4529 inode->i_size = ext4_isize(raw_inode);
ac27a0ec 4530 ei->i_disksize = inode->i_size;
a9e7f447
DM
4531#ifdef CONFIG_QUOTA
4532 ei->i_reserved_quota = 0;
4533#endif
ac27a0ec
DK
4534 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4535 ei->i_block_group = iloc.block_group;
a4912123 4536 ei->i_last_alloc_group = ~0;
ac27a0ec
DK
4537 /*
4538 * NOTE! The in-memory inode i_data array is in little-endian order
4539 * even on big-endian machines: we do NOT byteswap the block numbers!
4540 */
617ba13b 4541 for (block = 0; block < EXT4_N_BLOCKS; block++)
ac27a0ec
DK
4542 ei->i_data[block] = raw_inode->i_block[block];
4543 INIT_LIST_HEAD(&ei->i_orphan);
4544
b436b9be
JK
4545 /*
4546 * Set transaction id's of transactions that have to be committed
4547 * to finish f[data]sync. We set them to currently running transaction
4548 * as we cannot be sure that the inode or some of its metadata isn't
4549 * part of the transaction - the inode could have been reclaimed and
4550 * now it is reread from disk.
4551 */
4552 if (journal) {
4553 transaction_t *transaction;
4554 tid_t tid;
4555
a931da6a 4556 read_lock(&journal->j_state_lock);
b436b9be
JK
4557 if (journal->j_running_transaction)
4558 transaction = journal->j_running_transaction;
4559 else
4560 transaction = journal->j_committing_transaction;
4561 if (transaction)
4562 tid = transaction->t_tid;
4563 else
4564 tid = journal->j_commit_sequence;
a931da6a 4565 read_unlock(&journal->j_state_lock);
b436b9be
JK
4566 ei->i_sync_tid = tid;
4567 ei->i_datasync_tid = tid;
4568 }
4569
0040d987 4570 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ac27a0ec
DK
4571 if (ei->i_extra_isize == 0) {
4572 /* The extra space is currently unused. Use it. */
617ba13b
MC
4573 ei->i_extra_isize = sizeof(struct ext4_inode) -
4574 EXT4_GOOD_OLD_INODE_SIZE;
ac27a0ec 4575 } else {
152a7b0a 4576 ext4_iget_extra_inode(inode, raw_inode, ei);
ac27a0ec 4577 }
814525f4 4578 }
ac27a0ec 4579
ef7f3835
KS
4580 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4581 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4582 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4583 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4584
ed3654eb 4585 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
c4f65706
TT
4586 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4587 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4588 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4589 inode->i_version |=
4590 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4591 }
25ec56b5
JNC
4592 }
4593
c4b5a614 4594 ret = 0;
485c26ec 4595 if (ei->i_file_acl &&
1032988c 4596 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
24676da4
TT
4597 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
4598 ei->i_file_acl);
6a797d27 4599 ret = -EFSCORRUPTED;
485c26ec 4600 goto bad_inode;
f19d5870
TM
4601 } else if (!ext4_has_inline_data(inode)) {
4602 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4603 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4604 (S_ISLNK(inode->i_mode) &&
4605 !ext4_inode_is_fast_symlink(inode))))
4606 /* Validate extent which is part of inode */
4607 ret = ext4_ext_check_inode(inode);
4608 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4609 (S_ISLNK(inode->i_mode) &&
4610 !ext4_inode_is_fast_symlink(inode))) {
4611 /* Validate block references which are part of inode */
4612 ret = ext4_ind_check_inode(inode);
4613 }
fe2c8191 4614 }
567f3e9a 4615 if (ret)
de9a55b8 4616 goto bad_inode;
7a262f7c 4617
ac27a0ec 4618 if (S_ISREG(inode->i_mode)) {
617ba13b 4619 inode->i_op = &ext4_file_inode_operations;
be64f884 4620 inode->i_fop = &ext4_file_operations;
617ba13b 4621 ext4_set_aops(inode);
ac27a0ec 4622 } else if (S_ISDIR(inode->i_mode)) {
617ba13b
MC
4623 inode->i_op = &ext4_dir_inode_operations;
4624 inode->i_fop = &ext4_dir_operations;
ac27a0ec 4625 } else if (S_ISLNK(inode->i_mode)) {
a7a67e8a
AV
4626 if (ext4_encrypted_inode(inode)) {
4627 inode->i_op = &ext4_encrypted_symlink_inode_operations;
4628 ext4_set_aops(inode);
4629 } else if (ext4_inode_is_fast_symlink(inode)) {
75e7566b 4630 inode->i_link = (char *)ei->i_data;
617ba13b 4631 inode->i_op = &ext4_fast_symlink_inode_operations;
e83c1397
DG
4632 nd_terminate_link(ei->i_data, inode->i_size,
4633 sizeof(ei->i_data) - 1);
4634 } else {
617ba13b
MC
4635 inode->i_op = &ext4_symlink_inode_operations;
4636 ext4_set_aops(inode);
ac27a0ec 4637 }
21fc61c7 4638 inode_nohighmem(inode);
563bdd61
TT
4639 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4640 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
617ba13b 4641 inode->i_op = &ext4_special_inode_operations;
ac27a0ec
DK
4642 if (raw_inode->i_block[0])
4643 init_special_inode(inode, inode->i_mode,
4644 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
4645 else
4646 init_special_inode(inode, inode->i_mode,
4647 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
393d1d1d
DTB
4648 } else if (ino == EXT4_BOOT_LOADER_INO) {
4649 make_bad_inode(inode);
563bdd61 4650 } else {
6a797d27 4651 ret = -EFSCORRUPTED;
24676da4 4652 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
563bdd61 4653 goto bad_inode;
ac27a0ec 4654 }
af5bc92d 4655 brelse(iloc.bh);
617ba13b 4656 ext4_set_inode_flags(inode);
1d1fe1ee
DH
4657 unlock_new_inode(inode);
4658 return inode;
ac27a0ec
DK
4659
4660bad_inode:
567f3e9a 4661 brelse(iloc.bh);
1d1fe1ee
DH
4662 iget_failed(inode);
4663 return ERR_PTR(ret);
ac27a0ec
DK
4664}
4665
f4bb2981
TT
4666struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
4667{
4668 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
6a797d27 4669 return ERR_PTR(-EFSCORRUPTED);
f4bb2981
TT
4670 return ext4_iget(sb, ino);
4671}
4672
0fc1b451
AK
4673static int ext4_inode_blocks_set(handle_t *handle,
4674 struct ext4_inode *raw_inode,
4675 struct ext4_inode_info *ei)
4676{
4677 struct inode *inode = &(ei->vfs_inode);
4678 u64 i_blocks = inode->i_blocks;
4679 struct super_block *sb = inode->i_sb;
0fc1b451
AK
4680
4681 if (i_blocks <= ~0U) {
4682 /*
4907cb7b 4683 * i_blocks can be represented in a 32 bit variable
0fc1b451
AK
4684 * as multiple of 512 bytes
4685 */
8180a562 4686 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
0fc1b451 4687 raw_inode->i_blocks_high = 0;
84a8dce2 4688 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
f287a1a5
TT
4689 return 0;
4690 }
e2b911c5 4691 if (!ext4_has_feature_huge_file(sb))
f287a1a5
TT
4692 return -EFBIG;
4693
4694 if (i_blocks <= 0xffffffffffffULL) {
0fc1b451
AK
4695 /*
4696 * i_blocks can be represented in a 48 bit variable
4697 * as multiple of 512 bytes
4698 */
8180a562 4699 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
0fc1b451 4700 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
84a8dce2 4701 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
0fc1b451 4702 } else {
84a8dce2 4703 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
8180a562
AK
4704 /* i_block is stored in file system block size */
4705 i_blocks = i_blocks >> (inode->i_blkbits - 9);
4706 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
4707 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
0fc1b451 4708 }
f287a1a5 4709 return 0;
0fc1b451
AK
4710}
4711
a26f4992
TT
4712struct other_inode {
4713 unsigned long orig_ino;
4714 struct ext4_inode *raw_inode;
4715};
4716
4717static int other_inode_match(struct inode * inode, unsigned long ino,
4718 void *data)
4719{
4720 struct other_inode *oi = (struct other_inode *) data;
4721
4722 if ((inode->i_ino != ino) ||
4723 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4724 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
4725 ((inode->i_state & I_DIRTY_TIME) == 0))
4726 return 0;
4727 spin_lock(&inode->i_lock);
4728 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4729 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
4730 (inode->i_state & I_DIRTY_TIME)) {
4731 struct ext4_inode_info *ei = EXT4_I(inode);
4732
4733 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
4734 spin_unlock(&inode->i_lock);
4735
4736 spin_lock(&ei->i_raw_lock);
4737 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
4738 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
4739 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
4740 ext4_inode_csum_set(inode, oi->raw_inode, ei);
4741 spin_unlock(&ei->i_raw_lock);
4742 trace_ext4_other_inode_update_time(inode, oi->orig_ino);
4743 return -1;
4744 }
4745 spin_unlock(&inode->i_lock);
4746 return -1;
4747}
4748
4749/*
4750 * Opportunistically update the other time fields for other inodes in
4751 * the same inode table block.
4752 */
4753static void ext4_update_other_inodes_time(struct super_block *sb,
4754 unsigned long orig_ino, char *buf)
4755{
4756 struct other_inode oi;
4757 unsigned long ino;
4758 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4759 int inode_size = EXT4_INODE_SIZE(sb);
4760
4761 oi.orig_ino = orig_ino;
0f0ff9a9
TT
4762 /*
4763 * Calculate the first inode in the inode table block. Inode
4764 * numbers are one-based. That is, the first inode in a block
4765 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
4766 */
4767 ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
a26f4992
TT
4768 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
4769 if (ino == orig_ino)
4770 continue;
4771 oi.raw_inode = (struct ext4_inode *) buf;
4772 (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
4773 }
4774}
4775
ac27a0ec
DK
4776/*
4777 * Post the struct inode info into an on-disk inode location in the
4778 * buffer-cache. This gobbles the caller's reference to the
4779 * buffer_head in the inode location struct.
4780 *
4781 * The caller must have write access to iloc->bh.
4782 */
617ba13b 4783static int ext4_do_update_inode(handle_t *handle,
ac27a0ec 4784 struct inode *inode,
830156c7 4785 struct ext4_iloc *iloc)
ac27a0ec 4786{
617ba13b
MC
4787 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4788 struct ext4_inode_info *ei = EXT4_I(inode);
ac27a0ec 4789 struct buffer_head *bh = iloc->bh;
202ee5df 4790 struct super_block *sb = inode->i_sb;
ac27a0ec 4791 int err = 0, rc, block;
202ee5df 4792 int need_datasync = 0, set_large_file = 0;
08cefc7a
EB
4793 uid_t i_uid;
4794 gid_t i_gid;
040cb378 4795 projid_t i_projid;
ac27a0ec 4796
202ee5df
TT
4797 spin_lock(&ei->i_raw_lock);
4798
4799 /* For fields not tracked in the in-memory inode,
ac27a0ec 4800 * initialise them to zero for new inodes. */
19f5fb7a 4801 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
617ba13b 4802 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ac27a0ec 4803
ff9ddf7e 4804 ext4_get_inode_flags(ei);
ac27a0ec 4805 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
08cefc7a
EB
4806 i_uid = i_uid_read(inode);
4807 i_gid = i_gid_read(inode);
040cb378 4808 i_projid = from_kprojid(&init_user_ns, ei->i_projid);
af5bc92d 4809 if (!(test_opt(inode->i_sb, NO_UID32))) {
08cefc7a
EB
4810 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
4811 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
ac27a0ec
DK
4812/*
4813 * Fix up interoperability with old kernels. Otherwise, old inodes get
4814 * re-used with the upper 16 bits of the uid/gid intact
4815 */
93e3b4e6
DJ
4816 if (ei->i_dtime && list_empty(&ei->i_orphan)) {
4817 raw_inode->i_uid_high = 0;
4818 raw_inode->i_gid_high = 0;
4819 } else {
ac27a0ec 4820 raw_inode->i_uid_high =
08cefc7a 4821 cpu_to_le16(high_16_bits(i_uid));
ac27a0ec 4822 raw_inode->i_gid_high =
08cefc7a 4823 cpu_to_le16(high_16_bits(i_gid));
ac27a0ec
DK
4824 }
4825 } else {
08cefc7a
EB
4826 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
4827 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
ac27a0ec
DK
4828 raw_inode->i_uid_high = 0;
4829 raw_inode->i_gid_high = 0;
4830 }
4831 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
ef7f3835
KS
4832
4833 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4834 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4835 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4836 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4837
bce92d56
LX
4838 err = ext4_inode_blocks_set(handle, raw_inode, ei);
4839 if (err) {
202ee5df 4840 spin_unlock(&ei->i_raw_lock);
0fc1b451 4841 goto out_brelse;
202ee5df 4842 }
ac27a0ec 4843 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
353eb83c 4844 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
ed3654eb 4845 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
a1ddeb7e
BP
4846 raw_inode->i_file_acl_high =
4847 cpu_to_le16(ei->i_file_acl >> 32);
7973c0c1 4848 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
b71fc079
JK
4849 if (ei->i_disksize != ext4_isize(raw_inode)) {
4850 ext4_isize_set(raw_inode, ei->i_disksize);
4851 need_datasync = 1;
4852 }
a48380f7 4853 if (ei->i_disksize > 0x7fffffffULL) {
e2b911c5 4854 if (!ext4_has_feature_large_file(sb) ||
a48380f7 4855 EXT4_SB(sb)->s_es->s_rev_level ==
202ee5df
TT
4856 cpu_to_le32(EXT4_GOOD_OLD_REV))
4857 set_large_file = 1;
ac27a0ec
DK
4858 }
4859 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4860 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4861 if (old_valid_dev(inode->i_rdev)) {
4862 raw_inode->i_block[0] =
4863 cpu_to_le32(old_encode_dev(inode->i_rdev));
4864 raw_inode->i_block[1] = 0;
4865 } else {
4866 raw_inode->i_block[0] = 0;
4867 raw_inode->i_block[1] =
4868 cpu_to_le32(new_encode_dev(inode->i_rdev));
4869 raw_inode->i_block[2] = 0;
4870 }
f19d5870 4871 } else if (!ext4_has_inline_data(inode)) {
de9a55b8
TT
4872 for (block = 0; block < EXT4_N_BLOCKS; block++)
4873 raw_inode->i_block[block] = ei->i_data[block];
f19d5870 4874 }
ac27a0ec 4875
ed3654eb 4876 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
c4f65706
TT
4877 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4878 if (ei->i_extra_isize) {
4879 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4880 raw_inode->i_version_hi =
4881 cpu_to_le32(inode->i_version >> 32);
4882 raw_inode->i_extra_isize =
4883 cpu_to_le16(ei->i_extra_isize);
4884 }
25ec56b5 4885 }
040cb378 4886
0b7b7779 4887 BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
040cb378
LX
4888 i_projid != EXT4_DEF_PROJID);
4889
4890 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
4891 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
4892 raw_inode->i_projid = cpu_to_le32(i_projid);
4893
814525f4 4894 ext4_inode_csum_set(inode, raw_inode, ei);
202ee5df 4895 spin_unlock(&ei->i_raw_lock);
a26f4992
TT
4896 if (inode->i_sb->s_flags & MS_LAZYTIME)
4897 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
4898 bh->b_data);
202ee5df 4899
830156c7 4900 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
73b50c1c 4901 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
830156c7
FM
4902 if (!err)
4903 err = rc;
19f5fb7a 4904 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
202ee5df 4905 if (set_large_file) {
5d601255 4906 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
202ee5df
TT
4907 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
4908 if (err)
4909 goto out_brelse;
4910 ext4_update_dynamic_rev(sb);
e2b911c5 4911 ext4_set_feature_large_file(sb);
202ee5df
TT
4912 ext4_handle_sync(handle);
4913 err = ext4_handle_dirty_super(handle, sb);
4914 }
b71fc079 4915 ext4_update_inode_fsync_trans(handle, inode, need_datasync);
ac27a0ec 4916out_brelse:
af5bc92d 4917 brelse(bh);
617ba13b 4918 ext4_std_error(inode->i_sb, err);
ac27a0ec
DK
4919 return err;
4920}
4921
4922/*
617ba13b 4923 * ext4_write_inode()
ac27a0ec
DK
4924 *
4925 * We are called from a few places:
4926 *
87f7e416 4927 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
ac27a0ec 4928 * Here, there will be no transaction running. We wait for any running
4907cb7b 4929 * transaction to commit.
ac27a0ec 4930 *
87f7e416
TT
4931 * - Within flush work (sys_sync(), kupdate and such).
4932 * We wait on commit, if told to.
ac27a0ec 4933 *
87f7e416
TT
4934 * - Within iput_final() -> write_inode_now()
4935 * We wait on commit, if told to.
ac27a0ec
DK
4936 *
4937 * In all cases it is actually safe for us to return without doing anything,
4938 * because the inode has been copied into a raw inode buffer in
87f7e416
TT
4939 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
4940 * writeback.
ac27a0ec
DK
4941 *
4942 * Note that we are absolutely dependent upon all inode dirtiers doing the
4943 * right thing: they *must* call mark_inode_dirty() after dirtying info in
4944 * which we are interested.
4945 *
4946 * It would be a bug for them to not do this. The code:
4947 *
4948 * mark_inode_dirty(inode)
4949 * stuff();
4950 * inode->i_size = expr;
4951 *
87f7e416
TT
4952 * is in error because write_inode() could occur while `stuff()' is running,
4953 * and the new i_size will be lost. Plus the inode will no longer be on the
4954 * superblock's dirty inode list.
ac27a0ec 4955 */
a9185b41 4956int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
ac27a0ec 4957{
91ac6f43
FM
4958 int err;
4959
87f7e416 4960 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
ac27a0ec
DK
4961 return 0;
4962
91ac6f43
FM
4963 if (EXT4_SB(inode->i_sb)->s_journal) {
4964 if (ext4_journal_current_handle()) {
4965 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4966 dump_stack();
4967 return -EIO;
4968 }
ac27a0ec 4969
10542c22
JK
4970 /*
4971 * No need to force transaction in WB_SYNC_NONE mode. Also
4972 * ext4_sync_fs() will force the commit after everything is
4973 * written.
4974 */
4975 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
91ac6f43
FM
4976 return 0;
4977
4978 err = ext4_force_commit(inode->i_sb);
4979 } else {
4980 struct ext4_iloc iloc;
ac27a0ec 4981
8b472d73 4982 err = __ext4_get_inode_loc(inode, &iloc, 0);
91ac6f43
FM
4983 if (err)
4984 return err;
10542c22
JK
4985 /*
4986 * sync(2) will flush the whole buffer cache. No need to do
4987 * it here separately for each inode.
4988 */
4989 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
830156c7
FM
4990 sync_dirty_buffer(iloc.bh);
4991 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
c398eda0
TT
4992 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
4993 "IO error syncing inode");
830156c7
FM
4994 err = -EIO;
4995 }
fd2dd9fb 4996 brelse(iloc.bh);
91ac6f43
FM
4997 }
4998 return err;
ac27a0ec
DK
4999}
5000
53e87268
JK
5001/*
5002 * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
5003 * buffers that are attached to a page stradding i_size and are undergoing
5004 * commit. In that case we have to wait for commit to finish and try again.
5005 */
5006static void ext4_wait_for_tail_page_commit(struct inode *inode)
5007{
5008 struct page *page;
5009 unsigned offset;
5010 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
5011 tid_t commit_tid = 0;
5012 int ret;
5013
09cbfeaf 5014 offset = inode->i_size & (PAGE_SIZE - 1);
53e87268
JK
5015 /*
5016 * All buffers in the last page remain valid? Then there's nothing to
ea1754a0 5017 * do. We do the check mainly to optimize the common PAGE_SIZE ==
53e87268
JK
5018 * blocksize case
5019 */
09cbfeaf 5020 if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
53e87268
JK
5021 return;
5022 while (1) {
5023 page = find_lock_page(inode->i_mapping,
09cbfeaf 5024 inode->i_size >> PAGE_SHIFT);
53e87268
JK
5025 if (!page)
5026 return;
ca99fdd2 5027 ret = __ext4_journalled_invalidatepage(page, offset,
09cbfeaf 5028 PAGE_SIZE - offset);
53e87268 5029 unlock_page(page);
09cbfeaf 5030 put_page(page);
53e87268
JK
5031 if (ret != -EBUSY)
5032 return;
5033 commit_tid = 0;
5034 read_lock(&journal->j_state_lock);
5035 if (journal->j_committing_transaction)
5036 commit_tid = journal->j_committing_transaction->t_tid;
5037 read_unlock(&journal->j_state_lock);
5038 if (commit_tid)
5039 jbd2_log_wait_commit(journal, commit_tid);
5040 }
5041}
5042
ac27a0ec 5043/*
617ba13b 5044 * ext4_setattr()
ac27a0ec
DK
5045 *
5046 * Called from notify_change.
5047 *
5048 * We want to trap VFS attempts to truncate the file as soon as
5049 * possible. In particular, we want to make sure that when the VFS
5050 * shrinks i_size, we put the inode on the orphan list and modify
5051 * i_disksize immediately, so that during the subsequent flushing of
5052 * dirty pages and freeing of disk blocks, we can guarantee that any
5053 * commit will leave the blocks being flushed in an unused state on
5054 * disk. (On recovery, the inode will get truncated and the blocks will
5055 * be freed, so we have a strong guarantee that no future commit will
5056 * leave these blocks visible to the user.)
5057 *
678aaf48
JK
5058 * Another thing we have to assure is that if we are in ordered mode
5059 * and inode is still attached to the committing transaction, we must
5060 * we start writeout of all the dirty pages which are being truncated.
5061 * This way we are sure that all the data written in the previous
5062 * transaction are already on disk (truncate waits for pages under
5063 * writeback).
5064 *
5065 * Called with inode->i_mutex down.
ac27a0ec 5066 */
617ba13b 5067int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ac27a0ec 5068{
2b0143b5 5069 struct inode *inode = d_inode(dentry);
ac27a0ec 5070 int error, rc = 0;
3d287de3 5071 int orphan = 0;
ac27a0ec
DK
5072 const unsigned int ia_valid = attr->ia_valid;
5073
31051c85 5074 error = setattr_prepare(dentry, attr);
ac27a0ec
DK
5075 if (error)
5076 return error;
5077
a7cdadee
JK
5078 if (is_quota_modification(inode, attr)) {
5079 error = dquot_initialize(inode);
5080 if (error)
5081 return error;
5082 }
08cefc7a
EB
5083 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
5084 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
ac27a0ec
DK
5085 handle_t *handle;
5086
5087 /* (user+group)*(old+new) structure, inode write (sb,
5088 * inode block, ? - but truncate inode update has it) */
9924a92a
TT
5089 handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5090 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
5091 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
ac27a0ec
DK
5092 if (IS_ERR(handle)) {
5093 error = PTR_ERR(handle);
5094 goto err_out;
5095 }
b43fa828 5096 error = dquot_transfer(inode, attr);
ac27a0ec 5097 if (error) {
617ba13b 5098 ext4_journal_stop(handle);
ac27a0ec
DK
5099 return error;
5100 }
5101 /* Update corresponding info in inode so that everything is in
5102 * one transaction */
5103 if (attr->ia_valid & ATTR_UID)
5104 inode->i_uid = attr->ia_uid;
5105 if (attr->ia_valid & ATTR_GID)
5106 inode->i_gid = attr->ia_gid;
617ba13b
MC
5107 error = ext4_mark_inode_dirty(handle, inode);
5108 ext4_journal_stop(handle);
ac27a0ec
DK
5109 }
5110
3da40c7b 5111 if (attr->ia_valid & ATTR_SIZE) {
5208386c 5112 handle_t *handle;
3da40c7b
JB
5113 loff_t oldsize = inode->i_size;
5114 int shrink = (attr->ia_size <= inode->i_size);
562c72aa 5115
12e9b892 5116 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
e2b46574
ES
5117 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5118
0c095c7f
TT
5119 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5120 return -EFBIG;
e2b46574 5121 }
3da40c7b
JB
5122 if (!S_ISREG(inode->i_mode))
5123 return -EINVAL;
dff6efc3
CH
5124
5125 if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
5126 inode_inc_iversion(inode);
5127
3da40c7b 5128 if (ext4_should_order_data(inode) &&
5208386c 5129 (attr->ia_size < inode->i_size)) {
3da40c7b 5130 error = ext4_begin_ordered_truncate(inode,
678aaf48 5131 attr->ia_size);
3da40c7b
JB
5132 if (error)
5133 goto err_out;
5134 }
5135 if (attr->ia_size != inode->i_size) {
5208386c
JK
5136 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
5137 if (IS_ERR(handle)) {
5138 error = PTR_ERR(handle);
5139 goto err_out;
5140 }
3da40c7b 5141 if (ext4_handle_valid(handle) && shrink) {
5208386c
JK
5142 error = ext4_orphan_add(handle, inode);
5143 orphan = 1;
5144 }
911af577
EG
5145 /*
5146 * Update c/mtime on truncate up, ext4_truncate() will
5147 * update c/mtime in shrink case below
5148 */
5149 if (!shrink) {
5150 inode->i_mtime = ext4_current_time(inode);
5151 inode->i_ctime = inode->i_mtime;
5152 }
90e775b7 5153 down_write(&EXT4_I(inode)->i_data_sem);
5208386c
JK
5154 EXT4_I(inode)->i_disksize = attr->ia_size;
5155 rc = ext4_mark_inode_dirty(handle, inode);
5156 if (!error)
5157 error = rc;
90e775b7
JK
5158 /*
5159 * We have to update i_size under i_data_sem together
5160 * with i_disksize to avoid races with writeback code
5161 * running ext4_wb_update_i_disksize().
5162 */
5163 if (!error)
5164 i_size_write(inode, attr->ia_size);
5165 up_write(&EXT4_I(inode)->i_data_sem);
5208386c
JK
5166 ext4_journal_stop(handle);
5167 if (error) {
3da40c7b
JB
5168 if (orphan)
5169 ext4_orphan_del(NULL, inode);
678aaf48
JK
5170 goto err_out;
5171 }
d6320cbf 5172 }
3da40c7b
JB
5173 if (!shrink)
5174 pagecache_isize_extended(inode, oldsize, inode->i_size);
53e87268 5175
5208386c
JK
5176 /*
5177 * Blocks are going to be removed from the inode. Wait
5178 * for dio in flight. Temporarily disable
5179 * dioread_nolock to prevent livelock.
5180 */
5181 if (orphan) {
5182 if (!ext4_should_journal_data(inode)) {
5183 ext4_inode_block_unlocked_dio(inode);
5184 inode_dio_wait(inode);
5185 ext4_inode_resume_unlocked_dio(inode);
5186 } else
5187 ext4_wait_for_tail_page_commit(inode);
1c9114f9 5188 }
ea3d7209 5189 down_write(&EXT4_I(inode)->i_mmap_sem);
5208386c
JK
5190 /*
5191 * Truncate pagecache after we've waited for commit
5192 * in data=journal mode to make pages freeable.
5193 */
923ae0ff 5194 truncate_pagecache(inode, inode->i_size);
3da40c7b
JB
5195 if (shrink)
5196 ext4_truncate(inode);
ea3d7209 5197 up_write(&EXT4_I(inode)->i_mmap_sem);
072bd7ea 5198 }
ac27a0ec 5199
1025774c
CH
5200 if (!rc) {
5201 setattr_copy(inode, attr);
5202 mark_inode_dirty(inode);
5203 }
5204
5205 /*
5206 * If the call to ext4_truncate failed to get a transaction handle at
5207 * all, we need to clean up the in-core orphan list manually.
5208 */
3d287de3 5209 if (orphan && inode->i_nlink)
617ba13b 5210 ext4_orphan_del(NULL, inode);
ac27a0ec
DK
5211
5212 if (!rc && (ia_valid & ATTR_MODE))
64e178a7 5213 rc = posix_acl_chmod(inode, inode->i_mode);
ac27a0ec
DK
5214
5215err_out:
617ba13b 5216 ext4_std_error(inode->i_sb, error);
ac27a0ec
DK
5217 if (!error)
5218 error = rc;
5219 return error;
5220}
5221
3e3398a0
MC
5222int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5223 struct kstat *stat)
5224{
5225 struct inode *inode;
8af8eecc 5226 unsigned long long delalloc_blocks;
3e3398a0 5227
2b0143b5 5228 inode = d_inode(dentry);
3e3398a0
MC
5229 generic_fillattr(inode, stat);
5230
9206c561
AD
5231 /*
5232 * If there is inline data in the inode, the inode will normally not
5233 * have data blocks allocated (it may have an external xattr block).
5234 * Report at least one sector for such files, so tools like tar, rsync,
5235 * others doen't incorrectly think the file is completely sparse.
5236 */
5237 if (unlikely(ext4_has_inline_data(inode)))
5238 stat->blocks += (stat->size + 511) >> 9;
5239
3e3398a0
MC
5240 /*
5241 * We can't update i_blocks if the block allocation is delayed
5242 * otherwise in the case of system crash before the real block
5243 * allocation is done, we will have i_blocks inconsistent with
5244 * on-disk file blocks.
5245 * We always keep i_blocks updated together with real
5246 * allocation. But to not confuse with user, stat
5247 * will return the blocks that include the delayed allocation
5248 * blocks for this file.
5249 */
96607551 5250 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
9206c561
AD
5251 EXT4_I(inode)->i_reserved_data_blocks);
5252 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
3e3398a0
MC
5253 return 0;
5254}
ac27a0ec 5255
fffb2739
JK
5256static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
5257 int pextents)
a02908f1 5258{
12e9b892 5259 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
fffb2739
JK
5260 return ext4_ind_trans_blocks(inode, lblocks);
5261 return ext4_ext_index_trans_blocks(inode, pextents);
a02908f1 5262}
ac51d837 5263
ac27a0ec 5264/*
a02908f1
MC
5265 * Account for index blocks, block groups bitmaps and block group
5266 * descriptor blocks if modify datablocks and index blocks
5267 * worse case, the indexs blocks spread over different block groups
ac27a0ec 5268 *
a02908f1 5269 * If datablocks are discontiguous, they are possible to spread over
4907cb7b 5270 * different block groups too. If they are contiguous, with flexbg,
a02908f1 5271 * they could still across block group boundary.
ac27a0ec 5272 *
a02908f1
MC
5273 * Also account for superblock, inode, quota and xattr blocks
5274 */
fffb2739
JK
5275static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
5276 int pextents)
a02908f1 5277{
8df9675f
TT
5278 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5279 int gdpblocks;
a02908f1
MC
5280 int idxblocks;
5281 int ret = 0;
5282
5283 /*
fffb2739
JK
5284 * How many index blocks need to touch to map @lblocks logical blocks
5285 * to @pextents physical extents?
a02908f1 5286 */
fffb2739 5287 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
a02908f1
MC
5288
5289 ret = idxblocks;
5290
5291 /*
5292 * Now let's see how many group bitmaps and group descriptors need
5293 * to account
5294 */
fffb2739 5295 groups = idxblocks + pextents;
a02908f1 5296 gdpblocks = groups;
8df9675f
TT
5297 if (groups > ngroups)
5298 groups = ngroups;
a02908f1
MC
5299 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5300 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5301
5302 /* bitmaps and block group descriptor blocks */
5303 ret += groups + gdpblocks;
5304
5305 /* Blocks for super block, inode, quota and xattr blocks */
5306 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5307
5308 return ret;
5309}
5310
5311/*
25985edc 5312 * Calculate the total number of credits to reserve to fit
f3bd1f3f
MC
5313 * the modification of a single pages into a single transaction,
5314 * which may include multiple chunks of block allocations.
ac27a0ec 5315 *
525f4ed8 5316 * This could be called via ext4_write_begin()
ac27a0ec 5317 *
525f4ed8 5318 * We need to consider the worse case, when
a02908f1 5319 * one new block per extent.
ac27a0ec 5320 */
a86c6181 5321int ext4_writepage_trans_blocks(struct inode *inode)
ac27a0ec 5322{
617ba13b 5323 int bpp = ext4_journal_blocks_per_page(inode);
ac27a0ec
DK
5324 int ret;
5325
fffb2739 5326 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
a86c6181 5327
a02908f1 5328 /* Account for data blocks for journalled mode */
617ba13b 5329 if (ext4_should_journal_data(inode))
a02908f1 5330 ret += bpp;
ac27a0ec
DK
5331 return ret;
5332}
f3bd1f3f
MC
5333
5334/*
5335 * Calculate the journal credits for a chunk of data modification.
5336 *
5337 * This is called from DIO, fallocate or whoever calling
79e83036 5338 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
f3bd1f3f
MC
5339 *
5340 * journal buffers for data blocks are not included here, as DIO
5341 * and fallocate do no need to journal data buffers.
5342 */
5343int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5344{
5345 return ext4_meta_trans_blocks(inode, nrblocks, 1);
5346}
5347
ac27a0ec 5348/*
617ba13b 5349 * The caller must have previously called ext4_reserve_inode_write().
ac27a0ec
DK
5350 * Give this, we know that the caller already has write access to iloc->bh.
5351 */
617ba13b 5352int ext4_mark_iloc_dirty(handle_t *handle,
de9a55b8 5353 struct inode *inode, struct ext4_iloc *iloc)
ac27a0ec
DK
5354{
5355 int err = 0;
5356
c64db50e 5357 if (IS_I_VERSION(inode))
25ec56b5
JNC
5358 inode_inc_iversion(inode);
5359
ac27a0ec
DK
5360 /* the do_update_inode consumes one bh->b_count */
5361 get_bh(iloc->bh);
5362
dab291af 5363 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
830156c7 5364 err = ext4_do_update_inode(handle, inode, iloc);
ac27a0ec
DK
5365 put_bh(iloc->bh);
5366 return err;
5367}
5368
5369/*
5370 * On success, We end up with an outstanding reference count against
5371 * iloc->bh. This _must_ be cleaned up later.
5372 */
5373
5374int
617ba13b
MC
5375ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5376 struct ext4_iloc *iloc)
ac27a0ec 5377{
0390131b
FM
5378 int err;
5379
5380 err = ext4_get_inode_loc(inode, iloc);
5381 if (!err) {
5382 BUFFER_TRACE(iloc->bh, "get_write_access");
5383 err = ext4_journal_get_write_access(handle, iloc->bh);
5384 if (err) {
5385 brelse(iloc->bh);
5386 iloc->bh = NULL;
ac27a0ec
DK
5387 }
5388 }
617ba13b 5389 ext4_std_error(inode->i_sb, err);
ac27a0ec
DK
5390 return err;
5391}
5392
6dd4ee7c
KS
5393/*
5394 * Expand an inode by new_extra_isize bytes.
5395 * Returns 0 on success or negative error number on failure.
5396 */
1d03ec98
AK
5397static int ext4_expand_extra_isize(struct inode *inode,
5398 unsigned int new_extra_isize,
5399 struct ext4_iloc iloc,
5400 handle_t *handle)
6dd4ee7c
KS
5401{
5402 struct ext4_inode *raw_inode;
5403 struct ext4_xattr_ibody_header *header;
6dd4ee7c
KS
5404
5405 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5406 return 0;
5407
5408 raw_inode = ext4_raw_inode(&iloc);
5409
5410 header = IHDR(inode, raw_inode);
6dd4ee7c
KS
5411
5412 /* No extended attributes present */
19f5fb7a
TT
5413 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5414 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
6dd4ee7c
KS
5415 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5416 new_extra_isize);
5417 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5418 return 0;
5419 }
5420
5421 /* try to expand with EAs present */
5422 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5423 raw_inode, handle);
5424}
5425
ac27a0ec
DK
5426/*
5427 * What we do here is to mark the in-core inode as clean with respect to inode
5428 * dirtiness (it may still be data-dirty).
5429 * This means that the in-core inode may be reaped by prune_icache
5430 * without having to perform any I/O. This is a very good thing,
5431 * because *any* task may call prune_icache - even ones which
5432 * have a transaction open against a different journal.
5433 *
5434 * Is this cheating? Not really. Sure, we haven't written the
5435 * inode out, but prune_icache isn't a user-visible syncing function.
5436 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5437 * we start and wait on commits.
ac27a0ec 5438 */
617ba13b 5439int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
ac27a0ec 5440{
617ba13b 5441 struct ext4_iloc iloc;
6dd4ee7c
KS
5442 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5443 static unsigned int mnt_count;
5444 int err, ret;
ac27a0ec
DK
5445
5446 might_sleep();
7ff9c073 5447 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
617ba13b 5448 err = ext4_reserve_inode_write(handle, inode, &iloc);
5e1021f2
EG
5449 if (err)
5450 return err;
0390131b
FM
5451 if (ext4_handle_valid(handle) &&
5452 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
19f5fb7a 5453 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
6dd4ee7c
KS
5454 /*
5455 * We need extra buffer credits since we may write into EA block
5456 * with this same handle. If journal_extend fails, then it will
5457 * only result in a minor loss of functionality for that inode.
5458 * If this is felt to be critical, then e2fsck should be run to
5459 * force a large enough s_min_extra_isize.
5460 */
5461 if ((jbd2_journal_extend(handle,
5462 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5463 ret = ext4_expand_extra_isize(inode,
5464 sbi->s_want_extra_isize,
5465 iloc, handle);
5466 if (ret) {
c1bddad9
AK
5467 if (mnt_count !=
5468 le16_to_cpu(sbi->s_es->s_mnt_count)) {
12062ddd 5469 ext4_warning(inode->i_sb,
6dd4ee7c
KS
5470 "Unable to expand inode %lu. Delete"
5471 " some EAs or run e2fsck.",
5472 inode->i_ino);
c1bddad9
AK
5473 mnt_count =
5474 le16_to_cpu(sbi->s_es->s_mnt_count);
6dd4ee7c
KS
5475 }
5476 }
5477 }
5478 }
5e1021f2 5479 return ext4_mark_iloc_dirty(handle, inode, &iloc);
ac27a0ec
DK
5480}
5481
5482/*
617ba13b 5483 * ext4_dirty_inode() is called from __mark_inode_dirty()
ac27a0ec
DK
5484 *
5485 * We're really interested in the case where a file is being extended.
5486 * i_size has been changed by generic_commit_write() and we thus need
5487 * to include the updated inode in the current transaction.
5488 *
5dd4056d 5489 * Also, dquot_alloc_block() will always dirty the inode when blocks
ac27a0ec
DK
5490 * are allocated to the file.
5491 *
5492 * If the inode is marked synchronous, we don't honour that here - doing
5493 * so would cause a commit on atime updates, which we don't bother doing.
5494 * We handle synchronous inodes at the highest possible level.
0ae45f63
TT
5495 *
5496 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
5497 * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
5498 * to copy into the on-disk inode structure are the timestamp files.
ac27a0ec 5499 */
aa385729 5500void ext4_dirty_inode(struct inode *inode, int flags)
ac27a0ec 5501{
ac27a0ec
DK
5502 handle_t *handle;
5503
0ae45f63
TT
5504 if (flags == I_DIRTY_TIME)
5505 return;
9924a92a 5506 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
ac27a0ec
DK
5507 if (IS_ERR(handle))
5508 goto out;
f3dc272f 5509
f3dc272f
CW
5510 ext4_mark_inode_dirty(handle, inode);
5511
617ba13b 5512 ext4_journal_stop(handle);
ac27a0ec
DK
5513out:
5514 return;
5515}
5516
5517#if 0
5518/*
5519 * Bind an inode's backing buffer_head into this transaction, to prevent
5520 * it from being flushed to disk early. Unlike
617ba13b 5521 * ext4_reserve_inode_write, this leaves behind no bh reference and
ac27a0ec
DK
5522 * returns no iloc structure, so the caller needs to repeat the iloc
5523 * lookup to mark the inode dirty later.
5524 */
617ba13b 5525static int ext4_pin_inode(handle_t *handle, struct inode *inode)
ac27a0ec 5526{
617ba13b 5527 struct ext4_iloc iloc;
ac27a0ec
DK
5528
5529 int err = 0;
5530 if (handle) {
617ba13b 5531 err = ext4_get_inode_loc(inode, &iloc);
ac27a0ec
DK
5532 if (!err) {
5533 BUFFER_TRACE(iloc.bh, "get_write_access");
dab291af 5534 err = jbd2_journal_get_write_access(handle, iloc.bh);
ac27a0ec 5535 if (!err)
0390131b 5536 err = ext4_handle_dirty_metadata(handle,
73b50c1c 5537 NULL,
0390131b 5538 iloc.bh);
ac27a0ec
DK
5539 brelse(iloc.bh);
5540 }
5541 }
617ba13b 5542 ext4_std_error(inode->i_sb, err);
ac27a0ec
DK
5543 return err;
5544}
5545#endif
5546
617ba13b 5547int ext4_change_inode_journal_flag(struct inode *inode, int val)
ac27a0ec
DK
5548{
5549 journal_t *journal;
5550 handle_t *handle;
5551 int err;
c8585c6f 5552 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ac27a0ec
DK
5553
5554 /*
5555 * We have to be very careful here: changing a data block's
5556 * journaling status dynamically is dangerous. If we write a
5557 * data block to the journal, change the status and then delete
5558 * that block, we risk forgetting to revoke the old log record
5559 * from the journal and so a subsequent replay can corrupt data.
5560 * So, first we make sure that the journal is empty and that
5561 * nobody is changing anything.
5562 */
5563
617ba13b 5564 journal = EXT4_JOURNAL(inode);
0390131b
FM
5565 if (!journal)
5566 return 0;
d699594d 5567 if (is_journal_aborted(journal))
ac27a0ec
DK
5568 return -EROFS;
5569
17335dcc
DM
5570 /* Wait for all existing dio workers */
5571 ext4_inode_block_unlocked_dio(inode);
5572 inode_dio_wait(inode);
5573
4c546592
DJ
5574 /*
5575 * Before flushing the journal and switching inode's aops, we have
5576 * to flush all dirty data the inode has. There can be outstanding
5577 * delayed allocations, there can be unwritten extents created by
5578 * fallocate or buffered writes in dioread_nolock mode covered by
5579 * dirty data which can be converted only after flushing the dirty
5580 * data (and journalled aops don't know how to handle these cases).
5581 */
5582 if (val) {
5583 down_write(&EXT4_I(inode)->i_mmap_sem);
5584 err = filemap_write_and_wait(inode->i_mapping);
5585 if (err < 0) {
5586 up_write(&EXT4_I(inode)->i_mmap_sem);
5587 ext4_inode_resume_unlocked_dio(inode);
5588 return err;
5589 }
5590 }
5591
c8585c6f 5592 percpu_down_write(&sbi->s_journal_flag_rwsem);
dab291af 5593 jbd2_journal_lock_updates(journal);
ac27a0ec
DK
5594
5595 /*
5596 * OK, there are no updates running now, and all cached data is
5597 * synced to disk. We are now in a completely consistent state
5598 * which doesn't have anything in the journal, and we know that
5599 * no filesystem updates are running, so it is safe to modify
5600 * the inode's in-core data-journaling state flag now.
5601 */
5602
5603 if (val)
12e9b892 5604 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaa 5605 else {
4f879ca6
JK
5606 err = jbd2_journal_flush(journal);
5607 if (err < 0) {
5608 jbd2_journal_unlock_updates(journal);
c8585c6f 5609 percpu_up_write(&sbi->s_journal_flag_rwsem);
4f879ca6
JK
5610 ext4_inode_resume_unlocked_dio(inode);
5611 return err;
5612 }
12e9b892 5613 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaa 5614 }
617ba13b 5615 ext4_set_aops(inode);
ac27a0ec 5616
dab291af 5617 jbd2_journal_unlock_updates(journal);
c8585c6f
DJ
5618 percpu_up_write(&sbi->s_journal_flag_rwsem);
5619
4c546592
DJ
5620 if (val)
5621 up_write(&EXT4_I(inode)->i_mmap_sem);
17335dcc 5622 ext4_inode_resume_unlocked_dio(inode);
ac27a0ec
DK
5623
5624 /* Finally we can mark the inode as dirty. */
5625
9924a92a 5626 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
ac27a0ec
DK
5627 if (IS_ERR(handle))
5628 return PTR_ERR(handle);
5629
617ba13b 5630 err = ext4_mark_inode_dirty(handle, inode);
0390131b 5631 ext4_handle_sync(handle);
617ba13b
MC
5632 ext4_journal_stop(handle);
5633 ext4_std_error(inode->i_sb, err);
ac27a0ec
DK
5634
5635 return err;
5636}
2e9ee850
AK
5637
5638static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5639{
5640 return !buffer_mapped(bh);
5641}
5642
c2ec175c 5643int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2e9ee850 5644{
c2ec175c 5645 struct page *page = vmf->page;
2e9ee850
AK
5646 loff_t size;
5647 unsigned long len;
9ea7df53 5648 int ret;
2e9ee850 5649 struct file *file = vma->vm_file;
496ad9aa 5650 struct inode *inode = file_inode(file);
2e9ee850 5651 struct address_space *mapping = inode->i_mapping;
9ea7df53
JK
5652 handle_t *handle;
5653 get_block_t *get_block;
5654 int retries = 0;
2e9ee850 5655
8e8ad8a5 5656 sb_start_pagefault(inode->i_sb);
041bbb6d 5657 file_update_time(vma->vm_file);
ea3d7209
JK
5658
5659 down_read(&EXT4_I(inode)->i_mmap_sem);
9ea7df53
JK
5660 /* Delalloc case is easy... */
5661 if (test_opt(inode->i_sb, DELALLOC) &&
5662 !ext4_should_journal_data(inode) &&
5663 !ext4_nonda_switch(inode->i_sb)) {
5664 do {
5c500029 5665 ret = block_page_mkwrite(vma, vmf,
9ea7df53
JK
5666 ext4_da_get_block_prep);
5667 } while (ret == -ENOSPC &&
5668 ext4_should_retry_alloc(inode->i_sb, &retries));
5669 goto out_ret;
2e9ee850 5670 }
0e499890
DW
5671
5672 lock_page(page);
9ea7df53
JK
5673 size = i_size_read(inode);
5674 /* Page got truncated from under us? */
5675 if (page->mapping != mapping || page_offset(page) > size) {
5676 unlock_page(page);
5677 ret = VM_FAULT_NOPAGE;
5678 goto out;
0e499890 5679 }
2e9ee850 5680
09cbfeaf
KS
5681 if (page->index == size >> PAGE_SHIFT)
5682 len = size & ~PAGE_MASK;
2e9ee850 5683 else
09cbfeaf 5684 len = PAGE_SIZE;
a827eaff 5685 /*
9ea7df53
JK
5686 * Return if we have all the buffers mapped. This avoids the need to do
5687 * journal_start/journal_stop which can block and take a long time
a827eaff 5688 */
2e9ee850 5689 if (page_has_buffers(page)) {
f19d5870
TM
5690 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
5691 0, len, NULL,
5692 ext4_bh_unmapped)) {
9ea7df53 5693 /* Wait so that we don't change page under IO */
1d1d1a76 5694 wait_for_stable_page(page);
9ea7df53
JK
5695 ret = VM_FAULT_LOCKED;
5696 goto out;
a827eaff 5697 }
2e9ee850 5698 }
a827eaff 5699 unlock_page(page);
9ea7df53
JK
5700 /* OK, we need to fill the hole... */
5701 if (ext4_should_dioread_nolock(inode))
705965bd 5702 get_block = ext4_get_block_unwritten;
9ea7df53
JK
5703 else
5704 get_block = ext4_get_block;
5705retry_alloc:
9924a92a
TT
5706 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
5707 ext4_writepage_trans_blocks(inode));
9ea7df53 5708 if (IS_ERR(handle)) {
c2ec175c 5709 ret = VM_FAULT_SIGBUS;
9ea7df53
JK
5710 goto out;
5711 }
5c500029 5712 ret = block_page_mkwrite(vma, vmf, get_block);
9ea7df53 5713 if (!ret && ext4_should_journal_data(inode)) {
f19d5870 5714 if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
09cbfeaf 5715 PAGE_SIZE, NULL, do_journal_get_write_access)) {
9ea7df53
JK
5716 unlock_page(page);
5717 ret = VM_FAULT_SIGBUS;
fcbb5515 5718 ext4_journal_stop(handle);
9ea7df53
JK
5719 goto out;
5720 }
5721 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
5722 }
5723 ext4_journal_stop(handle);
5724 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
5725 goto retry_alloc;
5726out_ret:
5727 ret = block_page_mkwrite_return(ret);
5728out:
ea3d7209 5729 up_read(&EXT4_I(inode)->i_mmap_sem);
8e8ad8a5 5730 sb_end_pagefault(inode->i_sb);
2e9ee850
AK
5731 return ret;
5732}
ea3d7209
JK
5733
5734int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5735{
5736 struct inode *inode = file_inode(vma->vm_file);
5737 int err;
5738
5739 down_read(&EXT4_I(inode)->i_mmap_sem);
5740 err = filemap_fault(vma, vmf);
5741 up_read(&EXT4_I(inode)->i_mmap_sem);
5742
5743 return err;
5744}
2d90c160
JK
5745
5746/*
5747 * Find the first extent at or after @lblk in an inode that is not a hole.
5748 * Search for @map_len blocks at most. The extent is returned in @result.
5749 *
5750 * The function returns 1 if we found an extent. The function returns 0 in
5751 * case there is no extent at or after @lblk and in that case also sets
5752 * @result->es_len to 0. In case of error, the error code is returned.
5753 */
5754int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
5755 unsigned int map_len, struct extent_status *result)
5756{
5757 struct ext4_map_blocks map;
5758 struct extent_status es = {};
5759 int ret;
5760
5761 map.m_lblk = lblk;
5762 map.m_len = map_len;
5763
5764 /*
5765 * For non-extent based files this loop may iterate several times since
5766 * we do not determine full hole size.
5767 */
5768 while (map.m_len > 0) {
5769 ret = ext4_map_blocks(NULL, inode, &map, 0);
5770 if (ret < 0)
5771 return ret;
5772 /* There's extent covering m_lblk? Just return it. */
5773 if (ret > 0) {
5774 int status;
5775
5776 ext4_es_store_pblock(result, map.m_pblk);
5777 result->es_lblk = map.m_lblk;
5778 result->es_len = map.m_len;
5779 if (map.m_flags & EXT4_MAP_UNWRITTEN)
5780 status = EXTENT_STATUS_UNWRITTEN;
5781 else
5782 status = EXTENT_STATUS_WRITTEN;
5783 ext4_es_store_status(result, status);
5784 return 1;
5785 }
5786 ext4_es_find_delayed_extent_range(inode, map.m_lblk,
5787 map.m_lblk + map.m_len - 1,
5788 &es);
5789 /* Is delalloc data before next block in extent tree? */
5790 if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
5791 ext4_lblk_t offset = 0;
5792
5793 if (es.es_lblk < lblk)
5794 offset = lblk - es.es_lblk;
5795 result->es_lblk = es.es_lblk + offset;
5796 ext4_es_store_pblock(result,
5797 ext4_es_pblock(&es) + offset);
5798 result->es_len = es.es_len - offset;
5799 ext4_es_store_status(result, ext4_es_status(&es));
5800
5801 return 1;
5802 }
5803 /* There's a hole at m_lblk, advance us after it */
5804 map.m_lblk += map.m_len;
5805 map_len -= map.m_len;
5806 map.m_len = map_len;
5807 cond_resched();
5808 }
5809 result->es_len = 0;
5810 return 0;
5811}