]>
git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blob - fs/iomap/direct-io.c
1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2010 Red Hat, Inc.
4 * Copyright (c) 2016-2018 Christoph Hellwig.
6 #include <linux/module.h>
7 #include <linux/compiler.h>
9 #include <linux/iomap.h>
10 #include <linux/backing-dev.h>
11 #include <linux/uio.h>
12 #include <linux/task_io_accounting_ops.h>
15 #include "../internal.h"
18 * Private flags for iomap_dio, must not overlap with the public ones in
19 * <linux/iomap.h>:
21 #define IOMAP_DIO_WRITE_FUA (1 << 28)
22 #define IOMAP_DIO_NEED_SYNC (1 << 29)
23 #define IOMAP_DIO_WRITE (1 << 30)
24 #define IOMAP_DIO_DIRTY (1 << 31)
28 const struct iomap_dio_ops
*dops
;
34 bool wait_for_completion
;
37 /* used during submission and for synchronous completion: */
39 struct iov_iter
*iter
;
40 struct task_struct
*waiter
;
41 struct request_queue
*last_queue
;
45 /* used for aio completion: */
47 struct work_struct work
;
52 int iomap_dio_iopoll(struct kiocb
*kiocb
, bool spin
)
54 struct request_queue
*q
= READ_ONCE(kiocb
->private);
58 return blk_poll(q
, READ_ONCE(kiocb
->ki_cookie
), spin
);
60 EXPORT_SYMBOL_GPL(iomap_dio_iopoll
);
62 static void iomap_dio_submit_bio(struct iomap_dio
*dio
, struct iomap
*iomap
,
63 struct bio
*bio
, loff_t pos
)
65 atomic_inc(&dio
->ref
);
67 if (dio
->iocb
->ki_flags
& IOCB_HIPRI
)
68 bio_set_polled(bio
, dio
->iocb
);
70 dio
->submit
.last_queue
= bdev_get_queue(iomap
->bdev
);
71 if (dio
->dops
&& dio
->dops
->submit_io
)
72 dio
->submit
.cookie
= dio
->dops
->submit_io(
73 file_inode(dio
->iocb
->ki_filp
),
76 dio
->submit
.cookie
= submit_bio(bio
);
79 static ssize_t
iomap_dio_complete(struct iomap_dio
*dio
)
81 const struct iomap_dio_ops
*dops
= dio
->dops
;
82 struct kiocb
*iocb
= dio
->iocb
;
83 struct inode
*inode
= file_inode(iocb
->ki_filp
);
84 loff_t offset
= iocb
->ki_pos
;
85 ssize_t ret
= dio
->error
;
87 if (dops
&& dops
->end_io
)
88 ret
= dops
->end_io(iocb
, dio
->size
, ret
, dio
->flags
);
92 /* check for short read */
93 if (offset
+ ret
> dio
->i_size
&&
94 !(dio
->flags
& IOMAP_DIO_WRITE
))
95 ret
= dio
->i_size
- offset
;
100 * Try again to invalidate clean pages which might have been cached by
101 * non-direct readahead, or faulted in by get_user_pages() if the source
102 * of the write was an mmap'ed region of the file we're writing. Either
103 * one is a pretty crazy thing to do, so we don't support it 100%. If
104 * this invalidation fails, tough, the write still worked...
106 * And this page cache invalidation has to be after ->end_io(), as some
107 * filesystems convert unwritten extents to real allocations in
108 * ->end_io() when necessary, otherwise a racing buffer read would cache
109 * zeros from unwritten extents.
112 (dio
->flags
& IOMAP_DIO_WRITE
) && inode
->i_mapping
->nrpages
) {
114 err
= invalidate_inode_pages2_range(inode
->i_mapping
,
115 offset
>> PAGE_SHIFT
,
116 (offset
+ dio
->size
- 1) >> PAGE_SHIFT
);
118 dio_warn_stale_pagecache(iocb
->ki_filp
);
122 * If this is a DSYNC write, make sure we push it to stable storage now
123 * that we've written data.
125 if (ret
> 0 && (dio
->flags
& IOMAP_DIO_NEED_SYNC
))
126 ret
= generic_write_sync(iocb
, ret
);
128 inode_dio_end(file_inode(iocb
->ki_filp
));
134 static void iomap_dio_complete_work(struct work_struct
*work
)
136 struct iomap_dio
*dio
= container_of(work
, struct iomap_dio
, aio
.work
);
137 struct kiocb
*iocb
= dio
->iocb
;
139 iocb
->ki_complete(iocb
, iomap_dio_complete(dio
), 0);
143 * Set an error in the dio if none is set yet. We have to use cmpxchg
144 * as the submission context and the completion context(s) can race to
145 * update the error.
147 static inline void iomap_dio_set_error(struct iomap_dio
*dio
, int ret
)
149 cmpxchg(&dio
->error
, 0, ret
);
152 static void iomap_dio_bio_end_io(struct bio
*bio
)
154 struct iomap_dio
*dio
= bio
->bi_private
;
155 bool should_dirty
= (dio
->flags
& IOMAP_DIO_DIRTY
);
158 iomap_dio_set_error(dio
, blk_status_to_errno(bio
->bi_status
));
160 if (atomic_dec_and_test(&dio
->ref
)) {
161 if (dio
->wait_for_completion
) {
162 struct task_struct
*waiter
= dio
->submit
.waiter
;
163 WRITE_ONCE(dio
->submit
.waiter
, NULL
);
164 blk_wake_io_task(waiter
);
165 } else if (dio
->flags
& IOMAP_DIO_WRITE
) {
166 struct inode
*inode
= file_inode(dio
->iocb
->ki_filp
);
168 INIT_WORK(&dio
->aio
.work
, iomap_dio_complete_work
);
169 queue_work(inode
->i_sb
->s_dio_done_wq
, &dio
->aio
.work
);
171 iomap_dio_complete_work(&dio
->aio
.work
);
176 bio_check_pages_dirty(bio
);
178 bio_release_pages(bio
, false);
184 iomap_dio_zero(struct iomap_dio
*dio
, struct iomap
*iomap
, loff_t pos
,
187 struct page
*page
= ZERO_PAGE(0);
188 int flags
= REQ_SYNC
| REQ_IDLE
;
191 bio
= bio_alloc(GFP_KERNEL
, 1);
192 bio_set_dev(bio
, iomap
->bdev
);
193 bio
->bi_iter
.bi_sector
= iomap_sector(iomap
, pos
);
194 bio
->bi_private
= dio
;
195 bio
->bi_end_io
= iomap_dio_bio_end_io
;
198 __bio_add_page(bio
, page
, len
, 0);
199 bio_set_op_attrs(bio
, REQ_OP_WRITE
, flags
);
200 iomap_dio_submit_bio(dio
, iomap
, bio
, pos
);
204 iomap_dio_bio_actor(struct inode
*inode
, loff_t pos
, loff_t length
,
205 struct iomap_dio
*dio
, struct iomap
*iomap
)
207 unsigned int blkbits
= blksize_bits(bdev_logical_block_size(iomap
->bdev
));
208 unsigned int fs_block_size
= i_blocksize(inode
), pad
;
209 unsigned int align
= iov_iter_alignment(dio
->submit
.iter
);
211 bool need_zeroout
= false;
212 bool use_fua
= false;
213 int nr_pages
, ret
= 0;
217 if ((pos
| length
| align
) & ((1 << blkbits
) - 1))
220 if (iomap
->type
== IOMAP_UNWRITTEN
) {
221 dio
->flags
|= IOMAP_DIO_UNWRITTEN
;
225 if (iomap
->flags
& IOMAP_F_SHARED
)
226 dio
->flags
|= IOMAP_DIO_COW
;
228 if (iomap
->flags
& IOMAP_F_NEW
) {
230 } else if (iomap
->type
== IOMAP_MAPPED
) {
232 * Use a FUA write if we need datasync semantics, this is a pure
233 * data IO that doesn't require any metadata updates (including
234 * after IO completion such as unwritten extent conversion) and
235 * the underlying device supports FUA. This allows us to avoid
236 * cache flushes on IO completion.
238 if (!(iomap
->flags
& (IOMAP_F_SHARED
|IOMAP_F_DIRTY
)) &&
239 (dio
->flags
& IOMAP_DIO_WRITE_FUA
) &&
240 blk_queue_fua(bdev_get_queue(iomap
->bdev
)))
245 * Save the original count and trim the iter to just the extent we
246 * are operating on right now. The iter will be re-expanded once
247 * we are done.
249 orig_count
= iov_iter_count(dio
->submit
.iter
);
250 iov_iter_truncate(dio
->submit
.iter
, length
);
252 nr_pages
= iov_iter_npages(dio
->submit
.iter
, BIO_MAX_PAGES
);
259 /* zero out from the start of the block to the write offset */
260 pad
= pos
& (fs_block_size
- 1);
262 iomap_dio_zero(dio
, iomap
, pos
- pad
, pad
);
268 iov_iter_revert(dio
->submit
.iter
, copied
);
273 bio
= bio_alloc(GFP_KERNEL
, nr_pages
);
274 bio_set_dev(bio
, iomap
->bdev
);
275 bio
->bi_iter
.bi_sector
= iomap_sector(iomap
, pos
);
276 bio
->bi_write_hint
= dio
->iocb
->ki_hint
;
277 bio
->bi_ioprio
= dio
->iocb
->ki_ioprio
;
278 bio
->bi_private
= dio
;
279 bio
->bi_end_io
= iomap_dio_bio_end_io
;
281 ret
= bio_iov_iter_get_pages(bio
, dio
->submit
.iter
);
284 * We have to stop part way through an IO. We must fall
285 * through to the sub-block tail zeroing here, otherwise
286 * this short IO may expose stale data in the tail of
287 * the block we haven't written data to.
293 n
= bio
->bi_iter
.bi_size
;
294 if (dio
->flags
& IOMAP_DIO_WRITE
) {
295 bio
->bi_opf
= REQ_OP_WRITE
| REQ_SYNC
| REQ_IDLE
;
297 bio
->bi_opf
|= REQ_FUA
;
299 dio
->flags
&= ~IOMAP_DIO_WRITE_FUA
;
300 task_io_account_write(n
);
302 bio
->bi_opf
= REQ_OP_READ
;
303 if (dio
->flags
& IOMAP_DIO_DIRTY
)
304 bio_set_pages_dirty(bio
);
310 nr_pages
= iov_iter_npages(dio
->submit
.iter
, BIO_MAX_PAGES
);
311 iomap_dio_submit_bio(dio
, iomap
, bio
, pos
);
316 * We need to zeroout the tail of a sub-block write if the extent type
317 * requires zeroing or the write extends beyond EOF. If we don't zero
318 * the block tail in the latter case, we can expose stale data via mmap
319 * reads of the EOF block.
323 ((dio
->flags
& IOMAP_DIO_WRITE
) && pos
>= i_size_read(inode
))) {
324 /* zero out from the end of the write to the end of the block */
325 pad
= pos
& (fs_block_size
- 1);
327 iomap_dio_zero(dio
, iomap
, pos
, fs_block_size
- pad
);
330 /* Undo iter limitation to current extent */
331 iov_iter_reexpand(dio
->submit
.iter
, orig_count
- copied
);
338 iomap_dio_hole_actor(loff_t length
, struct iomap_dio
*dio
)
340 length
= iov_iter_zero(length
, dio
->submit
.iter
);
346 iomap_dio_inline_actor(struct inode
*inode
, loff_t pos
, loff_t length
,
347 struct iomap_dio
*dio
, struct iomap
*iomap
)
349 struct iov_iter
*iter
= dio
->submit
.iter
;
352 BUG_ON(pos
+ length
> PAGE_SIZE
- offset_in_page(iomap
->inline_data
));
354 if (dio
->flags
& IOMAP_DIO_WRITE
) {
355 loff_t size
= inode
->i_size
;
358 memset(iomap
->inline_data
+ size
, 0, pos
- size
);
359 copied
= copy_from_iter(iomap
->inline_data
+ pos
, length
, iter
);
361 if (pos
+ copied
> size
)
362 i_size_write(inode
, pos
+ copied
);
363 mark_inode_dirty(inode
);
366 copied
= copy_to_iter(iomap
->inline_data
+ pos
, length
, iter
);
373 iomap_dio_actor(struct inode
*inode
, loff_t pos
, loff_t length
,
374 void *data
, struct iomap
*iomap
, struct iomap
*srcmap
)
376 struct iomap_dio
*dio
= data
;
378 switch (iomap
->type
) {
380 if (WARN_ON_ONCE(dio
->flags
& IOMAP_DIO_WRITE
))
382 return iomap_dio_hole_actor(length
, dio
);
383 case IOMAP_UNWRITTEN
:
384 if (!(dio
->flags
& IOMAP_DIO_WRITE
))
385 return iomap_dio_hole_actor(length
, dio
);
386 return iomap_dio_bio_actor(inode
, pos
, length
, dio
, iomap
);
388 return iomap_dio_bio_actor(inode
, pos
, length
, dio
, iomap
);
390 return iomap_dio_inline_actor(inode
, pos
, length
, dio
, iomap
);
398 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
399 * is being issued as AIO or not. This allows us to optimise pure data writes
400 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
401 * REQ_FLUSH post write. This is slightly tricky because a single request here
402 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
403 * may be pure data writes. In that case, we still need to do a full data sync
404 * completion.
406 * Returns -ENOTBLK in case of a page invalidation failure for
407 * writes. The caller needs to fall back to buffered I/O in this case.
410 iomap_dio_rw(struct kiocb
*iocb
, struct iov_iter
*iter
,
411 const struct iomap_ops
*ops
, const struct iomap_dio_ops
*dops
,
412 bool wait_for_completion
)
414 struct address_space
*mapping
= iocb
->ki_filp
->f_mapping
;
415 struct inode
*inode
= file_inode(iocb
->ki_filp
);
416 size_t count
= iov_iter_count(iter
);
417 loff_t pos
= iocb
->ki_pos
;
418 loff_t end
= iocb
->ki_pos
+ count
- 1, ret
= 0;
419 unsigned int flags
= IOMAP_DIRECT
;
420 struct blk_plug plug
;
421 struct iomap_dio
*dio
;
426 if (WARN_ON(is_sync_kiocb(iocb
) && !wait_for_completion
))
429 dio
= kmalloc(sizeof(*dio
), GFP_KERNEL
);
434 atomic_set(&dio
->ref
, 1);
436 dio
->i_size
= i_size_read(inode
);
441 dio
->submit
.iter
= iter
;
442 dio
->submit
.waiter
= current
;
443 dio
->submit
.cookie
= BLK_QC_T_NONE
;
444 dio
->submit
.last_queue
= NULL
;
446 if (iov_iter_rw(iter
) == READ
) {
447 if (pos
>= dio
->i_size
)
450 if (iter_is_iovec(iter
))
451 dio
->flags
|= IOMAP_DIO_DIRTY
;
453 flags
|= IOMAP_WRITE
;
454 dio
->flags
|= IOMAP_DIO_WRITE
;
456 /* for data sync or sync, we need sync completion processing */
457 if (iocb
->ki_flags
& IOCB_DSYNC
)
458 dio
->flags
|= IOMAP_DIO_NEED_SYNC
;
461 * For datasync only writes, we optimistically try using FUA for
462 * this IO. Any non-FUA write that occurs will clear this flag,
463 * hence we know before completion whether a cache flush is
464 * necessary.
466 if ((iocb
->ki_flags
& (IOCB_DSYNC
| IOCB_SYNC
)) == IOCB_DSYNC
)
467 dio
->flags
|= IOMAP_DIO_WRITE_FUA
;
470 if (iocb
->ki_flags
& IOCB_NOWAIT
) {
471 if (filemap_range_has_page(mapping
, pos
, end
)) {
475 flags
|= IOMAP_NOWAIT
;
478 ret
= filemap_write_and_wait_range(mapping
, pos
, end
);
482 if (iov_iter_rw(iter
) == WRITE
) {
484 * Try to invalidate cache pages for the range we are writing.
485 * If this invalidation fails, let the caller fall back to
486 * buffered I/O.
488 if (invalidate_inode_pages2_range(mapping
, pos
>> PAGE_SHIFT
,
489 end
>> PAGE_SHIFT
)) {
490 trace_iomap_dio_invalidate_fail(inode
, pos
, count
);
495 if (!wait_for_completion
&& !inode
->i_sb
->s_dio_done_wq
) {
496 ret
= sb_init_dio_done_wq(inode
->i_sb
);
502 inode_dio_begin(inode
);
504 blk_start_plug(&plug
);
506 ret
= iomap_apply(inode
, pos
, count
, flags
, ops
, dio
,
509 /* magic error code to fall back to buffered I/O */
510 if (ret
== -ENOTBLK
) {
511 wait_for_completion
= true;
518 if (iov_iter_rw(iter
) == READ
&& pos
>= dio
->i_size
) {
520 * We only report that we've read data up to i_size.
521 * Revert iter to a state corresponding to that as
522 * some callers (such as splice code) rely on it.
524 iov_iter_revert(iter
, pos
- dio
->i_size
);
527 } while ((count
= iov_iter_count(iter
)) > 0);
528 blk_finish_plug(&plug
);
531 iomap_dio_set_error(dio
, ret
);
534 * If all the writes we issued were FUA, we don't need to flush the
535 * cache on IO completion. Clear the sync flag for this case.
537 if (dio
->flags
& IOMAP_DIO_WRITE_FUA
)
538 dio
->flags
&= ~IOMAP_DIO_NEED_SYNC
;
540 WRITE_ONCE(iocb
->ki_cookie
, dio
->submit
.cookie
);
541 WRITE_ONCE(iocb
->private, dio
->submit
.last_queue
);
544 * We are about to drop our additional submission reference, which
545 * might be the last reference to the dio. There are three different
546 * ways we can progress here:
548 * (a) If this is the last reference we will always complete and free
549 * the dio ourselves.
550 * (b) If this is not the last reference, and we serve an asynchronous
551 * iocb, we must never touch the dio after the decrement, the
552 * I/O completion handler will complete and free it.
553 * (c) If this is not the last reference, but we serve a synchronous
554 * iocb, the I/O completion handler will wake us up on the drop
555 * of the final reference, and we will complete and free it here
556 * after we got woken by the I/O completion handler.
558 dio
->wait_for_completion
= wait_for_completion
;
559 if (!atomic_dec_and_test(&dio
->ref
)) {
560 if (!wait_for_completion
)
564 set_current_state(TASK_UNINTERRUPTIBLE
);
565 if (!READ_ONCE(dio
->submit
.waiter
))
568 if (!(iocb
->ki_flags
& IOCB_HIPRI
) ||
569 !dio
->submit
.last_queue
||
570 !blk_poll(dio
->submit
.last_queue
,
571 dio
->submit
.cookie
, true))
574 __set_current_state(TASK_RUNNING
);
577 return iomap_dio_complete(dio
);
583 EXPORT_SYMBOL_GPL(iomap_dio_rw
);