// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqring (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
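/*
 * As a hedged, illustrative sketch only (not part of this file), a userspace
 * CQ reaper that follows the ordering rules above could look roughly like:
 *
 *	unsigned head = *cq_head;
 *	// acquire pairs with the kernel's store-release of cq.tail
 *	while (head != smp_load_acquire(cq_tail)) {
 *		struct io_uring_cqe *cqe = &cqes[head & cq_ring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	// release orders the CQE loads before the new head is published
 *	smp_store_release(cq_head, head);
 *
 * cq_head, cq_tail, cqes, cq_ring_mask and handle_cqe() are hypothetical
 * names standing in for the mmap'ed ring fields; see liburing for the real
 * helpers.
 */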
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
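/*
 * Hedged sketch of how the two-level fixed file table implied by the
 * constants above is typically indexed (an illustration, not a quote of
 * the lookup helper itself):
 *
 *	struct fixed_file_table *table;
 *
 *	table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
 *	file  = table->files[i & IORING_FILE_TABLE_MASK];
 *
 * i.e. the upper bits select one of up to 64 tables of 512 entries each,
 * giving IORING_MAX_FIXED_FILES slots in total.
 */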
struct io_uring {
	u32			head ____cacheline_aligned_in_smp;
	u32			tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
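/*
 * Hedged example of the submission-side protocol implied by the layout
 * above, as seen from userspace (illustrative names only; see liburing
 * for the real code):
 *
 *	unsigned tail = *sq_tail;
 *	struct io_uring_sqe *sqe = &sqes[tail & sq_ring_mask];
 *
 *	fill_sqe(sqe);
 *	sq_array[tail & sq_ring_mask] = tail & sq_ring_mask;
 *	// release makes the SQE and array stores visible before the tail
 *	smp_store_release(sq_tail, tail + 1);
 *
 * The kernel then picks the entry up via the sq_array indirection once it
 * load-acquires the new tail.
 */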
struct io_mapped_ubuf {
	u64			ubuf;
	size_t			len;
	struct bio_vec		*bvec;
	unsigned int		nr_bvecs;
};

struct fixed_file_table {
	struct file		**files;
};

struct fixed_file_data {
	struct fixed_file_table		*table;
	struct io_ring_ctx		*ctx;

	struct percpu_ref		refs;
	struct llist_head		put_llist;
	struct work_struct		ref_work;
	struct completion		done;
};

struct io_ring_ctx {
	struct {
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;
	struct {
		bool			cq_overflow_flushed;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		atomic_t		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		wait_queue_head_t	inflight_wait;
		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;
	struct io_rings	*rings;

	struct task_struct	*sqo_thread;	/* if using sq thread polling */
	struct mm_struct	*sqo_mm;
	wait_queue_head_t	sqo_wait;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_file_data	*file_data;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	const struct cred	*creds;

	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
	struct completion	*completions;

	/* if all else fails... */
	struct io_kiocb		*fallback_req;

#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif
	struct {
		unsigned		cached_cq_tail;
		atomic_t		cq_timeouts;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;
	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;
	struct {
		spinlock_t		completion_lock;
		struct llist_head	poll_llist;

		/*
		 * ->poll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	poll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;

		spinlock_t		inflight_lock;
		struct list_head	inflight_list;
	} ____cacheline_aligned_in_smp;
};
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct wait_queue_head		*head;
	struct wait_queue_entry		wait;
};

struct io_close {
	struct file			*put_file;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};

struct io_accept {
	struct sockaddr __user		*addr;
	int __user			*addr_len;
};
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};

struct io_connect {
	struct sockaddr __user		*addr;
};

struct io_sr_msg {
	struct user_msghdr __user	*msg;
};

struct io_open {
	const char __user		*fname;
	struct filename			*filename;
	struct statx __user		*buffer;
};

struct io_files_update {
struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	struct iovec			*iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
};

struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	struct iovec			*iov;
	ssize_t				nr_segs;
	ssize_t				size;
};

struct io_async_open {
	struct filename			*filename;
};

struct io_async_ctx {
	union {
		struct io_async_rw	rw;
		struct io_async_msghdr	msg;
		struct io_async_connect	connect;
		struct io_timeout_data	timeout;
		struct io_async_open	open;
	};
};
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_files_update	files_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
	};

	struct io_async_ctx		*io;
	/*
	 * ring_file is only used in the submission path, and
	 * llist_node is only used for poll deferred completions
	 */
	struct file			*ring_file;
	struct llist_node		llist_node;
	bool				needs_fixed_file;

	struct io_ring_ctx		*ctx;
	struct list_head		list;
	struct hlist_node		hash_node;
	struct list_head		link_list;
	unsigned int			flags;
	refcount_t			refs;
#define REQ_F_NOWAIT		1	/* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
#define REQ_F_FIXED_FILE	4	/* ctx owns file */
#define REQ_F_LINK_NEXT		8	/* already grabbed next link */
#define REQ_F_IO_DRAIN		16	/* drain existing IO first */
#define REQ_F_IO_DRAINED	32	/* drain done */
#define REQ_F_LINK		64	/* linked sqes */
#define REQ_F_LINK_TIMEOUT	128	/* has linked timeout */
#define REQ_F_FAIL_LINK		256	/* fail rest of links */
#define REQ_F_DRAIN_LINK	512	/* link should be fully drained */
#define REQ_F_TIMEOUT		1024	/* timeout request */
#define REQ_F_ISREG		2048	/* regular file */
#define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
#define REQ_F_INFLIGHT		16384	/* on inflight list */
#define REQ_F_COMP_LOCKED	32768	/* completion under lock */
#define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
#define REQ_F_FORCE_ASYNC	131072	/* IOSQE_ASYNC */
#define REQ_F_CUR_POS		262144	/* read/write uses file position */
	u64				user_data;
	u32				result;
	u32				sequence;

	struct list_head		inflight_entry;

	struct io_wq_work		work;
};
#define IO_PLUG_THRESHOLD		2
#define IO_IOPOLL_BATCH			8

struct io_submit_state {
	struct blk_plug		plug;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_IOPOLL_BATCH];
	unsigned int		free_reqs;
	unsigned int		cur_req;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		has_refs;
	unsigned int		used_refs;
	unsigned int		ios_left;
};

struct io_op_def {
	/* needs req->io allocated for deferral/async */
	unsigned		async_ctx : 1;
	/* needs current->mm setup, does mm access */
	unsigned		needs_mm : 1;
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* needs req->file assigned IFF fd is >= 0 */
	unsigned		fd_non_neg : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
};
static const struct io_op_def io_op_defs[] = {
	{
		/* IORING_OP_READV */
		.unbound_nonreg_file	= 1,
	},
	{
		/* IORING_OP_WRITEV */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_FSYNC */ },
	{
		/* IORING_OP_READ_FIXED */
		.unbound_nonreg_file	= 1,
	},
	{
		/* IORING_OP_WRITE_FIXED */
		.unbound_nonreg_file	= 1,
	},
	{
		/* IORING_OP_POLL_ADD */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_POLL_REMOVE */ },
	{	/* IORING_OP_SYNC_FILE_RANGE */ },
	{
		/* IORING_OP_SENDMSG */
		.unbound_nonreg_file	= 1,
	},
	{
		/* IORING_OP_RECVMSG */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_TIMEOUT */ },
	{	/* IORING_OP_TIMEOUT_REMOVE */ },
	{
		/* IORING_OP_ACCEPT */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_ASYNC_CANCEL */ },
	{	/* IORING_OP_LINK_TIMEOUT */ },
	{
		/* IORING_OP_CONNECT */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_FALLOCATE */ },
	{	/* IORING_OP_OPENAT */ },
	{	/* IORING_OP_CLOSE */ },
	{	/* IORING_OP_FILES_UPDATE */ },
	{	/* IORING_OP_STATX */ },
	{
		/* IORING_OP_READ */
		.unbound_nonreg_file	= 1,
	},
	{
		/* IORING_OP_WRITE */
		.unbound_nonreg_file	= 1,
	},
	{	/* IORING_OP_FADVISE */ },
	{	/* IORING_OP_MADVISE */ },
};
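/*
 * Hedged illustration of how the table above is consulted when a request
 * is prepared for async offload (it mirrors io_prep_async_work() below;
 * a sketch, not a verbatim quote):
 *
 *	const struct io_op_def *def = &io_op_defs[req->opcode];
 *
 *	if (req->flags & REQ_F_ISREG) {
 *		if (def->hash_reg_file)
 *			do_hashed = true;
 *	} else if (def->unbound_nonreg_file) {
 *		req->work.flags |= IO_WQ_WORK_UNBOUND;
 *	}
 */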
static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->completions[0]);
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
	if (!ctx->fallback_req)
		goto err;

	ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
	if (!ctx->completions)
		goto err;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);
	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->cq_wait);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	init_completion(&ctx->completions[0]);
	init_completion(&ctx->completions[1]);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->wait);
	spin_lock_init(&ctx->completion_lock);
	init_llist_head(&ctx->poll_llist);
	INIT_LIST_HEAD(&ctx->poll_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	init_waitqueue_head(&ctx->inflight_wait);
	spin_lock_init(&ctx->inflight_lock);
	INIT_LIST_HEAD(&ctx->inflight_list);
	return ctx;
err:
	if (ctx->fallback_req)
		kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx->completions);
	kfree(ctx->cancel_hash);
	kfree(ctx);
	return NULL;
}
static inline bool __req_need_defer(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
					+ atomic_read(&ctx->cached_cq_overflow);
}
static inline bool req_need_defer(struct io_kiocb *req)
{
	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
		return __req_need_defer(req);

	return false;
}
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
	if (req && !req_need_defer(req)) {
		list_del_init(&req->list);
		return req;
	}

	return NULL;
}
static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
	if (req) {
		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
			return NULL;
		if (!__req_need_defer(req)) {
			list_del_init(&req->list);
			return req;
		}
	}

	return NULL;
}
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
		/* order cqe stores with ring update */
		smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);

		if (wq_has_sleeper(&ctx->cq_wait)) {
			wake_up_interruptible(&ctx->cq_wait);
			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
		}
	}
}
static inline bool io_prep_async_work(struct io_kiocb *req,
				      struct io_kiocb **link)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	bool do_hashed = false;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file)
			do_hashed = true;
	} else {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
	if (def->needs_mm)
		req->work.flags |= IO_WQ_WORK_NEEDS_USER;

	*link = io_prep_linked_timeout(req);
	return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link;
	bool do_hashed;

	do_hashed = io_prep_async_work(req, &link);

	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
					req->flags);
	if (!do_hashed) {
		io_wq_enqueue(ctx->io_wq, &req->work);
	} else {
		io_wq_enqueue_hashed(ctx->io_wq, &req->work,
					file_inode(req->file));
	}

	if (link)
		io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
	int ret;

	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
	if (ret != -1) {
		atomic_inc(&req->ctx->cq_timeouts);
		list_del_init(&req->list);
		io_cqring_fill_event(req, 0);
		io_put_req(req);
	}
}
static void io_kill_timeouts(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req, *tmp;

	spin_lock_irq(&ctx->completion_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
		io_kill_timeout(req);
	spin_unlock_irq(&ctx->completion_lock);
}
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	while ((req = io_get_timeout_req(ctx)) != NULL)
		io_kill_timeout(req);

	__io_commit_cqring(ctx);

	while ((req = io_get_deferred_req(ctx)) != NULL) {
		req->flags |= REQ_F_IO_DRAINED;
		io_queue_async_work(req);
	}
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail;

	tail = ctx->cached_cq_tail;
	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
		return NULL;

	ctx->cached_cq_tail++;
	return &rings->cqes[tail & ctx->cq_mask];
}
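/*
 * Note on the full-ring test above, with a hedged example: head and tail
 * are free-running u32 counters, so "tail - head" is the number of posted
 * but unconsumed CQEs even across wraparound. E.g. with 128 cq_ring_entries,
 * tail == 130 and head == 2 gives 128, so the ring is full and NULL is
 * returned rather than overwriting unread completions.
 */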
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
	if (waitqueue_active(&ctx->sqo_wait))
		wake_up(&ctx->sqo_wait);
	if (ctx->cq_ev_fd)
		eventfd_signal(ctx->cq_ev_fd, 1);
}
960 /* Returns true if there are no backlogged entries after the flush */
961 static bool io_cqring_overflow_flush(struct io_ring_ctx
*ctx
, bool force
)
963 struct io_rings
*rings
= ctx
->rings
;
964 struct io_uring_cqe
*cqe
;
965 struct io_kiocb
*req
;
970 if (list_empty_careful(&ctx
->cq_overflow_list
))
972 if ((ctx
->cached_cq_tail
- READ_ONCE(rings
->cq
.head
) ==
973 rings
->cq_ring_entries
))
977 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
979 /* if force is set, the ring is going away. always drop after that */
981 ctx
->cq_overflow_flushed
= true;
984 while (!list_empty(&ctx
->cq_overflow_list
)) {
985 cqe
= io_get_cqring(ctx
);
989 req
= list_first_entry(&ctx
->cq_overflow_list
, struct io_kiocb
,
991 list_move(&req
->list
, &list
);
993 WRITE_ONCE(cqe
->user_data
, req
->user_data
);
994 WRITE_ONCE(cqe
->res
, req
->result
);
995 WRITE_ONCE(cqe
->flags
, 0);
997 WRITE_ONCE(ctx
->rings
->cq_overflow
,
998 atomic_inc_return(&ctx
->cached_cq_overflow
));
1002 io_commit_cqring(ctx
);
1004 clear_bit(0, &ctx
->sq_check_overflow
);
1005 clear_bit(0, &ctx
->cq_check_overflow
);
1007 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
1008 io_cqring_ev_posted(ctx
);
1010 while (!list_empty(&list
)) {
1011 req
= list_first_entry(&list
, struct io_kiocb
, list
);
1012 list_del(&req
->list
);
1019 static void io_cqring_fill_event(struct io_kiocb
*req
, long res
)
1021 struct io_ring_ctx
*ctx
= req
->ctx
;
1022 struct io_uring_cqe
*cqe
;
1024 trace_io_uring_complete(ctx
, req
->user_data
, res
);
1027 * If we can't get a cq entry, userspace overflowed the
1028 * submission (by quite a lot). Increment the overflow count in
1031 cqe
= io_get_cqring(ctx
);
1033 WRITE_ONCE(cqe
->user_data
, req
->user_data
);
1034 WRITE_ONCE(cqe
->res
, res
);
1035 WRITE_ONCE(cqe
->flags
, 0);
1036 } else if (ctx
->cq_overflow_flushed
) {
1037 WRITE_ONCE(ctx
->rings
->cq_overflow
,
1038 atomic_inc_return(&ctx
->cached_cq_overflow
));
1040 if (list_empty(&ctx
->cq_overflow_list
)) {
1041 set_bit(0, &ctx
->sq_check_overflow
);
1042 set_bit(0, &ctx
->cq_check_overflow
);
1044 refcount_inc(&req
->refs
);
1046 list_add_tail(&req
->list
, &ctx
->cq_overflow_list
);
1050 static void io_cqring_add_event(struct io_kiocb
*req
, long res
)
1052 struct io_ring_ctx
*ctx
= req
->ctx
;
1053 unsigned long flags
;
1055 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
1056 io_cqring_fill_event(req
, res
);
1057 io_commit_cqring(ctx
);
1058 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
1060 io_cqring_ev_posted(ctx
);
1063 static inline bool io_is_fallback_req(struct io_kiocb
*req
)
1065 return req
== (struct io_kiocb
*)
1066 ((unsigned long) req
->ctx
->fallback_req
& ~1UL);
1069 static struct io_kiocb
*io_get_fallback_req(struct io_ring_ctx
*ctx
)
1071 struct io_kiocb
*req
;
1073 req
= ctx
->fallback_req
;
1074 if (!test_and_set_bit_lock(0, (unsigned long *) ctx
->fallback_req
))
1080 static struct io_kiocb
*io_get_req(struct io_ring_ctx
*ctx
,
1081 struct io_submit_state
*state
)
1083 gfp_t gfp
= GFP_KERNEL
| __GFP_NOWARN
;
1084 struct io_kiocb
*req
;
1087 req
= kmem_cache_alloc(req_cachep
, gfp
);
1090 } else if (!state
->free_reqs
) {
1094 sz
= min_t(size_t, state
->ios_left
, ARRAY_SIZE(state
->reqs
));
1095 ret
= kmem_cache_alloc_bulk(req_cachep
, gfp
, sz
, state
->reqs
);
1098 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1099 * retry single alloc to be on the safe side.
1101 if (unlikely(ret
<= 0)) {
1102 state
->reqs
[0] = kmem_cache_alloc(req_cachep
, gfp
);
1103 if (!state
->reqs
[0])
1107 state
->free_reqs
= ret
- 1;
1109 req
= state
->reqs
[0];
1111 req
= state
->reqs
[state
->cur_req
];
1118 req
->ring_file
= NULL
;
1122 /* one is dropped after submission, the other at completion */
1123 refcount_set(&req
->refs
, 2);
1125 INIT_IO_WORK(&req
->work
, io_wq_submit_work
);
1128 req
= io_get_fallback_req(ctx
);
1131 percpu_ref_put(&ctx
->refs
);
1135 static void io_free_req_many(struct io_ring_ctx
*ctx
, void **reqs
, int *nr
)
1138 kmem_cache_free_bulk(req_cachep
, *nr
, reqs
);
1139 percpu_ref_put_many(&ctx
->refs
, *nr
);
1140 percpu_ref_put_many(&ctx
->file_data
->refs
, *nr
);
1145 static void __io_req_do_free(struct io_kiocb
*req
)
1147 if (likely(!io_is_fallback_req(req
)))
1148 kmem_cache_free(req_cachep
, req
);
1150 clear_bit_unlock(0, (unsigned long *) req
->ctx
->fallback_req
);
1153 static void __io_free_req(struct io_kiocb
*req
)
1155 struct io_ring_ctx
*ctx
= req
->ctx
;
1160 if (req
->flags
& REQ_F_FIXED_FILE
)
1161 percpu_ref_put(&ctx
->file_data
->refs
);
1165 if (req
->flags
& REQ_F_INFLIGHT
) {
1166 unsigned long flags
;
1168 spin_lock_irqsave(&ctx
->inflight_lock
, flags
);
1169 list_del(&req
->inflight_entry
);
1170 if (waitqueue_active(&ctx
->inflight_wait
))
1171 wake_up(&ctx
->inflight_wait
);
1172 spin_unlock_irqrestore(&ctx
->inflight_lock
, flags
);
1175 percpu_ref_put(&req
->ctx
->refs
);
1176 __io_req_do_free(req
);
1179 static bool io_link_cancel_timeout(struct io_kiocb
*req
)
1181 struct io_ring_ctx
*ctx
= req
->ctx
;
1184 ret
= hrtimer_try_to_cancel(&req
->io
->timeout
.timer
);
1186 io_cqring_fill_event(req
, -ECANCELED
);
1187 io_commit_cqring(ctx
);
1188 req
->flags
&= ~REQ_F_LINK
;
1196 static void io_req_link_next(struct io_kiocb
*req
, struct io_kiocb
**nxtptr
)
1198 struct io_ring_ctx
*ctx
= req
->ctx
;
1199 bool wake_ev
= false;
1201 /* Already got next link */
1202 if (req
->flags
& REQ_F_LINK_NEXT
)
1206 * The list should never be empty when we are called here. But could
1207 * potentially happen if the chain is messed up, check to be on the
1210 while (!list_empty(&req
->link_list
)) {
1211 struct io_kiocb
*nxt
= list_first_entry(&req
->link_list
,
1212 struct io_kiocb
, link_list
);
1214 if (unlikely((req
->flags
& REQ_F_LINK_TIMEOUT
) &&
1215 (nxt
->flags
& REQ_F_TIMEOUT
))) {
1216 list_del_init(&nxt
->link_list
);
1217 wake_ev
|= io_link_cancel_timeout(nxt
);
1218 req
->flags
&= ~REQ_F_LINK_TIMEOUT
;
1222 list_del_init(&req
->link_list
);
1223 if (!list_empty(&nxt
->link_list
))
1224 nxt
->flags
|= REQ_F_LINK
;
1229 req
->flags
|= REQ_F_LINK_NEXT
;
1231 io_cqring_ev_posted(ctx
);
1235 * Called if REQ_F_LINK is set, and we fail the head request
1237 static void io_fail_links(struct io_kiocb
*req
)
1239 struct io_ring_ctx
*ctx
= req
->ctx
;
1240 unsigned long flags
;
1242 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
1244 while (!list_empty(&req
->link_list
)) {
1245 struct io_kiocb
*link
= list_first_entry(&req
->link_list
,
1246 struct io_kiocb
, link_list
);
1248 list_del_init(&link
->link_list
);
1249 trace_io_uring_fail_link(req
, link
);
1251 if ((req
->flags
& REQ_F_LINK_TIMEOUT
) &&
1252 link
->opcode
== IORING_OP_LINK_TIMEOUT
) {
1253 io_link_cancel_timeout(link
);
1255 io_cqring_fill_event(link
, -ECANCELED
);
1256 __io_double_put_req(link
);
1258 req
->flags
&= ~REQ_F_LINK_TIMEOUT
;
1261 io_commit_cqring(ctx
);
1262 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
1263 io_cqring_ev_posted(ctx
);
1266 static void io_req_find_next(struct io_kiocb
*req
, struct io_kiocb
**nxt
)
1268 if (likely(!(req
->flags
& REQ_F_LINK
)))
1272 * If LINK is set, we have dependent requests in this chain. If we
1273 * didn't fail this request, queue the first one up, moving any other
1274 * dependencies to the next request. In case of failure, fail the rest
1277 if (req
->flags
& REQ_F_FAIL_LINK
) {
1279 } else if ((req
->flags
& (REQ_F_LINK_TIMEOUT
| REQ_F_COMP_LOCKED
)) ==
1280 REQ_F_LINK_TIMEOUT
) {
1281 struct io_ring_ctx
*ctx
= req
->ctx
;
1282 unsigned long flags
;
1285 * If this is a timeout link, we could be racing with the
1286 * timeout timer. Grab the completion lock for this case to
1287 * protect against that.
1289 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
1290 io_req_link_next(req
, nxt
);
1291 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
1293 io_req_link_next(req
, nxt
);
1297 static void io_free_req(struct io_kiocb
*req
)
1299 struct io_kiocb
*nxt
= NULL
;
1301 io_req_find_next(req
, &nxt
);
1305 io_queue_async_work(nxt
);
1309 * Drop reference to request, return next in chain (if there is one) if this
1310 * was the last reference to this request.
1312 __attribute__((nonnull
))
1313 static void io_put_req_find_next(struct io_kiocb
*req
, struct io_kiocb
**nxtptr
)
1315 io_req_find_next(req
, nxtptr
);
1317 if (refcount_dec_and_test(&req
->refs
))
1321 static void io_put_req(struct io_kiocb
*req
)
1323 if (refcount_dec_and_test(&req
->refs
))
1328 * Must only be used if we don't need to care about links, usually from
1329 * within the completion handling itself.
1331 static void __io_double_put_req(struct io_kiocb
*req
)
1333 /* drop both submit and complete references */
1334 if (refcount_sub_and_test(2, &req
->refs
))
1338 static void io_double_put_req(struct io_kiocb
*req
)
1340 /* drop both submit and complete references */
1341 if (refcount_sub_and_test(2, &req
->refs
))
1345 static unsigned io_cqring_events(struct io_ring_ctx
*ctx
, bool noflush
)
1347 struct io_rings
*rings
= ctx
->rings
;
1349 if (test_bit(0, &ctx
->cq_check_overflow
)) {
1351 * noflush == true is from the waitqueue handler, just ensure
1352 * we wake up the task, and the next invocation will flush the
1353 * entries. We cannot safely to it from here.
1355 if (noflush
&& !list_empty(&ctx
->cq_overflow_list
))
1358 io_cqring_overflow_flush(ctx
, false);
1361 /* See comment at the top of this file */
1363 return ctx
->cached_cq_tail
- READ_ONCE(rings
->cq
.head
);
1366 static inline unsigned int io_sqring_entries(struct io_ring_ctx
*ctx
)
1368 struct io_rings
*rings
= ctx
->rings
;
1370 /* make sure SQ entry isn't read before tail */
1371 return smp_load_acquire(&rings
->sq
.tail
) - ctx
->cached_sq_head
;
1374 static inline bool io_req_multi_free(struct io_kiocb
*req
)
1377 * If we're not using fixed files, we have to pair the completion part
1378 * with the file put. Use regular completions for those, only batch
1379 * free for fixed file and non-linked commands.
1381 if (((req
->flags
& (REQ_F_FIXED_FILE
|REQ_F_LINK
)) == REQ_F_FIXED_FILE
)
1382 && !io_is_fallback_req(req
) && !req
->io
)
1389 * Find and free completed poll iocbs
1391 static void io_iopoll_complete(struct io_ring_ctx
*ctx
, unsigned int *nr_events
,
1392 struct list_head
*done
)
1394 void *reqs
[IO_IOPOLL_BATCH
];
1395 struct io_kiocb
*req
;
1399 while (!list_empty(done
)) {
1400 req
= list_first_entry(done
, struct io_kiocb
, list
);
1401 list_del(&req
->list
);
1403 io_cqring_fill_event(req
, req
->result
);
1406 if (refcount_dec_and_test(&req
->refs
)) {
1407 if (io_req_multi_free(req
)) {
1408 reqs
[to_free
++] = req
;
1409 if (to_free
== ARRAY_SIZE(reqs
))
1410 io_free_req_many(ctx
, reqs
, &to_free
);
1417 io_commit_cqring(ctx
);
1418 io_free_req_many(ctx
, reqs
, &to_free
);
1421 static int io_do_iopoll(struct io_ring_ctx
*ctx
, unsigned int *nr_events
,
1424 struct io_kiocb
*req
, *tmp
;
1430 * Only spin for completions if we don't have multiple devices hanging
1431 * off our complete list, and we're under the requested amount.
1433 spin
= !ctx
->poll_multi_file
&& *nr_events
< min
;
1436 list_for_each_entry_safe(req
, tmp
, &ctx
->poll_list
, list
) {
1437 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
1440 * Move completed entries to our local list. If we find a
1441 * request that requires polling, break out and complete
1442 * the done list first, if we have entries there.
1444 if (req
->flags
& REQ_F_IOPOLL_COMPLETED
) {
1445 list_move_tail(&req
->list
, &done
);
1448 if (!list_empty(&done
))
1451 ret
= kiocb
->ki_filp
->f_op
->iopoll(kiocb
, spin
);
1460 if (!list_empty(&done
))
1461 io_iopoll_complete(ctx
, nr_events
, &done
);
1467 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1468 * non-spinning poll check - we'll still enter the driver poll loop, but only
1469 * as a non-spinning completion check.
1471 static int io_iopoll_getevents(struct io_ring_ctx
*ctx
, unsigned int *nr_events
,
1474 while (!list_empty(&ctx
->poll_list
) && !need_resched()) {
1477 ret
= io_do_iopoll(ctx
, nr_events
, min
);
1480 if (!min
|| *nr_events
>= min
)
1488 * We can't just wait for polled events to come to us, we have to actively
1489 * find and complete them.
1491 static void io_iopoll_reap_events(struct io_ring_ctx
*ctx
)
1493 if (!(ctx
->flags
& IORING_SETUP_IOPOLL
))
1496 mutex_lock(&ctx
->uring_lock
);
1497 while (!list_empty(&ctx
->poll_list
)) {
1498 unsigned int nr_events
= 0;
1500 io_iopoll_getevents(ctx
, &nr_events
, 1);
1503 * Ensure we allow local-to-the-cpu processing to take place,
1504 * in this case we need to ensure that we reap all events.
1508 mutex_unlock(&ctx
->uring_lock
);
1511 static int __io_iopoll_check(struct io_ring_ctx
*ctx
, unsigned *nr_events
,
1514 int iters
= 0, ret
= 0;
1520 * Don't enter poll loop if we already have events pending.
1521 * If we do, we can potentially be spinning for commands that
1522 * already triggered a CQE (eg in error).
1524 if (io_cqring_events(ctx
, false))
1528 * If a submit got punted to a workqueue, we can have the
1529 * application entering polling for a command before it gets
1530 * issued. That app will hold the uring_lock for the duration
1531 * of the poll right here, so we need to take a breather every
1532 * now and then to ensure that the issue has a chance to add
1533 * the poll to the issued list. Otherwise we can spin here
1534 * forever, while the workqueue is stuck trying to acquire the
1537 if (!(++iters
& 7)) {
1538 mutex_unlock(&ctx
->uring_lock
);
1539 mutex_lock(&ctx
->uring_lock
);
1542 if (*nr_events
< min
)
1543 tmin
= min
- *nr_events
;
1545 ret
= io_iopoll_getevents(ctx
, nr_events
, tmin
);
1549 } while (min
&& !*nr_events
&& !need_resched());
1554 static int io_iopoll_check(struct io_ring_ctx
*ctx
, unsigned *nr_events
,
1560 * We disallow the app entering submit/complete with polling, but we
1561 * still need to lock the ring to prevent racing with polled issue
1562 * that got punted to a workqueue.
1564 mutex_lock(&ctx
->uring_lock
);
1565 ret
= __io_iopoll_check(ctx
, nr_events
, min
);
1566 mutex_unlock(&ctx
->uring_lock
);
1570 static void kiocb_end_write(struct io_kiocb
*req
)
1573 * Tell lockdep we inherited freeze protection from submission
1576 if (req
->flags
& REQ_F_ISREG
) {
1577 struct inode
*inode
= file_inode(req
->file
);
1579 __sb_writers_acquired(inode
->i_sb
, SB_FREEZE_WRITE
);
1581 file_end_write(req
->file
);
1584 static inline void req_set_fail_links(struct io_kiocb
*req
)
1586 if ((req
->flags
& (REQ_F_LINK
| REQ_F_HARDLINK
)) == REQ_F_LINK
)
1587 req
->flags
|= REQ_F_FAIL_LINK
;
1590 static void io_complete_rw_common(struct kiocb
*kiocb
, long res
)
1592 struct io_kiocb
*req
= container_of(kiocb
, struct io_kiocb
, rw
.kiocb
);
1594 if (kiocb
->ki_flags
& IOCB_WRITE
)
1595 kiocb_end_write(req
);
1597 if (res
!= req
->result
)
1598 req_set_fail_links(req
);
1599 io_cqring_add_event(req
, res
);
1602 static void io_complete_rw(struct kiocb
*kiocb
, long res
, long res2
)
1604 struct io_kiocb
*req
= container_of(kiocb
, struct io_kiocb
, rw
.kiocb
);
1606 io_complete_rw_common(kiocb
, res
);
1610 static struct io_kiocb
*__io_complete_rw(struct kiocb
*kiocb
, long res
)
1612 struct io_kiocb
*req
= container_of(kiocb
, struct io_kiocb
, rw
.kiocb
);
1613 struct io_kiocb
*nxt
= NULL
;
1615 io_complete_rw_common(kiocb
, res
);
1616 io_put_req_find_next(req
, &nxt
);
1621 static void io_complete_rw_iopoll(struct kiocb
*kiocb
, long res
, long res2
)
1623 struct io_kiocb
*req
= container_of(kiocb
, struct io_kiocb
, rw
.kiocb
);
1625 if (kiocb
->ki_flags
& IOCB_WRITE
)
1626 kiocb_end_write(req
);
1628 if (res
!= req
->result
)
1629 req_set_fail_links(req
);
1632 req
->flags
|= REQ_F_IOPOLL_COMPLETED
;
1636 * After the iocb has been issued, it's safe to be found on the poll list.
1637 * Adding the kiocb to the list AFTER submission ensures that we don't
1638 * find it from a io_iopoll_getevents() thread before the issuer is done
1639 * accessing the kiocb cookie.
1641 static void io_iopoll_req_issued(struct io_kiocb
*req
)
1643 struct io_ring_ctx
*ctx
= req
->ctx
;
1646 * Track whether we have multiple files in our lists. This will impact
1647 * how we do polling eventually, not spinning if we're on potentially
1648 * different devices.
1650 if (list_empty(&ctx
->poll_list
)) {
1651 ctx
->poll_multi_file
= false;
1652 } else if (!ctx
->poll_multi_file
) {
1653 struct io_kiocb
*list_req
;
1655 list_req
= list_first_entry(&ctx
->poll_list
, struct io_kiocb
,
1657 if (list_req
->file
!= req
->file
)
1658 ctx
->poll_multi_file
= true;
1662 * For fast devices, IO may have already completed. If it has, add
1663 * it to the front so we find it first.
1665 if (req
->flags
& REQ_F_IOPOLL_COMPLETED
)
1666 list_add(&req
->list
, &ctx
->poll_list
);
1668 list_add_tail(&req
->list
, &ctx
->poll_list
);
1671 static void io_file_put(struct io_submit_state
*state
)
1674 int diff
= state
->has_refs
- state
->used_refs
;
1677 fput_many(state
->file
, diff
);
1683 * Get as many references to a file as we have IOs left in this submission,
1684 * assuming most submissions are for one file, or at least that each file
1685 * has more than one submission.
1687 static struct file
*io_file_get(struct io_submit_state
*state
, int fd
)
1693 if (state
->fd
== fd
) {
1700 state
->file
= fget_many(fd
, state
->ios_left
);
1705 state
->has_refs
= state
->ios_left
;
1706 state
->used_refs
= 1;
1712 * If we tracked the file through the SCM inflight mechanism, we could support
1713 * any file. For now, just ensure that anything potentially problematic is done
1716 static bool io_file_supports_async(struct file
*file
)
1718 umode_t mode
= file_inode(file
)->i_mode
;
1720 if (S_ISBLK(mode
) || S_ISCHR(mode
) || S_ISSOCK(mode
))
1722 if (S_ISREG(mode
) && file
->f_op
!= &io_uring_fops
)
1728 static int io_prep_rw(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
1729 bool force_nonblock
)
1731 struct io_ring_ctx
*ctx
= req
->ctx
;
1732 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
1739 if (S_ISREG(file_inode(req
->file
)->i_mode
))
1740 req
->flags
|= REQ_F_ISREG
;
1742 kiocb
->ki_pos
= READ_ONCE(sqe
->off
);
1743 if (kiocb
->ki_pos
== -1 && !(req
->file
->f_mode
& FMODE_STREAM
)) {
1744 req
->flags
|= REQ_F_CUR_POS
;
1745 kiocb
->ki_pos
= req
->file
->f_pos
;
1747 kiocb
->ki_flags
= iocb_flags(kiocb
->ki_filp
);
1748 kiocb
->ki_hint
= ki_hint_validate(file_write_hint(kiocb
->ki_filp
));
1750 ioprio
= READ_ONCE(sqe
->ioprio
);
1752 ret
= ioprio_check_cap(ioprio
);
1756 kiocb
->ki_ioprio
= ioprio
;
1758 kiocb
->ki_ioprio
= get_current_ioprio();
1760 ret
= kiocb_set_rw_flags(kiocb
, READ_ONCE(sqe
->rw_flags
));
1764 /* don't allow async punt if RWF_NOWAIT was requested */
1765 if ((kiocb
->ki_flags
& IOCB_NOWAIT
) ||
1766 (req
->file
->f_flags
& O_NONBLOCK
))
1767 req
->flags
|= REQ_F_NOWAIT
;
1770 kiocb
->ki_flags
|= IOCB_NOWAIT
;
1772 if (ctx
->flags
& IORING_SETUP_IOPOLL
) {
1773 if (!(kiocb
->ki_flags
& IOCB_DIRECT
) ||
1774 !kiocb
->ki_filp
->f_op
->iopoll
)
1777 kiocb
->ki_flags
|= IOCB_HIPRI
;
1778 kiocb
->ki_complete
= io_complete_rw_iopoll
;
1781 if (kiocb
->ki_flags
& IOCB_HIPRI
)
1783 kiocb
->ki_complete
= io_complete_rw
;
1786 req
->rw
.addr
= READ_ONCE(sqe
->addr
);
1787 req
->rw
.len
= READ_ONCE(sqe
->len
);
1788 /* we own ->private, reuse it for the buffer index */
1789 req
->rw
.kiocb
.private = (void *) (unsigned long)
1790 READ_ONCE(sqe
->buf_index
);
1794 static inline void io_rw_done(struct kiocb
*kiocb
, ssize_t ret
)
1800 case -ERESTARTNOINTR
:
1801 case -ERESTARTNOHAND
:
1802 case -ERESTART_RESTARTBLOCK
:
1804 * We can't just restart the syscall, since previously
1805 * submitted sqes may already be in progress. Just fail this
1811 kiocb
->ki_complete(kiocb
, ret
, 0);
1815 static void kiocb_done(struct kiocb
*kiocb
, ssize_t ret
, struct io_kiocb
**nxt
,
1818 struct io_kiocb
*req
= container_of(kiocb
, struct io_kiocb
, rw
.kiocb
);
1820 if (req
->flags
& REQ_F_CUR_POS
)
1821 req
->file
->f_pos
= kiocb
->ki_pos
;
1822 if (in_async
&& ret
>= 0 && kiocb
->ki_complete
== io_complete_rw
)
1823 *nxt
= __io_complete_rw(kiocb
, ret
);
1825 io_rw_done(kiocb
, ret
);
1828 static ssize_t
io_import_fixed(struct io_kiocb
*req
, int rw
,
1829 struct iov_iter
*iter
)
1831 struct io_ring_ctx
*ctx
= req
->ctx
;
1832 size_t len
= req
->rw
.len
;
1833 struct io_mapped_ubuf
*imu
;
1834 unsigned index
, buf_index
;
1838 /* attempt to use fixed buffers without having provided iovecs */
1839 if (unlikely(!ctx
->user_bufs
))
1842 buf_index
= (unsigned long) req
->rw
.kiocb
.private;
1843 if (unlikely(buf_index
>= ctx
->nr_user_bufs
))
1846 index
= array_index_nospec(buf_index
, ctx
->nr_user_bufs
);
1847 imu
= &ctx
->user_bufs
[index
];
1848 buf_addr
= req
->rw
.addr
;
1851 if (buf_addr
+ len
< buf_addr
)
1853 /* not inside the mapped region */
1854 if (buf_addr
< imu
->ubuf
|| buf_addr
+ len
> imu
->ubuf
+ imu
->len
)
1858 * May not be a start of buffer, set size appropriately
1859 * and advance us to the beginning.
1861 offset
= buf_addr
- imu
->ubuf
;
1862 iov_iter_bvec(iter
, rw
, imu
->bvec
, imu
->nr_bvecs
, offset
+ len
);
1866 * Don't use iov_iter_advance() here, as it's really slow for
1867 * using the latter parts of a big fixed buffer - it iterates
1868 * over each segment manually. We can cheat a bit here, because
1871 * 1) it's a BVEC iter, we set it up
1872 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1873 * first and last bvec
1875 * So just find our index, and adjust the iterator afterwards.
1876 * If the offset is within the first bvec (or the whole first
1877 * bvec, just use iov_iter_advance(). This makes it easier
1878 * since we can just skip the first segment, which may not
1879 * be PAGE_SIZE aligned.
1881 const struct bio_vec
*bvec
= imu
->bvec
;
1883 if (offset
<= bvec
->bv_len
) {
1884 iov_iter_advance(iter
, offset
);
1886 unsigned long seg_skip
;
1888 /* skip first vec */
1889 offset
-= bvec
->bv_len
;
1890 seg_skip
= 1 + (offset
>> PAGE_SHIFT
);
1892 iter
->bvec
= bvec
+ seg_skip
;
1893 iter
->nr_segs
-= seg_skip
;
1894 iter
->count
-= bvec
->bv_len
+ offset
;
1895 iter
->iov_offset
= offset
& ~PAGE_MASK
;
1902 static ssize_t
io_import_iovec(int rw
, struct io_kiocb
*req
,
1903 struct iovec
**iovec
, struct iov_iter
*iter
)
1905 void __user
*buf
= u64_to_user_ptr(req
->rw
.addr
);
1906 size_t sqe_len
= req
->rw
.len
;
1909 opcode
= req
->opcode
;
1910 if (opcode
== IORING_OP_READ_FIXED
|| opcode
== IORING_OP_WRITE_FIXED
) {
1912 return io_import_fixed(req
, rw
, iter
);
1915 /* buffer index only valid with fixed read/write */
1916 if (req
->rw
.kiocb
.private)
1919 if (opcode
== IORING_OP_READ
|| opcode
== IORING_OP_WRITE
) {
1921 ret
= import_single_range(rw
, buf
, sqe_len
, *iovec
, iter
);
1927 struct io_async_rw
*iorw
= &req
->io
->rw
;
1930 iov_iter_init(iter
, rw
, *iovec
, iorw
->nr_segs
, iorw
->size
);
1931 if (iorw
->iov
== iorw
->fast_iov
)
1939 #ifdef CONFIG_COMPAT
1940 if (req
->ctx
->compat
)
1941 return compat_import_iovec(rw
, buf
, sqe_len
, UIO_FASTIOV
,
1945 return import_iovec(rw
, buf
, sqe_len
, UIO_FASTIOV
, iovec
, iter
);
1949 * For files that don't have ->read_iter() and ->write_iter(), handle them
1950 * by looping over ->read() or ->write() manually.
1952 static ssize_t
loop_rw_iter(int rw
, struct file
*file
, struct kiocb
*kiocb
,
1953 struct iov_iter
*iter
)
1958 * Don't support polled IO through this interface, and we can't
1959 * support non-blocking either. For the latter, this just causes
1960 * the kiocb to be handled from an async context.
1962 if (kiocb
->ki_flags
& IOCB_HIPRI
)
1964 if (kiocb
->ki_flags
& IOCB_NOWAIT
)
1967 while (iov_iter_count(iter
)) {
1971 if (!iov_iter_is_bvec(iter
)) {
1972 iovec
= iov_iter_iovec(iter
);
1974 /* fixed buffers import bvec */
1975 iovec
.iov_base
= kmap(iter
->bvec
->bv_page
)
1977 iovec
.iov_len
= min(iter
->count
,
1978 iter
->bvec
->bv_len
- iter
->iov_offset
);
1982 nr
= file
->f_op
->read(file
, iovec
.iov_base
,
1983 iovec
.iov_len
, &kiocb
->ki_pos
);
1985 nr
= file
->f_op
->write(file
, iovec
.iov_base
,
1986 iovec
.iov_len
, &kiocb
->ki_pos
);
1989 if (iov_iter_is_bvec(iter
))
1990 kunmap(iter
->bvec
->bv_page
);
1998 if (nr
!= iovec
.iov_len
)
2000 iov_iter_advance(iter
, nr
);
2006 static void io_req_map_rw(struct io_kiocb
*req
, ssize_t io_size
,
2007 struct iovec
*iovec
, struct iovec
*fast_iov
,
2008 struct iov_iter
*iter
)
2010 req
->io
->rw
.nr_segs
= iter
->nr_segs
;
2011 req
->io
->rw
.size
= io_size
;
2012 req
->io
->rw
.iov
= iovec
;
2013 if (!req
->io
->rw
.iov
) {
2014 req
->io
->rw
.iov
= req
->io
->rw
.fast_iov
;
2015 memcpy(req
->io
->rw
.iov
, fast_iov
,
2016 sizeof(struct iovec
) * iter
->nr_segs
);
2020 static int io_alloc_async_ctx(struct io_kiocb
*req
)
2022 if (!io_op_defs
[req
->opcode
].async_ctx
)
2024 req
->io
= kmalloc(sizeof(*req
->io
), GFP_KERNEL
);
2025 return req
->io
== NULL
;
2028 static void io_rw_async(struct io_wq_work
**workptr
)
2030 struct io_kiocb
*req
= container_of(*workptr
, struct io_kiocb
, work
);
2031 struct iovec
*iov
= NULL
;
2033 if (req
->io
->rw
.iov
!= req
->io
->rw
.fast_iov
)
2034 iov
= req
->io
->rw
.iov
;
2035 io_wq_submit_work(workptr
);
2039 static int io_setup_async_rw(struct io_kiocb
*req
, ssize_t io_size
,
2040 struct iovec
*iovec
, struct iovec
*fast_iov
,
2041 struct iov_iter
*iter
)
2043 if (req
->opcode
== IORING_OP_READ_FIXED
||
2044 req
->opcode
== IORING_OP_WRITE_FIXED
)
2046 if (!req
->io
&& io_alloc_async_ctx(req
))
2049 io_req_map_rw(req
, io_size
, iovec
, fast_iov
, iter
);
2050 req
->work
.func
= io_rw_async
;
2054 static int io_read_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
2055 bool force_nonblock
)
2057 struct io_async_ctx
*io
;
2058 struct iov_iter iter
;
2061 ret
= io_prep_rw(req
, sqe
, force_nonblock
);
2065 if (unlikely(!(req
->file
->f_mode
& FMODE_READ
)))
2072 io
->rw
.iov
= io
->rw
.fast_iov
;
2074 ret
= io_import_iovec(READ
, req
, &io
->rw
.iov
, &iter
);
2079 io_req_map_rw(req
, ret
, io
->rw
.iov
, io
->rw
.fast_iov
, &iter
);
2083 static int io_read(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2084 bool force_nonblock
)
2086 struct iovec inline_vecs
[UIO_FASTIOV
], *iovec
= inline_vecs
;
2087 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
2088 struct iov_iter iter
;
2090 ssize_t io_size
, ret
;
2092 ret
= io_import_iovec(READ
, req
, &iovec
, &iter
);
2096 /* Ensure we clear previously set non-block flag */
2097 if (!force_nonblock
)
2098 req
->rw
.kiocb
.ki_flags
&= ~IOCB_NOWAIT
;
2102 if (req
->flags
& REQ_F_LINK
)
2103 req
->result
= io_size
;
2106 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2107 * we know to async punt it even if it was opened O_NONBLOCK
2109 if (force_nonblock
&& !io_file_supports_async(req
->file
)) {
2110 req
->flags
|= REQ_F_MUST_PUNT
;
2114 iov_count
= iov_iter_count(&iter
);
2115 ret
= rw_verify_area(READ
, req
->file
, &kiocb
->ki_pos
, iov_count
);
2119 if (req
->file
->f_op
->read_iter
)
2120 ret2
= call_read_iter(req
->file
, kiocb
, &iter
);
2122 ret2
= loop_rw_iter(READ
, req
->file
, kiocb
, &iter
);
2124 /* Catch -EAGAIN return for forced non-blocking submission */
2125 if (!force_nonblock
|| ret2
!= -EAGAIN
) {
2126 kiocb_done(kiocb
, ret2
, nxt
, req
->in_async
);
2129 ret
= io_setup_async_rw(req
, io_size
, iovec
,
2130 inline_vecs
, &iter
);
2137 if (!io_wq_current_is_worker())
2142 static int io_write_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
2143 bool force_nonblock
)
2145 struct io_async_ctx
*io
;
2146 struct iov_iter iter
;
2149 ret
= io_prep_rw(req
, sqe
, force_nonblock
);
2153 if (unlikely(!(req
->file
->f_mode
& FMODE_WRITE
)))
2160 io
->rw
.iov
= io
->rw
.fast_iov
;
2162 ret
= io_import_iovec(WRITE
, req
, &io
->rw
.iov
, &iter
);
2167 io_req_map_rw(req
, ret
, io
->rw
.iov
, io
->rw
.fast_iov
, &iter
);
2171 static int io_write(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2172 bool force_nonblock
)
2174 struct iovec inline_vecs
[UIO_FASTIOV
], *iovec
= inline_vecs
;
2175 struct kiocb
*kiocb
= &req
->rw
.kiocb
;
2176 struct iov_iter iter
;
2178 ssize_t ret
, io_size
;
2180 ret
= io_import_iovec(WRITE
, req
, &iovec
, &iter
);
2184 /* Ensure we clear previously set non-block flag */
2185 if (!force_nonblock
)
2186 req
->rw
.kiocb
.ki_flags
&= ~IOCB_NOWAIT
;
2190 if (req
->flags
& REQ_F_LINK
)
2191 req
->result
= io_size
;
2194 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2195 * we know to async punt it even if it was opened O_NONBLOCK
2197 if (force_nonblock
&& !io_file_supports_async(req
->file
)) {
2198 req
->flags
|= REQ_F_MUST_PUNT
;
2202 /* file path doesn't support NOWAIT for non-direct_IO */
2203 if (force_nonblock
&& !(kiocb
->ki_flags
& IOCB_DIRECT
) &&
2204 (req
->flags
& REQ_F_ISREG
))
2207 iov_count
= iov_iter_count(&iter
);
2208 ret
= rw_verify_area(WRITE
, req
->file
, &kiocb
->ki_pos
, iov_count
);
2213 * Open-code file_start_write here to grab freeze protection,
2214 * which will be released by another thread in
2215 * io_complete_rw(). Fool lockdep by telling it the lock got
2216 * released so that it doesn't complain about the held lock when
2217 * we return to userspace.
2219 if (req
->flags
& REQ_F_ISREG
) {
2220 __sb_start_write(file_inode(req
->file
)->i_sb
,
2221 SB_FREEZE_WRITE
, true);
2222 __sb_writers_release(file_inode(req
->file
)->i_sb
,
2225 kiocb
->ki_flags
|= IOCB_WRITE
;
2227 if (req
->file
->f_op
->write_iter
)
2228 ret2
= call_write_iter(req
->file
, kiocb
, &iter
);
2230 ret2
= loop_rw_iter(WRITE
, req
->file
, kiocb
, &iter
);
2231 if (!force_nonblock
|| ret2
!= -EAGAIN
) {
2232 kiocb_done(kiocb
, ret2
, nxt
, req
->in_async
);
2235 ret
= io_setup_async_rw(req
, io_size
, iovec
,
2236 inline_vecs
, &iter
);
2243 if (!io_wq_current_is_worker())
2249 * IORING_OP_NOP just posts a completion event, nothing else.
2251 static int io_nop(struct io_kiocb
*req
)
2253 struct io_ring_ctx
*ctx
= req
->ctx
;
2255 if (unlikely(ctx
->flags
& IORING_SETUP_IOPOLL
))
2258 io_cqring_add_event(req
, 0);
2263 static int io_prep_fsync(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
2265 struct io_ring_ctx
*ctx
= req
->ctx
;
2270 if (unlikely(ctx
->flags
& IORING_SETUP_IOPOLL
))
2272 if (unlikely(sqe
->addr
|| sqe
->ioprio
|| sqe
->buf_index
))
2275 req
->sync
.flags
= READ_ONCE(sqe
->fsync_flags
);
2276 if (unlikely(req
->sync
.flags
& ~IORING_FSYNC_DATASYNC
))
2279 req
->sync
.off
= READ_ONCE(sqe
->off
);
2280 req
->sync
.len
= READ_ONCE(sqe
->len
);
2284 static bool io_req_cancelled(struct io_kiocb
*req
)
2286 if (req
->work
.flags
& IO_WQ_WORK_CANCEL
) {
2287 req_set_fail_links(req
);
2288 io_cqring_add_event(req
, -ECANCELED
);
2296 static void io_link_work_cb(struct io_wq_work
**workptr
)
2298 struct io_wq_work
*work
= *workptr
;
2299 struct io_kiocb
*link
= work
->data
;
2301 io_queue_linked_timeout(link
);
2302 work
->func
= io_wq_submit_work
;
2305 static void io_wq_assign_next(struct io_wq_work
**workptr
, struct io_kiocb
*nxt
)
2307 struct io_kiocb
*link
;
2309 io_prep_async_work(nxt
, &link
);
2310 *workptr
= &nxt
->work
;
2312 nxt
->work
.flags
|= IO_WQ_WORK_CB
;
2313 nxt
->work
.func
= io_link_work_cb
;
2314 nxt
->work
.data
= link
;
2318 static void io_fsync_finish(struct io_wq_work
**workptr
)
2320 struct io_kiocb
*req
= container_of(*workptr
, struct io_kiocb
, work
);
2321 loff_t end
= req
->sync
.off
+ req
->sync
.len
;
2322 struct io_kiocb
*nxt
= NULL
;
2325 if (io_req_cancelled(req
))
2328 ret
= vfs_fsync_range(req
->file
, req
->sync
.off
,
2329 end
> 0 ? end
: LLONG_MAX
,
2330 req
->sync
.flags
& IORING_FSYNC_DATASYNC
);
2332 req_set_fail_links(req
);
2333 io_cqring_add_event(req
, ret
);
2334 io_put_req_find_next(req
, &nxt
);
2336 io_wq_assign_next(workptr
, nxt
);
2339 static int io_fsync(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2340 bool force_nonblock
)
2342 struct io_wq_work
*work
, *old_work
;
2344 /* fsync always requires a blocking context */
2345 if (force_nonblock
) {
2347 req
->work
.func
= io_fsync_finish
;
2351 work
= old_work
= &req
->work
;
2352 io_fsync_finish(&work
);
2353 if (work
&& work
!= old_work
)
2354 *nxt
= container_of(work
, struct io_kiocb
, work
);
2358 static void io_fallocate_finish(struct io_wq_work
**workptr
)
2360 struct io_kiocb
*req
= container_of(*workptr
, struct io_kiocb
, work
);
2361 struct io_kiocb
*nxt
= NULL
;
2364 ret
= vfs_fallocate(req
->file
, req
->sync
.mode
, req
->sync
.off
,
2367 req_set_fail_links(req
);
2368 io_cqring_add_event(req
, ret
);
2369 io_put_req_find_next(req
, &nxt
);
2371 io_wq_assign_next(workptr
, nxt
);
2374 static int io_fallocate_prep(struct io_kiocb
*req
,
2375 const struct io_uring_sqe
*sqe
)
2377 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->rw_flags
)
2380 req
->sync
.off
= READ_ONCE(sqe
->off
);
2381 req
->sync
.len
= READ_ONCE(sqe
->addr
);
2382 req
->sync
.mode
= READ_ONCE(sqe
->len
);
2386 static int io_fallocate(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2387 bool force_nonblock
)
2389 struct io_wq_work
*work
, *old_work
;
2391 /* fallocate always requiring blocking context */
2392 if (force_nonblock
) {
2394 req
->work
.func
= io_fallocate_finish
;
2398 work
= old_work
= &req
->work
;
2399 io_fallocate_finish(&work
);
2400 if (work
&& work
!= old_work
)
2401 *nxt
= container_of(work
, struct io_kiocb
, work
);
2406 static int io_openat_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
2410 if (sqe
->ioprio
|| sqe
->buf_index
)
2413 req
->open
.dfd
= READ_ONCE(sqe
->fd
);
2414 req
->open
.mode
= READ_ONCE(sqe
->len
);
2415 req
->open
.fname
= u64_to_user_ptr(READ_ONCE(sqe
->addr
));
2416 req
->open
.flags
= READ_ONCE(sqe
->open_flags
);
2418 req
->open
.filename
= getname(req
->open
.fname
);
2419 if (IS_ERR(req
->open
.filename
)) {
2420 ret
= PTR_ERR(req
->open
.filename
);
2421 req
->open
.filename
= NULL
;
2428 static int io_openat(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2429 bool force_nonblock
)
2431 struct open_flags op
;
2432 struct open_how how
;
2436 if (force_nonblock
) {
2437 req
->work
.flags
|= IO_WQ_WORK_NEEDS_FILES
;
2441 how
= build_open_how(req
->open
.flags
, req
->open
.mode
);
2442 ret
= build_open_flags(&how
, &op
);
2446 ret
= get_unused_fd_flags(how
.flags
);
2450 file
= do_filp_open(req
->open
.dfd
, req
->open
.filename
, &op
);
2453 ret
= PTR_ERR(file
);
2455 fsnotify_open(file
);
2456 fd_install(ret
, file
);
2459 putname(req
->open
.filename
);
2461 req_set_fail_links(req
);
2462 io_cqring_add_event(req
, ret
);
2463 io_put_req_find_next(req
, nxt
);
2467 static int io_madvise_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
2469 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2470 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->off
)
2473 req
->madvise
.addr
= READ_ONCE(sqe
->addr
);
2474 req
->madvise
.len
= READ_ONCE(sqe
->len
);
2475 req
->madvise
.advice
= READ_ONCE(sqe
->fadvise_advice
);
2482 static int io_madvise(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2483 bool force_nonblock
)
2485 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2486 struct io_madvise
*ma
= &req
->madvise
;
2492 ret
= do_madvise(ma
->addr
, ma
->len
, ma
->advice
);
2494 req_set_fail_links(req
);
2495 io_cqring_add_event(req
, ret
);
2496 io_put_req_find_next(req
, nxt
);
2503 static int io_fadvise_prep(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
2505 if (sqe
->ioprio
|| sqe
->buf_index
|| sqe
->addr
)
2508 req
->fadvise
.offset
= READ_ONCE(sqe
->off
);
2509 req
->fadvise
.len
= READ_ONCE(sqe
->len
);
2510 req
->fadvise
.advice
= READ_ONCE(sqe
->fadvise_advice
);
2514 static int io_fadvise(struct io_kiocb
*req
, struct io_kiocb
**nxt
,
2515 bool force_nonblock
)
2517 struct io_fadvise
*fa
= &req
->fadvise
;
2520 /* DONTNEED may block, others _should_ not */
2521 if (fa
->advice
== POSIX_FADV_DONTNEED
&& force_nonblock
)
2524 ret
= vfs_fadvise(req
->file
, fa
->offset
, fa
->len
, fa
->advice
);
2526 req_set_fail_links(req
);
2527 io_cqring_add_event(req
, ret
);
2528 io_put_req_find_next(req
, nxt
);

static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	unsigned lookup_flags;

	if (sqe->ioprio || sqe->buf_index)

	req->open.dfd = READ_ONCE(sqe->fd);
	req->open.mask = READ_ONCE(sqe->len);
	req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
	req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	req->open.flags = READ_ONCE(sqe->statx_flags);

	if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags))

	req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL);
	if (IS_ERR(req->open.filename)) {
		ret = PTR_ERR(req->open.filename);
		req->open.filename = NULL;

static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
		    bool force_nonblock)
	struct io_open *ctx = &req->open;
	unsigned lookup_flags;

	if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags))

	/* filename_lookup() drops it, keep a reference */
	ctx->filename->refcnt++;

	ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
	ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags);
	if (retry_estale(ret, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;

	ret = cp_statx(&stat, ctx->buffer);
	putname(ctx->filename);
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
	io_put_req_find_next(req, nxt);
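
/*
 * Illustrative sketch (not kernel code): a userspace SQE for IORING_OP_STATX
 * as io_statx_prep() above reads it; "pathname" and "stxbuf" are placeholder
 * userspace pointers.
 *
 *	sqe->opcode = IORING_OP_STATX;
 *	sqe->fd = AT_FDCWD;				// dfd
 *	sqe->addr = (unsigned long) pathname;		// const char *
 *	sqe->len = STATX_BASIC_STATS;			// mask
 *	sqe->addr2 = (unsigned long) stxbuf;		// struct statx *
 *	sqe->statx_flags = AT_SYMLINK_NOFOLLOW;
 *
 * ioprio and buf_index must be zero.
 */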

static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	/*
	 * If we queue this for async, it must not be cancellable. That would
	 * leave the 'file' in an indeterminate state.
	 */
	req->work.flags |= IO_WQ_WORK_NO_CANCEL;

	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
	    sqe->rw_flags || sqe->buf_index)
	if (sqe->flags & IOSQE_FIXED_FILE)

	req->close.fd = READ_ONCE(sqe->fd);
	if (req->file->f_op == &io_uring_fops ||
	    req->close.fd == req->ring_fd)

static void io_close_finish(struct io_wq_work **workptr)
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	/* Invoked with files, we need to do the close */
	if (req->work.files) {
		ret = filp_close(req->close.put_file, req->work.files);
			req_set_fail_links(req);
		io_cqring_add_event(req, ret);
	fput(req->close.put_file);

	/* we bypassed the re-issue, drop the submission reference */
	io_put_req_find_next(req, &nxt);
	io_wq_assign_next(workptr, nxt);

static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
		    bool force_nonblock)
	req->close.put_file = NULL;
	ret = __close_fd_get_file(req->close.fd, &req->close.put_file);

	/* if the file has a flush method, be safe and punt to async */
	if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) {
		req->work.flags |= IO_WQ_WORK_NEEDS_FILES;

	/*
	 * No ->flush(), safely close from here and just punt the
	 * fput() to async context.
	 */
	ret = filp_close(req->close.put_file, current->files);
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);

	if (io_wq_current_is_worker()) {
		struct io_wq_work *old_work, *work;

		old_work = work = &req->work;
		io_close_finish(&work);
		if (work && work != old_work)
			*nxt = container_of(work, struct io_kiocb, work);

	req->work.func = io_close_finish;

static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_ring_ctx *ctx = req->ctx;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->len);
	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
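
/*
 * Illustrative sketch (not kernel code): a userspace SQE for
 * IORING_OP_SYNC_FILE_RANGE as consumed by io_prep_sfr() above; "fd",
 * "offset" and "nbytes" are placeholders.
 *
 *	sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
 *	sqe->fd = fd;
 *	sqe->off = offset;
 *	sqe->len = nbytes;
 *	sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
 *
 * addr, ioprio and buf_index must be zero, and the ring must not be set up
 * with IORING_SETUP_IOPOLL.
 */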

static void io_sync_file_range_finish(struct io_wq_work **workptr)
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	if (io_req_cancelled(req))
	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
	io_put_req_find_next(req, &nxt);
	io_wq_assign_next(workptr, nxt);

static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
			      bool force_nonblock)
	struct io_wq_work *work, *old_work;

	/* sync_file_range always requires a blocking context */
	if (force_nonblock) {
		req->work.func = io_sync_file_range_finish;

	work = old_work = &req->work;
	io_sync_file_range_finish(&work);
	if (work && work != old_work)
		*nxt = container_of(work, struct io_kiocb, work);

#if defined(CONFIG_NET)
static void io_sendrecv_async(struct io_wq_work **workptr)
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct iovec *iov = NULL;

	if (req->io->rw.iov != req->io->rw.fast_iov)
		iov = req->io->msg.iov;
	io_wq_submit_work(workptr);

static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_NET)
	struct io_sr_msg *sr = &req->sr_msg;
	struct io_async_ctx *io = req->io;

	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));

	io->msg.iov = io->msg.fast_iov;
	return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
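
/*
 * Illustrative sketch (not kernel code): a userspace SQE for
 * IORING_OP_SENDMSG as io_sendmsg_prep() above reads it; "sockfd", "iov" and
 * "msg" are placeholders.
 *
 *	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1 };
 *
 *	sqe->opcode = IORING_OP_SENDMSG;
 *	sqe->fd = sockfd;
 *	sqe->addr = (unsigned long) &msg;	// struct msghdr *
 *	sqe->msg_flags = MSG_NOSIGNAL;
 */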

static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
		      bool force_nonblock)
#if defined(CONFIG_NET)
	struct io_async_msghdr *kmsg = NULL;
	struct socket *sock;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))

	sock = sock_from_file(req->file, &ret);
		struct io_async_ctx io;
		struct sockaddr_storage addr;

			kmsg = &req->io->msg;
			kmsg->msg.msg_name = &addr;
			/* if iov is set, it's allocated already */
				kmsg->iov = kmsg->fast_iov;
			kmsg->msg.msg_iter.iov = kmsg->iov;
			struct io_sr_msg *sr = &req->sr_msg;

			kmsg->msg.msg_name = &addr;
			io.msg.iov = io.msg.fast_iov;
			ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
					sr->msg_flags, &io.msg.iov);

		flags = req->sr_msg.msg_flags;
		if (flags & MSG_DONTWAIT)
			req->flags |= REQ_F_NOWAIT;
		else if (force_nonblock)
			flags |= MSG_DONTWAIT;

		ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
		if (force_nonblock && ret == -EAGAIN) {
			if (io_alloc_async_ctx(req))
			memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
			req->work.func = io_sendrecv_async;
		if (ret == -ERESTARTSYS)

	if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
	io_cqring_add_event(req, ret);
		req_set_fail_links(req);
	io_put_req_find_next(req, nxt);

static int io_recvmsg_prep(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe)
#if defined(CONFIG_NET)
	struct io_sr_msg *sr = &req->sr_msg;
	struct io_async_ctx *io = req->io;

	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));

	io->msg.iov = io->msg.fast_iov;
	return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
				   &io->msg.uaddr, &io->msg.iov);

static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
		      bool force_nonblock)
#if defined(CONFIG_NET)
	struct io_async_msghdr *kmsg = NULL;
	struct socket *sock;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))

	sock = sock_from_file(req->file, &ret);
		struct io_async_ctx io;
		struct sockaddr_storage addr;

			kmsg = &req->io->msg;
			kmsg->msg.msg_name = &addr;
			/* if iov is set, it's allocated already */
				kmsg->iov = kmsg->fast_iov;
			kmsg->msg.msg_iter.iov = kmsg->iov;
			struct io_sr_msg *sr = &req->sr_msg;

			kmsg->msg.msg_name = &addr;
			io.msg.iov = io.msg.fast_iov;
			ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
					sr->msg_flags, &io.msg.uaddr,

		flags = req->sr_msg.msg_flags;
		if (flags & MSG_DONTWAIT)
			req->flags |= REQ_F_NOWAIT;
		else if (force_nonblock)
			flags |= MSG_DONTWAIT;

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
					 kmsg->uaddr, flags);
		if (force_nonblock && ret == -EAGAIN) {
			if (io_alloc_async_ctx(req))
			memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
			req->work.func = io_sendrecv_async;
		if (ret == -ERESTARTSYS)

	if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
	io_cqring_add_event(req, ret);
		req_set_fail_links(req);
	io_put_req_find_next(req, nxt);

static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_NET)
	struct io_accept *accept = &req->accept;

	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
	if (sqe->ioprio || sqe->len || sqe->buf_index)

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
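
/*
 * Illustrative sketch (not kernel code): a userspace SQE for IORING_OP_ACCEPT
 * matching io_accept_prep() above; "listen_fd", "peer" and "peer_len" are
 * placeholders.
 *
 *	sqe->opcode = IORING_OP_ACCEPT;
 *	sqe->fd = listen_fd;
 *	sqe->addr = (unsigned long) &peer;	// struct sockaddr *
 *	sqe->addr2 = (unsigned long) &peer_len;	// socklen_t *
 *	sqe->accept_flags = SOCK_CLOEXEC;
 *
 * ioprio, len and buf_index must be zero, and ACCEPT is refused on rings set
 * up with IORING_SETUP_IOPOLL or IORING_SETUP_SQPOLL.
 */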

#if defined(CONFIG_NET)
static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
		       bool force_nonblock)
	struct io_accept *accept = &req->accept;
	unsigned file_flags;

	file_flags = force_nonblock ? O_NONBLOCK : 0;
	ret = __sys_accept4_file(req->file, file_flags, accept->addr,
				 accept->addr_len, accept->flags);
	if (ret == -EAGAIN && force_nonblock)
	if (ret == -ERESTARTSYS)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
	io_put_req_find_next(req, nxt);

static void io_accept_finish(struct io_wq_work **workptr)
	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	if (io_req_cancelled(req))
	__io_accept(req, &nxt, false);
	io_wq_assign_next(workptr, nxt);

static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
		     bool force_nonblock)
#if defined(CONFIG_NET)
	ret = __io_accept(req, nxt, force_nonblock);
	if (ret == -EAGAIN && force_nonblock) {
		req->work.func = io_accept_finish;
		req->work.flags |= IO_WQ_WORK_NEEDS_FILES;

static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_NET)
	struct io_connect *conn = &req->connect;
	struct io_async_ctx *io = req->io;

	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);

	return move_addr_to_kernel(conn->addr, conn->addr_len,
				   &io->connect.address);
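
/*
 * Illustrative sketch (not kernel code): a userspace SQE for
 * IORING_OP_CONNECT as io_connect_prep() above expects; "sockfd" and "dest"
 * are placeholders.
 *
 *	sqe->opcode = IORING_OP_CONNECT;
 *	sqe->fd = sockfd;
 *	sqe->addr = (unsigned long) &dest;	// struct sockaddr *
 *	sqe->addr2 = sizeof(dest);		// addrlen, passed by value
 *
 * Note that addr2 carries the address length itself here, not a pointer to
 * it as in ACCEPT.
 */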

static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
		      bool force_nonblock)
#if defined(CONFIG_NET)
	struct io_async_ctx __io, *io;
	unsigned file_flags;

	ret = move_addr_to_kernel(req->connect.addr,
				  req->connect.addr_len,
				  &__io.connect.address);

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->connect.address,
				 req->connect.addr_len, file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
		if (io_alloc_async_ctx(req)) {
		memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
	if (ret == -ERESTARTSYS)
		req_set_fail_links(req);
	io_cqring_add_event(req, ret);
	io_put_req_find_next(req, nxt);
3081 static void io_poll_remove_one(struct io_kiocb
*req
)
3083 struct io_poll_iocb
*poll
= &req
->poll
;
3085 spin_lock(&poll
->head
->lock
);
3086 WRITE_ONCE(poll
->canceled
, true);
3087 if (!list_empty(&poll
->wait
.entry
)) {
3088 list_del_init(&poll
->wait
.entry
);
3089 io_queue_async_work(req
);
3091 spin_unlock(&poll
->head
->lock
);
3092 hash_del(&req
->hash_node
);
3095 static void io_poll_remove_all(struct io_ring_ctx
*ctx
)
3097 struct hlist_node
*tmp
;
3098 struct io_kiocb
*req
;
3101 spin_lock_irq(&ctx
->completion_lock
);
3102 for (i
= 0; i
< (1U << ctx
->cancel_hash_bits
); i
++) {
3103 struct hlist_head
*list
;
3105 list
= &ctx
->cancel_hash
[i
];
3106 hlist_for_each_entry_safe(req
, tmp
, list
, hash_node
)
3107 io_poll_remove_one(req
);
3109 spin_unlock_irq(&ctx
->completion_lock
);
3112 static int io_poll_cancel(struct io_ring_ctx
*ctx
, __u64 sqe_addr
)
3114 struct hlist_head
*list
;
3115 struct io_kiocb
*req
;
3117 list
= &ctx
->cancel_hash
[hash_long(sqe_addr
, ctx
->cancel_hash_bits
)];
3118 hlist_for_each_entry(req
, list
, hash_node
) {
3119 if (sqe_addr
== req
->user_data
) {
3120 io_poll_remove_one(req
);
3128 static int io_poll_remove_prep(struct io_kiocb
*req
,
3129 const struct io_uring_sqe
*sqe
)
3131 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
3133 if (sqe
->ioprio
|| sqe
->off
|| sqe
->len
|| sqe
->buf_index
||
3137 req
->poll
.addr
= READ_ONCE(sqe
->addr
);
3142 * Find a running poll command that matches one specified in sqe->addr,
3143 * and remove it if found.
3145 static int io_poll_remove(struct io_kiocb
*req
)
3147 struct io_ring_ctx
*ctx
= req
->ctx
;
3151 addr
= req
->poll
.addr
;
3152 spin_lock_irq(&ctx
->completion_lock
);
3153 ret
= io_poll_cancel(ctx
, addr
);
3154 spin_unlock_irq(&ctx
->completion_lock
);
3156 io_cqring_add_event(req
, ret
);
3158 req_set_fail_links(req
);
3163 static void io_poll_complete(struct io_kiocb
*req
, __poll_t mask
, int error
)
3165 struct io_ring_ctx
*ctx
= req
->ctx
;
3167 req
->poll
.done
= true;
3169 io_cqring_fill_event(req
, error
);
3171 io_cqring_fill_event(req
, mangle_poll(mask
));
3172 io_commit_cqring(ctx
);
3175 static void io_poll_complete_work(struct io_wq_work
**workptr
)
3177 struct io_wq_work
*work
= *workptr
;
3178 struct io_kiocb
*req
= container_of(work
, struct io_kiocb
, work
);
3179 struct io_poll_iocb
*poll
= &req
->poll
;
3180 struct poll_table_struct pt
= { ._key
= poll
->events
};
3181 struct io_ring_ctx
*ctx
= req
->ctx
;
3182 struct io_kiocb
*nxt
= NULL
;
3186 if (work
->flags
& IO_WQ_WORK_CANCEL
) {
3187 WRITE_ONCE(poll
->canceled
, true);
3189 } else if (READ_ONCE(poll
->canceled
)) {
3193 if (ret
!= -ECANCELED
)
3194 mask
= vfs_poll(poll
->file
, &pt
) & poll
->events
;
3197 * Note that ->ki_cancel callers also delete iocb from active_reqs after
3198 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
3199 * synchronize with them. In the cancellation case the list_del_init
3200 * itself is not actually needed, but harmless so we keep it in to
3201 * avoid further branches in the fast path.
3203 spin_lock_irq(&ctx
->completion_lock
);
3204 if (!mask
&& ret
!= -ECANCELED
) {
3205 add_wait_queue(poll
->head
, &poll
->wait
);
3206 spin_unlock_irq(&ctx
->completion_lock
);
3209 hash_del(&req
->hash_node
);
3210 io_poll_complete(req
, mask
, ret
);
3211 spin_unlock_irq(&ctx
->completion_lock
);
3213 io_cqring_ev_posted(ctx
);
3216 req_set_fail_links(req
);
3217 io_put_req_find_next(req
, &nxt
);
3219 io_wq_assign_next(workptr
, nxt
);
3222 static void __io_poll_flush(struct io_ring_ctx
*ctx
, struct llist_node
*nodes
)
3224 void *reqs
[IO_IOPOLL_BATCH
];
3225 struct io_kiocb
*req
, *tmp
;
3228 spin_lock_irq(&ctx
->completion_lock
);
3229 llist_for_each_entry_safe(req
, tmp
, nodes
, llist_node
) {
3230 hash_del(&req
->hash_node
);
3231 io_poll_complete(req
, req
->result
, 0);
3233 if (refcount_dec_and_test(&req
->refs
)) {
3234 if (io_req_multi_free(req
)) {
3235 reqs
[to_free
++] = req
;
3236 if (to_free
== ARRAY_SIZE(reqs
))
3237 io_free_req_many(ctx
, reqs
, &to_free
);
3239 req
->flags
|= REQ_F_COMP_LOCKED
;
3244 spin_unlock_irq(&ctx
->completion_lock
);
3246 io_cqring_ev_posted(ctx
);
3247 io_free_req_many(ctx
, reqs
, &to_free
);
3250 static void io_poll_flush(struct io_wq_work
**workptr
)
3252 struct io_kiocb
*req
= container_of(*workptr
, struct io_kiocb
, work
);
3253 struct llist_node
*nodes
;
3255 nodes
= llist_del_all(&req
->ctx
->poll_llist
);
3257 __io_poll_flush(req
->ctx
, nodes
);
3260 static int io_poll_wake(struct wait_queue_entry
*wait
, unsigned mode
, int sync
,
3263 struct io_poll_iocb
*poll
= wait
->private;
3264 struct io_kiocb
*req
= container_of(poll
, struct io_kiocb
, poll
);
3265 struct io_ring_ctx
*ctx
= req
->ctx
;
3266 __poll_t mask
= key_to_poll(key
);
3268 /* for instances that support it check for an event match first: */
3269 if (mask
&& !(mask
& poll
->events
))
3272 list_del_init(&poll
->wait
.entry
);
3275 * Run completion inline if we can. We're using trylock here because
3276 * we are violating the completion_lock -> poll wq lock ordering.
3277 * If we have a link timeout we're going to need the completion_lock
3278 * for finalizing the request, mark us as having grabbed that already.
3281 unsigned long flags
;
3283 if (llist_empty(&ctx
->poll_llist
) &&
3284 spin_trylock_irqsave(&ctx
->completion_lock
, flags
)) {
3285 hash_del(&req
->hash_node
);
3286 io_poll_complete(req
, mask
, 0);
3287 req
->flags
|= REQ_F_COMP_LOCKED
;
3289 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
3291 io_cqring_ev_posted(ctx
);
3295 req
->llist_node
.next
= NULL
;
3296 /* if the list wasn't empty, we're done */
3297 if (!llist_add(&req
->llist_node
, &ctx
->poll_llist
))
3300 req
->work
.func
= io_poll_flush
;
3304 io_queue_async_work(req
);
3309 struct io_poll_table
{
3310 struct poll_table_struct pt
;
3311 struct io_kiocb
*req
;
3315 static void io_poll_queue_proc(struct file
*file
, struct wait_queue_head
*head
,
3316 struct poll_table_struct
*p
)
3318 struct io_poll_table
*pt
= container_of(p
, struct io_poll_table
, pt
);
3320 if (unlikely(pt
->req
->poll
.head
)) {
3321 pt
->error
= -EINVAL
;
3326 pt
->req
->poll
.head
= head
;
3327 add_wait_queue(head
, &pt
->req
->poll
.wait
);
3330 static void io_poll_req_insert(struct io_kiocb
*req
)
3332 struct io_ring_ctx
*ctx
= req
->ctx
;
3333 struct hlist_head
*list
;
3335 list
= &ctx
->cancel_hash
[hash_long(req
->user_data
, ctx
->cancel_hash_bits
)];
3336 hlist_add_head(&req
->hash_node
, list
);

static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	struct io_poll_iocb *poll = &req->poll;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)

	events = READ_ONCE(sqe->poll_events);
	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
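
/*
 * Illustrative sketch (not kernel code): a userspace SQE for
 * IORING_OP_POLL_ADD as io_poll_add_prep() above reads it; "fd" is a
 * placeholder.
 *
 *	sqe->opcode = IORING_OP_POLL_ADD;
 *	sqe->fd = fd;
 *	sqe->poll_events = POLLIN;
 *
 * addr, ioprio, off, len and buf_index must be zero; the completion carries
 * the returned event mask in cqe->res.
 */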
3356 static int io_poll_add(struct io_kiocb
*req
, struct io_kiocb
**nxt
)
3358 struct io_poll_iocb
*poll
= &req
->poll
;
3359 struct io_ring_ctx
*ctx
= req
->ctx
;
3360 struct io_poll_table ipt
;
3361 bool cancel
= false;
3364 INIT_IO_WORK(&req
->work
, io_poll_complete_work
);
3365 INIT_HLIST_NODE(&req
->hash_node
);
3369 poll
->canceled
= false;
3371 ipt
.pt
._qproc
= io_poll_queue_proc
;
3372 ipt
.pt
._key
= poll
->events
;
3374 ipt
.error
= -EINVAL
; /* same as no support for IOCB_CMD_POLL */
3376 /* initialized the list so that we can do list_empty checks */
3377 INIT_LIST_HEAD(&poll
->wait
.entry
);
3378 init_waitqueue_func_entry(&poll
->wait
, io_poll_wake
);
3379 poll
->wait
.private = poll
;
3381 INIT_LIST_HEAD(&req
->list
);
3383 mask
= vfs_poll(poll
->file
, &ipt
.pt
) & poll
->events
;
3385 spin_lock_irq(&ctx
->completion_lock
);
3386 if (likely(poll
->head
)) {
3387 spin_lock(&poll
->head
->lock
);
3388 if (unlikely(list_empty(&poll
->wait
.entry
))) {
3394 if (mask
|| ipt
.error
)
3395 list_del_init(&poll
->wait
.entry
);
3397 WRITE_ONCE(poll
->canceled
, true);
3398 else if (!poll
->done
) /* actually waiting for an event */
3399 io_poll_req_insert(req
);
3400 spin_unlock(&poll
->head
->lock
);
3402 if (mask
) { /* no async, we'd stolen it */
3404 io_poll_complete(req
, mask
, 0);
3406 spin_unlock_irq(&ctx
->completion_lock
);
3409 io_cqring_ev_posted(ctx
);
3410 io_put_req_find_next(req
, nxt
);
3415 static enum hrtimer_restart
io_timeout_fn(struct hrtimer
*timer
)
3417 struct io_timeout_data
*data
= container_of(timer
,
3418 struct io_timeout_data
, timer
);
3419 struct io_kiocb
*req
= data
->req
;
3420 struct io_ring_ctx
*ctx
= req
->ctx
;
3421 unsigned long flags
;
3423 atomic_inc(&ctx
->cq_timeouts
);
3425 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
3427 * We could be racing with timeout deletion. If the list is empty,
3428 * then timeout lookup already found it and will be handling it.
3430 if (!list_empty(&req
->list
)) {
3431 struct io_kiocb
*prev
;
3434 * Adjust the reqs sequence before the current one because it
3435 * will consume a slot in the cq_ring and the cq_tail
3436 * pointer will be increased, otherwise other timeout reqs may
3437 * return in advance without waiting for enough wait_nr.
3440 list_for_each_entry_continue_reverse(prev
, &ctx
->timeout_list
, list
)
3442 list_del_init(&req
->list
);
3445 io_cqring_fill_event(req
, -ETIME
);
3446 io_commit_cqring(ctx
);
3447 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
3449 io_cqring_ev_posted(ctx
);
3450 req_set_fail_links(req
);
3452 return HRTIMER_NORESTART
;
3455 static int io_timeout_cancel(struct io_ring_ctx
*ctx
, __u64 user_data
)
3457 struct io_kiocb
*req
;
3460 list_for_each_entry(req
, &ctx
->timeout_list
, list
) {
3461 if (user_data
== req
->user_data
) {
3462 list_del_init(&req
->list
);
3471 ret
= hrtimer_try_to_cancel(&req
->io
->timeout
.timer
);
3475 req_set_fail_links(req
);
3476 io_cqring_fill_event(req
, -ECANCELED
);
3481 static int io_timeout_remove_prep(struct io_kiocb
*req
,
3482 const struct io_uring_sqe
*sqe
)
3484 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
3486 if (sqe
->flags
|| sqe
->ioprio
|| sqe
->buf_index
|| sqe
->len
)
3489 req
->timeout
.addr
= READ_ONCE(sqe
->addr
);
3490 req
->timeout
.flags
= READ_ONCE(sqe
->timeout_flags
);
3491 if (req
->timeout
.flags
)
3498 * Remove or update an existing timeout command
3500 static int io_timeout_remove(struct io_kiocb
*req
)
3502 struct io_ring_ctx
*ctx
= req
->ctx
;
3505 spin_lock_irq(&ctx
->completion_lock
);
3506 ret
= io_timeout_cancel(ctx
, req
->timeout
.addr
);
3508 io_cqring_fill_event(req
, ret
);
3509 io_commit_cqring(ctx
);
3510 spin_unlock_irq(&ctx
->completion_lock
);
3511 io_cqring_ev_posted(ctx
);
3513 req_set_fail_links(req
);

static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			   bool is_timeout_link)
	struct io_timeout_data *data;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
	if (sqe->off && is_timeout_link)

	flags = READ_ONCE(sqe->timeout_flags);
	if (flags & ~IORING_TIMEOUT_ABS)

	req->timeout.count = READ_ONCE(sqe->off);

	if (!req->io && io_alloc_async_ctx(req))

	data = &req->io->timeout;
	req->flags |= REQ_F_TIMEOUT;

	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))

	if (flags & IORING_TIMEOUT_ABS)
		data->mode = HRTIMER_MODE_ABS;
		data->mode = HRTIMER_MODE_REL;

	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
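
/*
 * Illustrative sketch (not kernel code): a userspace SQE for
 * IORING_OP_TIMEOUT as io_timeout_prep() above expects it.
 *
 *	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	sqe->opcode = IORING_OP_TIMEOUT;
 *	sqe->addr = (unsigned long) &ts;
 *	sqe->len = 1;			// exactly one timespec
 *	sqe->off = 0;			// pure timeout; > 0 waits for that many completions
 *	sqe->timeout_flags = 0;		// or IORING_TIMEOUT_ABS for an absolute time
 *
 * The timeout completes with -ETIME if it fires before sqe->off completions
 * have been posted.
 */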
3555 static int io_timeout(struct io_kiocb
*req
)
3558 struct io_ring_ctx
*ctx
= req
->ctx
;
3559 struct io_timeout_data
*data
;
3560 struct list_head
*entry
;
3563 data
= &req
->io
->timeout
;
3566 * sqe->off holds how many events that need to occur for this
3567 * timeout event to be satisfied. If it isn't set, then this is
3568 * a pure timeout request, sequence isn't used.
3570 count
= req
->timeout
.count
;
3572 req
->flags
|= REQ_F_TIMEOUT_NOSEQ
;
3573 spin_lock_irq(&ctx
->completion_lock
);
3574 entry
= ctx
->timeout_list
.prev
;
3578 req
->sequence
= ctx
->cached_sq_head
+ count
- 1;
3579 data
->seq_offset
= count
;
3582 * Insertion sort, ensuring the first entry in the list is always
3583 * the one we need first.
3585 spin_lock_irq(&ctx
->completion_lock
);
3586 list_for_each_prev(entry
, &ctx
->timeout_list
) {
3587 struct io_kiocb
*nxt
= list_entry(entry
, struct io_kiocb
, list
);
3588 unsigned nxt_sq_head
;
3589 long long tmp
, tmp_nxt
;
3590 u32 nxt_offset
= nxt
->io
->timeout
.seq_offset
;
3592 if (nxt
->flags
& REQ_F_TIMEOUT_NOSEQ
)
3596 * Since cached_sq_head + count - 1 can overflow, use type long
3599 tmp
= (long long)ctx
->cached_sq_head
+ count
- 1;
3600 nxt_sq_head
= nxt
->sequence
- nxt_offset
+ 1;
3601 tmp_nxt
= (long long)nxt_sq_head
+ nxt_offset
- 1;
	 * cached_sq_head may overflow, and it will never overflow twice
	 * while there is still a valid timeout req pending.
3607 if (ctx
->cached_sq_head
< nxt_sq_head
)
3614 * Sequence of reqs after the insert one and itself should
3615 * be adjusted because each timeout req consumes a slot.
3620 req
->sequence
-= span
;
3622 list_add(&req
->list
, entry
);
3623 data
->timer
.function
= io_timeout_fn
;
3624 hrtimer_start(&data
->timer
, timespec64_to_ktime(data
->ts
), data
->mode
);
3625 spin_unlock_irq(&ctx
->completion_lock
);
3629 static bool io_cancel_cb(struct io_wq_work
*work
, void *data
)
3631 struct io_kiocb
*req
= container_of(work
, struct io_kiocb
, work
);
3633 return req
->user_data
== (unsigned long) data
;
3636 static int io_async_cancel_one(struct io_ring_ctx
*ctx
, void *sqe_addr
)
3638 enum io_wq_cancel cancel_ret
;
3641 cancel_ret
= io_wq_cancel_cb(ctx
->io_wq
, io_cancel_cb
, sqe_addr
);
3642 switch (cancel_ret
) {
3643 case IO_WQ_CANCEL_OK
:
3646 case IO_WQ_CANCEL_RUNNING
:
3649 case IO_WQ_CANCEL_NOTFOUND
:
3657 static void io_async_find_and_cancel(struct io_ring_ctx
*ctx
,
3658 struct io_kiocb
*req
, __u64 sqe_addr
,
3659 struct io_kiocb
**nxt
, int success_ret
)
3661 unsigned long flags
;
3664 ret
= io_async_cancel_one(ctx
, (void *) (unsigned long) sqe_addr
);
3665 if (ret
!= -ENOENT
) {
3666 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
3670 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
3671 ret
= io_timeout_cancel(ctx
, sqe_addr
);
3674 ret
= io_poll_cancel(ctx
, sqe_addr
);
3678 io_cqring_fill_event(req
, ret
);
3679 io_commit_cqring(ctx
);
3680 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
3681 io_cqring_ev_posted(ctx
);
3684 req_set_fail_links(req
);
3685 io_put_req_find_next(req
, nxt
);
3688 static int io_async_cancel_prep(struct io_kiocb
*req
,
3689 const struct io_uring_sqe
*sqe
)
3691 if (unlikely(req
->ctx
->flags
& IORING_SETUP_IOPOLL
))
3693 if (sqe
->flags
|| sqe
->ioprio
|| sqe
->off
|| sqe
->len
||
3697 req
->cancel
.addr
= READ_ONCE(sqe
->addr
);
3701 static int io_async_cancel(struct io_kiocb
*req
, struct io_kiocb
**nxt
)
3703 struct io_ring_ctx
*ctx
= req
->ctx
;
3705 io_async_find_and_cancel(ctx
, req
, req
->cancel
.addr
, nxt
, 0);
3709 static int io_files_update_prep(struct io_kiocb
*req
,
3710 const struct io_uring_sqe
*sqe
)
3712 if (sqe
->flags
|| sqe
->ioprio
|| sqe
->rw_flags
)
3715 req
->files_update
.offset
= READ_ONCE(sqe
->off
);
3716 req
->files_update
.nr_args
= READ_ONCE(sqe
->len
);
3717 if (!req
->files_update
.nr_args
)
3719 req
->files_update
.arg
= READ_ONCE(sqe
->addr
);
3723 static int io_files_update(struct io_kiocb
*req
, bool force_nonblock
)
3725 struct io_ring_ctx
*ctx
= req
->ctx
;
3726 struct io_uring_files_update up
;
3729 if (force_nonblock
) {
3730 req
->work
.flags
|= IO_WQ_WORK_NEEDS_FILES
;
3734 up
.offset
= req
->files_update
.offset
;
3735 up
.fds
= req
->files_update
.arg
;
3737 mutex_lock(&ctx
->uring_lock
);
3738 ret
= __io_sqe_files_update(ctx
, &up
, req
->files_update
.nr_args
);
3739 mutex_unlock(&ctx
->uring_lock
);
3742 req_set_fail_links(req
);
3743 io_cqring_add_event(req
, ret
);
3748 static int io_req_defer_prep(struct io_kiocb
*req
,
3749 const struct io_uring_sqe
*sqe
)
3753 switch (req
->opcode
) {
3756 case IORING_OP_READV
:
3757 case IORING_OP_READ_FIXED
:
3758 case IORING_OP_READ
:
3759 ret
= io_read_prep(req
, sqe
, true);
3761 case IORING_OP_WRITEV
:
3762 case IORING_OP_WRITE_FIXED
:
3763 case IORING_OP_WRITE
:
3764 ret
= io_write_prep(req
, sqe
, true);
3766 case IORING_OP_POLL_ADD
:
3767 ret
= io_poll_add_prep(req
, sqe
);
3769 case IORING_OP_POLL_REMOVE
:
3770 ret
= io_poll_remove_prep(req
, sqe
);
3772 case IORING_OP_FSYNC
:
3773 ret
= io_prep_fsync(req
, sqe
);
3775 case IORING_OP_SYNC_FILE_RANGE
:
3776 ret
= io_prep_sfr(req
, sqe
);
3778 case IORING_OP_SENDMSG
:
3779 ret
= io_sendmsg_prep(req
, sqe
);
3781 case IORING_OP_RECVMSG
:
3782 ret
= io_recvmsg_prep(req
, sqe
);
3784 case IORING_OP_CONNECT
:
3785 ret
= io_connect_prep(req
, sqe
);
3787 case IORING_OP_TIMEOUT
:
3788 ret
= io_timeout_prep(req
, sqe
, false);
3790 case IORING_OP_TIMEOUT_REMOVE
:
3791 ret
= io_timeout_remove_prep(req
, sqe
);
3793 case IORING_OP_ASYNC_CANCEL
:
3794 ret
= io_async_cancel_prep(req
, sqe
);
3796 case IORING_OP_LINK_TIMEOUT
:
3797 ret
= io_timeout_prep(req
, sqe
, true);
3799 case IORING_OP_ACCEPT
:
3800 ret
= io_accept_prep(req
, sqe
);
3802 case IORING_OP_FALLOCATE
:
3803 ret
= io_fallocate_prep(req
, sqe
);
3805 case IORING_OP_OPENAT
:
3806 ret
= io_openat_prep(req
, sqe
);
3808 case IORING_OP_CLOSE
:
3809 ret
= io_close_prep(req
, sqe
);
3811 case IORING_OP_FILES_UPDATE
:
3812 ret
= io_files_update_prep(req
, sqe
);
3814 case IORING_OP_STATX
:
3815 ret
= io_statx_prep(req
, sqe
);
3817 case IORING_OP_FADVISE
:
3818 ret
= io_fadvise_prep(req
, sqe
);
3820 case IORING_OP_MADVISE
:
3821 ret
= io_madvise_prep(req
, sqe
);
3824 printk_once(KERN_WARNING
"io_uring: unhandled opcode %d\n",
3833 static int io_req_defer(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
3835 struct io_ring_ctx
*ctx
= req
->ctx
;
3838 /* Still need defer if there is pending req in defer list. */
3839 if (!req_need_defer(req
) && list_empty(&ctx
->defer_list
))
3842 if (!req
->io
&& io_alloc_async_ctx(req
))
3845 ret
= io_req_defer_prep(req
, sqe
);
3849 spin_lock_irq(&ctx
->completion_lock
);
3850 if (!req_need_defer(req
) && list_empty(&ctx
->defer_list
)) {
3851 spin_unlock_irq(&ctx
->completion_lock
);
3855 trace_io_uring_defer(ctx
, req
, req
->user_data
);
3856 list_add_tail(&req
->list
, &ctx
->defer_list
);
3857 spin_unlock_irq(&ctx
->completion_lock
);
3858 return -EIOCBQUEUED
;
3861 static int io_issue_sqe(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
3862 struct io_kiocb
**nxt
, bool force_nonblock
)
3864 struct io_ring_ctx
*ctx
= req
->ctx
;
3867 switch (req
->opcode
) {
3871 case IORING_OP_READV
:
3872 case IORING_OP_READ_FIXED
:
3873 case IORING_OP_READ
:
3875 ret
= io_read_prep(req
, sqe
, force_nonblock
);
3879 ret
= io_read(req
, nxt
, force_nonblock
);
3881 case IORING_OP_WRITEV
:
3882 case IORING_OP_WRITE_FIXED
:
3883 case IORING_OP_WRITE
:
3885 ret
= io_write_prep(req
, sqe
, force_nonblock
);
3889 ret
= io_write(req
, nxt
, force_nonblock
);
3891 case IORING_OP_FSYNC
:
3893 ret
= io_prep_fsync(req
, sqe
);
3897 ret
= io_fsync(req
, nxt
, force_nonblock
);
3899 case IORING_OP_POLL_ADD
:
3901 ret
= io_poll_add_prep(req
, sqe
);
3905 ret
= io_poll_add(req
, nxt
);
3907 case IORING_OP_POLL_REMOVE
:
3909 ret
= io_poll_remove_prep(req
, sqe
);
3913 ret
= io_poll_remove(req
);
3915 case IORING_OP_SYNC_FILE_RANGE
:
3917 ret
= io_prep_sfr(req
, sqe
);
3921 ret
= io_sync_file_range(req
, nxt
, force_nonblock
);
3923 case IORING_OP_SENDMSG
:
3925 ret
= io_sendmsg_prep(req
, sqe
);
3929 ret
= io_sendmsg(req
, nxt
, force_nonblock
);
3931 case IORING_OP_RECVMSG
:
3933 ret
= io_recvmsg_prep(req
, sqe
);
3937 ret
= io_recvmsg(req
, nxt
, force_nonblock
);
3939 case IORING_OP_TIMEOUT
:
3941 ret
= io_timeout_prep(req
, sqe
, false);
3945 ret
= io_timeout(req
);
3947 case IORING_OP_TIMEOUT_REMOVE
:
3949 ret
= io_timeout_remove_prep(req
, sqe
);
3953 ret
= io_timeout_remove(req
);
3955 case IORING_OP_ACCEPT
:
3957 ret
= io_accept_prep(req
, sqe
);
3961 ret
= io_accept(req
, nxt
, force_nonblock
);
3963 case IORING_OP_CONNECT
:
3965 ret
= io_connect_prep(req
, sqe
);
3969 ret
= io_connect(req
, nxt
, force_nonblock
);
3971 case IORING_OP_ASYNC_CANCEL
:
3973 ret
= io_async_cancel_prep(req
, sqe
);
3977 ret
= io_async_cancel(req
, nxt
);
3979 case IORING_OP_FALLOCATE
:
3981 ret
= io_fallocate_prep(req
, sqe
);
3985 ret
= io_fallocate(req
, nxt
, force_nonblock
);
3987 case IORING_OP_OPENAT
:
3989 ret
= io_openat_prep(req
, sqe
);
3993 ret
= io_openat(req
, nxt
, force_nonblock
);
3995 case IORING_OP_CLOSE
:
3997 ret
= io_close_prep(req
, sqe
);
4001 ret
= io_close(req
, nxt
, force_nonblock
);
4003 case IORING_OP_FILES_UPDATE
:
4005 ret
= io_files_update_prep(req
, sqe
);
4009 ret
= io_files_update(req
, force_nonblock
);
4011 case IORING_OP_STATX
:
4013 ret
= io_statx_prep(req
, sqe
);
4017 ret
= io_statx(req
, nxt
, force_nonblock
);
4019 case IORING_OP_FADVISE
:
4021 ret
= io_fadvise_prep(req
, sqe
);
4025 ret
= io_fadvise(req
, nxt
, force_nonblock
);
4027 case IORING_OP_MADVISE
:
4029 ret
= io_madvise_prep(req
, sqe
);
4033 ret
= io_madvise(req
, nxt
, force_nonblock
);
4043 if (ctx
->flags
& IORING_SETUP_IOPOLL
) {
4044 const bool in_async
= io_wq_current_is_worker();
4046 if (req
->result
== -EAGAIN
)
4049 /* workqueue context doesn't hold uring_lock, grab it now */
4051 mutex_lock(&ctx
->uring_lock
);
4053 io_iopoll_req_issued(req
);
4056 mutex_unlock(&ctx
->uring_lock
);
4062 static void io_wq_submit_work(struct io_wq_work
**workptr
)
4064 struct io_wq_work
*work
= *workptr
;
4065 struct io_kiocb
*req
= container_of(work
, struct io_kiocb
, work
);
4066 struct io_kiocb
*nxt
= NULL
;
4069 /* if NO_CANCEL is set, we must still run the work */
4070 if ((work
->flags
& (IO_WQ_WORK_CANCEL
|IO_WQ_WORK_NO_CANCEL
)) ==
4071 IO_WQ_WORK_CANCEL
) {
4076 req
->has_user
= (work
->flags
& IO_WQ_WORK_HAS_MM
) != 0;
4077 req
->in_async
= true;
4079 ret
= io_issue_sqe(req
, NULL
, &nxt
, false);
4081 * We can get EAGAIN for polled IO even though we're
4082 * forcing a sync submission from here, since we can't
4083 * wait for request slots on the block side.
4091 /* drop submission reference */
4095 req_set_fail_links(req
);
4096 io_cqring_add_event(req
, ret
);
4100 /* if a dependent link is ready, pass it back */
4102 io_wq_assign_next(workptr
, nxt
);
4105 static int io_req_needs_file(struct io_kiocb
*req
, int fd
)
4107 if (!io_op_defs
[req
->opcode
].needs_file
)
4109 if (fd
== -1 && io_op_defs
[req
->opcode
].fd_non_neg
)

static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
	struct fixed_file_table *table;

	table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
	return table->files[index & IORING_FILE_TABLE_MASK];
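
/*
 * Example of the two-level fixed file table lookup above: with
 * IORING_FILE_TABLE_SHIFT of 9 (512 entries per table), registered file
 * index 1000 resolves to
 *
 *	table slot: 1000 >> IORING_FILE_TABLE_SHIFT == 1
 *	file slot:  1000 &  IORING_FILE_TABLE_MASK  == 488
 *
 * i.e. entry 488 of the second table.
 */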
4123 static int io_req_set_file(struct io_submit_state
*state
, struct io_kiocb
*req
,
4124 const struct io_uring_sqe
*sqe
)
4126 struct io_ring_ctx
*ctx
= req
->ctx
;
4130 flags
= READ_ONCE(sqe
->flags
);
4131 fd
= READ_ONCE(sqe
->fd
);
4133 if (flags
& IOSQE_IO_DRAIN
)
4134 req
->flags
|= REQ_F_IO_DRAIN
;
4136 if (!io_req_needs_file(req
, fd
))
4139 if (flags
& IOSQE_FIXED_FILE
) {
4140 if (unlikely(!ctx
->file_data
||
4141 (unsigned) fd
>= ctx
->nr_user_files
))
4143 fd
= array_index_nospec(fd
, ctx
->nr_user_files
);
4144 req
->file
= io_file_from_index(ctx
, fd
);
4147 req
->flags
|= REQ_F_FIXED_FILE
;
4148 percpu_ref_get(&ctx
->file_data
->refs
);
4150 if (req
->needs_fixed_file
)
4152 trace_io_uring_file_get(ctx
, fd
);
4153 req
->file
= io_file_get(state
, fd
);
4154 if (unlikely(!req
->file
))
4161 static int io_grab_files(struct io_kiocb
*req
)
4164 struct io_ring_ctx
*ctx
= req
->ctx
;
4166 if (!req
->ring_file
)
4170 spin_lock_irq(&ctx
->inflight_lock
);
4172 * We use the f_ops->flush() handler to ensure that we can flush
4173 * out work accessing these files if the fd is closed. Check if
4174 * the fd has changed since we started down this path, and disallow
4175 * this operation if it has.
4177 if (fcheck(req
->ring_fd
) == req
->ring_file
) {
4178 list_add(&req
->inflight_entry
, &ctx
->inflight_list
);
4179 req
->flags
|= REQ_F_INFLIGHT
;
4180 req
->work
.files
= current
->files
;
4183 spin_unlock_irq(&ctx
->inflight_lock
);
4189 static enum hrtimer_restart
io_link_timeout_fn(struct hrtimer
*timer
)
4191 struct io_timeout_data
*data
= container_of(timer
,
4192 struct io_timeout_data
, timer
);
4193 struct io_kiocb
*req
= data
->req
;
4194 struct io_ring_ctx
*ctx
= req
->ctx
;
4195 struct io_kiocb
*prev
= NULL
;
4196 unsigned long flags
;
4198 spin_lock_irqsave(&ctx
->completion_lock
, flags
);
4201 * We don't expect the list to be empty, that will only happen if we
4202 * race with the completion of the linked work.
4204 if (!list_empty(&req
->link_list
)) {
4205 prev
= list_entry(req
->link_list
.prev
, struct io_kiocb
,
4207 if (refcount_inc_not_zero(&prev
->refs
)) {
4208 list_del_init(&req
->link_list
);
4209 prev
->flags
&= ~REQ_F_LINK_TIMEOUT
;
4214 spin_unlock_irqrestore(&ctx
->completion_lock
, flags
);
4217 req_set_fail_links(prev
);
4218 io_async_find_and_cancel(ctx
, req
, prev
->user_data
, NULL
,
4222 io_cqring_add_event(req
, -ETIME
);
4225 return HRTIMER_NORESTART
;
4228 static void io_queue_linked_timeout(struct io_kiocb
*req
)
4230 struct io_ring_ctx
*ctx
= req
->ctx
;
4233 * If the list is now empty, then our linked request finished before
4234 * we got a chance to setup the timer
4236 spin_lock_irq(&ctx
->completion_lock
);
4237 if (!list_empty(&req
->link_list
)) {
4238 struct io_timeout_data
*data
= &req
->io
->timeout
;
4240 data
->timer
.function
= io_link_timeout_fn
;
4241 hrtimer_start(&data
->timer
, timespec64_to_ktime(data
->ts
),
4244 spin_unlock_irq(&ctx
->completion_lock
);
4246 /* drop submission reference */
4250 static struct io_kiocb
*io_prep_linked_timeout(struct io_kiocb
*req
)
4252 struct io_kiocb
*nxt
;
4254 if (!(req
->flags
& REQ_F_LINK
))
4257 nxt
= list_first_entry_or_null(&req
->link_list
, struct io_kiocb
,
4259 if (!nxt
|| nxt
->opcode
!= IORING_OP_LINK_TIMEOUT
)
4262 req
->flags
|= REQ_F_LINK_TIMEOUT
;
4266 static void __io_queue_sqe(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4268 struct io_kiocb
*linked_timeout
;
4269 struct io_kiocb
*nxt
= NULL
;
4273 linked_timeout
= io_prep_linked_timeout(req
);
4275 ret
= io_issue_sqe(req
, sqe
, &nxt
, true);
4278 * We async punt it if the file wasn't marked NOWAIT, or if the file
4279 * doesn't support non-blocking read/write attempts
4281 if (ret
== -EAGAIN
&& (!(req
->flags
& REQ_F_NOWAIT
) ||
4282 (req
->flags
& REQ_F_MUST_PUNT
))) {
4283 if (req
->work
.flags
& IO_WQ_WORK_NEEDS_FILES
) {
4284 ret
= io_grab_files(req
);
4290 * Queued up for async execution, worker will release
4291 * submit reference when the iocb is actually submitted.
4293 io_queue_async_work(req
);
4298 /* drop submission reference */
4301 if (linked_timeout
) {
4303 io_queue_linked_timeout(linked_timeout
);
4305 io_put_req(linked_timeout
);
4308 /* and drop final reference, if we failed */
4310 io_cqring_add_event(req
, ret
);
4311 req_set_fail_links(req
);
4322 static void io_queue_sqe(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
)
4326 if (unlikely(req
->ctx
->drain_next
)) {
4327 req
->flags
|= REQ_F_IO_DRAIN
;
4328 req
->ctx
->drain_next
= false;
4330 req
->ctx
->drain_next
= (req
->flags
& REQ_F_DRAIN_LINK
);
4332 ret
= io_req_defer(req
, sqe
);
4334 if (ret
!= -EIOCBQUEUED
) {
4335 io_cqring_add_event(req
, ret
);
4336 req_set_fail_links(req
);
4337 io_double_put_req(req
);
4339 } else if ((req
->flags
& REQ_F_FORCE_ASYNC
) &&
4340 !io_wq_current_is_worker()) {
4342 * Never try inline submit of IOSQE_ASYNC is set, go straight
4343 * to async execution.
4345 req
->work
.flags
|= IO_WQ_WORK_CONCURRENT
;
4346 io_queue_async_work(req
);
4348 __io_queue_sqe(req
, sqe
);
4352 static inline void io_queue_link_head(struct io_kiocb
*req
)
4354 if (unlikely(req
->flags
& REQ_F_FAIL_LINK
)) {
4355 io_cqring_add_event(req
, -ECANCELED
);
4356 io_double_put_req(req
);
4358 io_queue_sqe(req
, NULL
);
4361 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
4362 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
4364 static bool io_submit_sqe(struct io_kiocb
*req
, const struct io_uring_sqe
*sqe
,
4365 struct io_submit_state
*state
, struct io_kiocb
**link
)
4367 struct io_ring_ctx
*ctx
= req
->ctx
;
4368 unsigned int sqe_flags
;
4371 sqe_flags
= READ_ONCE(sqe
->flags
);
4373 /* enforce forwards compatibility on users */
4374 if (unlikely(sqe_flags
& ~SQE_VALID_FLAGS
)) {
4378 if (sqe_flags
& IOSQE_ASYNC
)
4379 req
->flags
|= REQ_F_FORCE_ASYNC
;
4381 ret
= io_req_set_file(state
, req
, sqe
);
4382 if (unlikely(ret
)) {
4384 io_cqring_add_event(req
, ret
);
4385 io_double_put_req(req
);
4390 * If we already have a head request, queue this one for async
4391 * submittal once the head completes. If we don't have a head but
4392 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
4393 * submitted sync once the chain is complete. If none of those
4394 * conditions are true (normal request), then just queue it.
4397 struct io_kiocb
*head
= *link
;
4399 if (sqe_flags
& IOSQE_IO_DRAIN
)
4400 head
->flags
|= REQ_F_DRAIN_LINK
| REQ_F_IO_DRAIN
;
4402 if (sqe_flags
& IOSQE_IO_HARDLINK
)
4403 req
->flags
|= REQ_F_HARDLINK
;
4405 if (io_alloc_async_ctx(req
)) {
4410 ret
= io_req_defer_prep(req
, sqe
);
4412 /* fail even hard links since we don't submit */
4413 head
->flags
|= REQ_F_FAIL_LINK
;
4416 trace_io_uring_link(ctx
, req
, head
);
4417 list_add_tail(&req
->link_list
, &head
->link_list
);
4419 /* last request of a link, enqueue the link */
4420 if (!(sqe_flags
& (IOSQE_IO_LINK
|IOSQE_IO_HARDLINK
))) {
4421 io_queue_link_head(head
);
4424 } else if (sqe_flags
& (IOSQE_IO_LINK
|IOSQE_IO_HARDLINK
)) {
4425 req
->flags
|= REQ_F_LINK
;
4426 if (sqe_flags
& IOSQE_IO_HARDLINK
)
4427 req
->flags
|= REQ_F_HARDLINK
;
4429 INIT_LIST_HEAD(&req
->link_list
);
4430 ret
= io_req_defer_prep(req
, sqe
);
4432 req
->flags
|= REQ_F_FAIL_LINK
;
4435 io_queue_sqe(req
, sqe
);
4442 * Batched submission is done, ensure local IO is flushed out.
4444 static void io_submit_state_end(struct io_submit_state
*state
)
4446 blk_finish_plug(&state
->plug
);
4448 if (state
->free_reqs
)
4449 kmem_cache_free_bulk(req_cachep
, state
->free_reqs
,
4450 &state
->reqs
[state
->cur_req
]);
4454 * Start submission side cache.
4456 static void io_submit_state_start(struct io_submit_state
*state
,
4457 unsigned int max_ios
)
4459 blk_start_plug(&state
->plug
);
4460 state
->free_reqs
= 0;
4462 state
->ios_left
= max_ios
;
4465 static void io_commit_sqring(struct io_ring_ctx
*ctx
)
4467 struct io_rings
*rings
= ctx
->rings
;
4469 if (ctx
->cached_sq_head
!= READ_ONCE(rings
->sq
.head
)) {
4471 * Ensure any loads from the SQEs are done at this point,
4472 * since once we write the new head, the application could
4473 * write new data to them.
4475 smp_store_release(&rings
->sq
.head
, ctx
->cached_sq_head
);
4480 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
4481 * that is mapped by userspace. This means that care needs to be taken to
4482 * ensure that reads are stable, as we cannot rely on userspace always
4483 * being a good citizen. If members of the sqe are validated and then later
4484 * used, it's important that those reads are done through READ_ONCE() to
4485 * prevent a re-load down the line.
4487 static bool io_get_sqring(struct io_ring_ctx
*ctx
, struct io_kiocb
*req
,
4488 const struct io_uring_sqe
**sqe_ptr
)
4490 struct io_rings
*rings
= ctx
->rings
;
4491 u32
*sq_array
= ctx
->sq_array
;
4495 * The cached sq head (or cq tail) serves two purposes:
4497 * 1) allows us to batch the cost of updating the user visible
4499 * 2) allows the kernel side to track the head on its own, even
4500 * though the application is the one updating it.
4502 head
= ctx
->cached_sq_head
;
4503 /* make sure SQ entry isn't read before tail */
4504 if (unlikely(head
== smp_load_acquire(&rings
->sq
.tail
)))
4507 head
= READ_ONCE(sq_array
[head
& ctx
->sq_mask
]);
4508 if (likely(head
< ctx
->sq_entries
)) {
	 * All io need to record the previous position, if LINK vs DRAIN,
	 * it can be used to mark the position of the first IO in the
4514 req
->sequence
= ctx
->cached_sq_head
;
4515 *sqe_ptr
= &ctx
->sq_sqes
[head
];
4516 req
->opcode
= READ_ONCE((*sqe_ptr
)->opcode
);
4517 req
->user_data
= READ_ONCE((*sqe_ptr
)->user_data
);
4518 ctx
->cached_sq_head
++;
4522 /* drop invalid entries */
4523 ctx
->cached_sq_head
++;
4524 ctx
->cached_sq_dropped
++;
4525 WRITE_ONCE(rings
->sq_dropped
, ctx
->cached_sq_dropped
);
4529 static int io_submit_sqes(struct io_ring_ctx
*ctx
, unsigned int nr
,
4530 struct file
*ring_file
, int ring_fd
,
4531 struct mm_struct
**mm
, bool async
)
4533 struct io_submit_state state
, *statep
= NULL
;
4534 struct io_kiocb
*link
= NULL
;
4535 int i
, submitted
= 0;
4536 bool mm_fault
= false;
4538 /* if we have a backlog and couldn't flush it all, return BUSY */
4539 if (test_bit(0, &ctx
->sq_check_overflow
)) {
4540 if (!list_empty(&ctx
->cq_overflow_list
) &&
4541 !io_cqring_overflow_flush(ctx
, false))
4545 if (!percpu_ref_tryget_many(&ctx
->refs
, nr
))
4548 if (nr
> IO_PLUG_THRESHOLD
) {
4549 io_submit_state_start(&state
, nr
);
4553 for (i
= 0; i
< nr
; i
++) {
4554 const struct io_uring_sqe
*sqe
;
4555 struct io_kiocb
*req
;
4557 req
= io_get_req(ctx
, statep
);
4558 if (unlikely(!req
)) {
4560 submitted
= -EAGAIN
;
4563 if (!io_get_sqring(ctx
, req
, &sqe
)) {
4564 __io_req_do_free(req
);
4568 /* will complete beyond this point, count as submitted */
4571 if (unlikely(req
->opcode
>= IORING_OP_LAST
)) {
4572 io_cqring_add_event(req
, -EINVAL
);
4573 io_double_put_req(req
);
4577 if (io_op_defs
[req
->opcode
].needs_mm
&& !*mm
) {
4578 mm_fault
= mm_fault
|| !mmget_not_zero(ctx
->sqo_mm
);
4580 use_mm(ctx
->sqo_mm
);
4585 req
->ring_file
= ring_file
;
4586 req
->ring_fd
= ring_fd
;
4587 req
->has_user
= *mm
!= NULL
;
4588 req
->in_async
= async
;
4589 req
->needs_fixed_file
= async
;
4590 trace_io_uring_submit_sqe(ctx
, req
->user_data
, true, async
);
4591 if (!io_submit_sqe(req
, sqe
, statep
, &link
))
4595 if (submitted
!= nr
)
4596 percpu_ref_put_many(&ctx
->refs
, nr
- submitted
);
4598 io_queue_link_head(link
);
4600 io_submit_state_end(&state
);
4602 /* Commit SQ ring head once we've consumed and submitted all SQEs */
4603 io_commit_sqring(ctx
);
4608 static int io_sq_thread(void *data
)
4610 struct io_ring_ctx
*ctx
= data
;
4611 struct mm_struct
*cur_mm
= NULL
;
4612 const struct cred
*old_cred
;
4613 mm_segment_t old_fs
;
4616 unsigned long timeout
;
4619 complete(&ctx
->completions
[1]);
4623 old_cred
= override_creds(ctx
->creds
);
4625 ret
= timeout
= inflight
= 0;
4626 while (!kthread_should_park()) {
4627 unsigned int to_submit
;
4630 unsigned nr_events
= 0;
4632 if (ctx
->flags
& IORING_SETUP_IOPOLL
) {
4634 * inflight is the count of the maximum possible
4635 * entries we submitted, but it can be smaller
4636 * if we dropped some of them. If we don't have
4637 * poll entries available, then we know that we
4638 * have nothing left to poll for. Reset the
4639 * inflight count to zero in that case.
4641 mutex_lock(&ctx
->uring_lock
);
4642 if (!list_empty(&ctx
->poll_list
))
4643 __io_iopoll_check(ctx
, &nr_events
, 0);
4646 mutex_unlock(&ctx
->uring_lock
);
4649 * Normal IO, just pretend everything completed.
4650 * We don't have to poll completions for that.
4652 nr_events
= inflight
;
4655 inflight
-= nr_events
;
4657 timeout
= jiffies
+ ctx
->sq_thread_idle
;
4660 to_submit
= io_sqring_entries(ctx
);
4663 * If submit got -EBUSY, flag us as needing the application
4664 * to enter the kernel to reap and flush events.
4666 if (!to_submit
|| ret
== -EBUSY
) {
4668 * We're polling. If we're within the defined idle
4669 * period, then let us spin without work before going
4670 * to sleep. The exception is if we got EBUSY doing
4671 * more IO, we should wait for the application to
4672 * reap events and wake us up.
4675 (!time_after(jiffies
, timeout
) && ret
!= -EBUSY
)) {
4681 * Drop cur_mm before scheduling, we can't hold it for
4682 * long periods (or over schedule()). Do this before
4683 * adding ourselves to the waitqueue, as the unuse/drop
4692 prepare_to_wait(&ctx
->sqo_wait
, &wait
,
4693 TASK_INTERRUPTIBLE
);
4695 /* Tell userspace we may need a wakeup call */
4696 ctx
->rings
->sq_flags
|= IORING_SQ_NEED_WAKEUP
;
4697 /* make sure to read SQ tail after writing flags */
4700 to_submit
= io_sqring_entries(ctx
);
4701 if (!to_submit
|| ret
== -EBUSY
) {
4702 if (kthread_should_park()) {
4703 finish_wait(&ctx
->sqo_wait
, &wait
);
4706 if (signal_pending(current
))
4707 flush_signals(current
);
4709 finish_wait(&ctx
->sqo_wait
, &wait
);
4711 ctx
->rings
->sq_flags
&= ~IORING_SQ_NEED_WAKEUP
;
4714 finish_wait(&ctx
->sqo_wait
, &wait
);
4716 ctx
->rings
->sq_flags
&= ~IORING_SQ_NEED_WAKEUP
;
4719 to_submit
= min(to_submit
, ctx
->sq_entries
);
4720 mutex_lock(&ctx
->uring_lock
);
4721 ret
= io_submit_sqes(ctx
, to_submit
, NULL
, -1, &cur_mm
, true);
4722 mutex_unlock(&ctx
->uring_lock
);
4732 revert_creds(old_cred
);
4739 struct io_wait_queue
{
4740 struct wait_queue_entry wq
;
4741 struct io_ring_ctx
*ctx
;
4743 unsigned nr_timeouts
;
4746 static inline bool io_should_wake(struct io_wait_queue
*iowq
, bool noflush
)
4748 struct io_ring_ctx
*ctx
= iowq
->ctx
;
4751 * Wake up if we have enough events, or if a timeout occurred since we
4752 * started waiting. For timeouts, we always want to return to userspace,
4753 * regardless of event count.
4755 return io_cqring_events(ctx
, noflush
) >= iowq
->to_wait
||
4756 atomic_read(&ctx
->cq_timeouts
) != iowq
->nr_timeouts
;
4759 static int io_wake_function(struct wait_queue_entry
*curr
, unsigned int mode
,
4760 int wake_flags
, void *key
)
4762 struct io_wait_queue
*iowq
= container_of(curr
, struct io_wait_queue
,
4765 /* use noflush == true, as we can't safely rely on locking context */
4766 if (!io_should_wake(iowq
, true))
4769 return autoremove_wake_function(curr
, mode
, wake_flags
, key
);
4773 * Wait until events become available, if we don't already have some. The
4774 * application must reap them itself, as they reside on the shared cq ring.
4776 static int io_cqring_wait(struct io_ring_ctx
*ctx
, int min_events
,
4777 const sigset_t __user
*sig
, size_t sigsz
)
4779 struct io_wait_queue iowq
= {
4782 .func
= io_wake_function
,
4783 .entry
= LIST_HEAD_INIT(iowq
.wq
.entry
),
4786 .to_wait
= min_events
,
4788 struct io_rings
*rings
= ctx
->rings
;
4791 if (io_cqring_events(ctx
, false) >= min_events
)
4795 #ifdef CONFIG_COMPAT
4796 if (in_compat_syscall())
4797 ret
= set_compat_user_sigmask((const compat_sigset_t __user
*)sig
,
4801 ret
= set_user_sigmask(sig
, sigsz
);
4807 iowq
.nr_timeouts
= atomic_read(&ctx
->cq_timeouts
);
4808 trace_io_uring_cqring_wait(ctx
, min_events
);
4810 prepare_to_wait_exclusive(&ctx
->wait
, &iowq
.wq
,
4811 TASK_INTERRUPTIBLE
);
4812 if (io_should_wake(&iowq
, false))
4815 if (signal_pending(current
)) {
4820 finish_wait(&ctx
->wait
, &iowq
.wq
);
4822 restore_saved_sigmask_unless(ret
== -EINTR
);
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
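
/*
 * Illustrative userspace sketch (not kernel code) of the reaping side that
 * io_cqring_wait() serves; the ring pointers are assumed to come from the
 * mmap at IORING_OFF_CQ_RING, see the liburing examples for a complete
 * version.
 *
 *	io_uring_enter(ring_fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = smp_load_acquire(cq_tail);
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		// consume cqe->user_data / cqe->res
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);
 */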
4827 static void __io_sqe_files_unregister(struct io_ring_ctx
*ctx
)
4829 #if defined(CONFIG_UNIX)
4830 if (ctx
->ring_sock
) {
4831 struct sock
*sock
= ctx
->ring_sock
->sk
;
4832 struct sk_buff
*skb
;
4834 while ((skb
= skb_dequeue(&sock
->sk_receive_queue
)) != NULL
)
4840 for (i
= 0; i
< ctx
->nr_user_files
; i
++) {
4843 file
= io_file_from_index(ctx
, i
);
4850 static void io_file_ref_kill(struct percpu_ref
*ref
)
4852 struct fixed_file_data
*data
;
4854 data
= container_of(ref
, struct fixed_file_data
, refs
);
4855 complete(&data
->done
);
4858 static int io_sqe_files_unregister(struct io_ring_ctx
*ctx
)
4860 struct fixed_file_data
*data
= ctx
->file_data
;
4861 unsigned nr_tables
, i
;
4866 /* protect against inflight atomic switch, which drops the ref */
4867 flush_work(&data
->ref_work
);
4868 percpu_ref_get(&data
->refs
);
4869 percpu_ref_kill_and_confirm(&data
->refs
, io_file_ref_kill
);
4870 wait_for_completion(&data
->done
);
4871 percpu_ref_put(&data
->refs
);
4872 percpu_ref_exit(&data
->refs
);
4874 __io_sqe_files_unregister(ctx
);
4875 nr_tables
= DIV_ROUND_UP(ctx
->nr_user_files
, IORING_MAX_FILES_TABLE
);
4876 for (i
= 0; i
< nr_tables
; i
++)
4877 kfree(data
->table
[i
].files
);
4880 ctx
->file_data
= NULL
;
4881 ctx
->nr_user_files
= 0;
4885 static void io_sq_thread_stop(struct io_ring_ctx
*ctx
)
4887 if (ctx
->sqo_thread
) {
4888 wait_for_completion(&ctx
->completions
[1]);
4890 * The park is a bit of a work-around, without it we get
4891 * warning spews on shutdown with SQPOLL set and affinity
4892 * set to a single CPU.
4894 kthread_park(ctx
->sqo_thread
);
4895 kthread_stop(ctx
->sqo_thread
);
4896 ctx
->sqo_thread
= NULL
;
4900 static void io_finish_async(struct io_ring_ctx
*ctx
)
4902 io_sq_thread_stop(ctx
);
4905 io_wq_destroy(ctx
->io_wq
);
4910 #if defined(CONFIG_UNIX)
4912 * Ensure the UNIX gc is aware of our file set, so we are certain that
4913 * the io_uring can be safely unregistered on process exit, even if we have
4914 * loops in the file referencing.
4916 static int __io_sqe_files_scm(struct io_ring_ctx
*ctx
, int nr
, int offset
)
4918 struct sock
*sk
= ctx
->ring_sock
->sk
;
4919 struct scm_fp_list
*fpl
;
4920 struct sk_buff
*skb
;
4923 if (!capable(CAP_SYS_RESOURCE
) && !capable(CAP_SYS_ADMIN
)) {
4924 unsigned long inflight
= ctx
->user
->unix_inflight
+ nr
;
4926 if (inflight
> task_rlimit(current
, RLIMIT_NOFILE
))
4930 fpl
= kzalloc(sizeof(*fpl
), GFP_KERNEL
);
4934 skb
= alloc_skb(0, GFP_KERNEL
);
4943 fpl
->user
= get_uid(ctx
->user
);
4944 for (i
= 0; i
< nr
; i
++) {
4945 struct file
*file
= io_file_from_index(ctx
, i
+ offset
);
4949 fpl
->fp
[nr_files
] = get_file(file
);
4950 unix_inflight(fpl
->user
, fpl
->fp
[nr_files
]);
4955 fpl
->max
= SCM_MAX_FD
;
4956 fpl
->count
= nr_files
;
4957 UNIXCB(skb
).fp
= fpl
;
4958 skb
->destructor
= unix_destruct_scm
;
4959 refcount_add(skb
->truesize
, &sk
->sk_wmem_alloc
);
4960 skb_queue_head(&sk
->sk_receive_queue
, skb
);
4962 for (i
= 0; i
< nr_files
; i
++)
4973 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
4974 * causes regular reference counting to break down. We rely on the UNIX
4975 * garbage collection to take care of this problem for us.
4977 static int io_sqe_files_scm(struct io_ring_ctx
*ctx
)
4979 unsigned left
, total
;
4983 left
= ctx
->nr_user_files
;
4985 unsigned this_files
= min_t(unsigned, left
, SCM_MAX_FD
);
4987 ret
= __io_sqe_files_scm(ctx
, this_files
, total
);
4991 total
+= this_files
;
4997 while (total
< ctx
->nr_user_files
) {
4998 struct file
*file
= io_file_from_index(ctx
, total
);
5008 static int io_sqe_files_scm(struct io_ring_ctx
*ctx
)
5014 static int io_sqe_alloc_file_tables(struct io_ring_ctx
*ctx
, unsigned nr_tables
,
5019 for (i
= 0; i
< nr_tables
; i
++) {
5020 struct fixed_file_table
*table
= &ctx
->file_data
->table
[i
];
5021 unsigned this_files
;
5023 this_files
= min(nr_files
, IORING_MAX_FILES_TABLE
);
5024 table
->files
= kcalloc(this_files
, sizeof(struct file
*),
5028 nr_files
-= this_files
;
5034 for (i
= 0; i
< nr_tables
; i
++) {
5035 struct fixed_file_table
*table
= &ctx
->file_data
->table
[i
];
5036 kfree(table
->files
);
5041 static void io_ring_file_put(struct io_ring_ctx
*ctx
, struct file
*file
)
5043 #if defined(CONFIG_UNIX)
5044 struct sock
*sock
= ctx
->ring_sock
->sk
;
5045 struct sk_buff_head list
, *head
= &sock
->sk_receive_queue
;
5046 struct sk_buff
*skb
;
5049 __skb_queue_head_init(&list
);
5052 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5053 * remove this entry and rearrange the file array.
5055 skb
= skb_dequeue(head
);
5057 struct scm_fp_list
*fp
;
5059 fp
= UNIXCB(skb
).fp
;
5060 for (i
= 0; i
< fp
->count
; i
++) {
5063 if (fp
->fp
[i
] != file
)
5066 unix_notinflight(fp
->user
, fp
->fp
[i
]);
5067 left
= fp
->count
- 1 - i
;
5069 memmove(&fp
->fp
[i
], &fp
->fp
[i
+ 1],
5070 left
* sizeof(struct file
*));
5077 __skb_queue_tail(&list
, skb
);
5087 __skb_queue_tail(&list
, skb
);
5089 skb
= skb_dequeue(head
);
5092 if (skb_peek(&list
)) {
5093 spin_lock_irq(&head
->lock
);
5094 while ((skb
= __skb_dequeue(&list
)) != NULL
)
5095 __skb_queue_tail(head
, skb
);
5096 spin_unlock_irq(&head
->lock
);
5103 struct io_file_put
{
5104 struct llist_node llist
;
5106 struct completion
*done
;
5109 static void io_ring_file_ref_switch(struct work_struct
*work
)
5111 struct io_file_put
*pfile
, *tmp
;
5112 struct fixed_file_data
*data
;
5113 struct llist_node
*node
;
5115 data
= container_of(work
, struct fixed_file_data
, ref_work
);
5117 while ((node
= llist_del_all(&data
->put_llist
)) != NULL
) {
5118 llist_for_each_entry_safe(pfile
, tmp
, node
, llist
) {
5119 io_ring_file_put(data
->ctx
, pfile
->file
);
5121 complete(pfile
->done
);
5127 percpu_ref_get(&data
->refs
);
5128 percpu_ref_switch_to_percpu(&data
->refs
);
5131 static void io_file_data_ref_zero(struct percpu_ref
*ref
)
5133 struct fixed_file_data
*data
;
5135 data
= container_of(ref
, struct fixed_file_data
, refs
);
5137 /* we can't safely switch from inside this context, punt to wq */
5138 queue_work(system_wq
, &data
->ref_work
);
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
				 unsigned nr_args)
{
	__s32 __user *fds = (__s32 __user *) arg;
	unsigned nr_tables;
	struct file *file;
	int fd, ret = 0;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;

	ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
	if (!ctx->file_data)
		return -ENOMEM;
	ctx->file_data->ctx = ctx;
	init_completion(&ctx->file_data->done);

	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
	ctx->file_data->table = kcalloc(nr_tables,
					sizeof(struct fixed_file_table),
					GFP_KERNEL);
	if (!ctx->file_data->table) {
		kfree(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
		kfree(ctx->file_data->table);
		kfree(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}
	ctx->file_data->put_llist.first = NULL;
	INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);

	if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
		percpu_ref_exit(&ctx->file_data->refs);
		kfree(ctx->file_data->table);
		kfree(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct fixed_file_table *table;
		unsigned index;

		ret = -EFAULT;
		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
			break;
		/* allow sparse sets */
		if (fd == -1) {
			ret = 0;
			continue;
		}

		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
		index = i & IORING_FILE_TABLE_MASK;
		file = fget(fd);

		ret = -EBADF;
		if (!file)
			break;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (file->f_op == &io_uring_fops) {
			fput(file);
			break;
		}
		ret = 0;
		table->files[index] = file;
	}

	if (ret) {
		for (i = 0; i < ctx->nr_user_files; i++) {
			file = io_file_from_index(ctx, i);
			if (file)
				fput(file);
		}
		for (i = 0; i < nr_tables; i++)
			kfree(ctx->file_data->table[i].files);

		kfree(ctx->file_data->table);
		kfree(ctx->file_data);
		ctx->file_data = NULL;
		ctx->nr_user_files = 0;
		return ret;
	}

	ret = io_sqe_files_scm(ctx);
	if (ret)
		io_sqe_files_unregister(ctx);

	return ret;
}

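/*
 * Add a single file (installed through a files update) to the SCM_RIGHTS
 * bookkeeping: reuse an existing skb on the ring socket if it still has room,
 * otherwise fall back to __io_sqe_files_scm() for a fresh one.
 */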
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	if (skb) {
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}

static void io_atomic_switch(struct percpu_ref *ref)
{
	struct fixed_file_data *data;

	data = container_of(ref, struct fixed_file_data, refs);
	clear_bit(FFD_F_ATOMIC, &data->state);
}

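/*
 * Queue a registered file for removal. Returns true if the put was queued
 * for async processing; returns false if we had to fall back to the on-stack
 * entry and already waited for the drop to complete.
 */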
static bool io_queue_file_removal(struct fixed_file_data *data,
				  struct file *file)
{
	struct io_file_put *pfile, pfile_stack;
	DECLARE_COMPLETION_ONSTACK(done);

	/*
	 * If we fail allocating the struct we need for doing async removal
	 * of this file, just punt to sync and wait for it.
	 */
	pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
	if (!pfile) {
		pfile = &pfile_stack;
		pfile->done = &done;
	}

	pfile->file = file;
	llist_add(&pfile->llist, &data->put_llist);

	if (pfile == &pfile_stack) {
		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
			percpu_ref_put(&data->refs);
			percpu_ref_switch_to_atomic(&data->refs,
							io_atomic_switch);
		}
		wait_for_completion(&done);
		flush_work(&data->ref_work);
		return false;
	}

	return true;
}

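/*
 * Apply an IORING_REGISTER_FILES_UPDATE request: for each slot, drop any
 * existing file and install the new one (or leave the slot sparse for -1).
 * Returns the number of slots updated, or an error if none were.
 */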
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *up,
				 unsigned nr_args)
{
	struct fixed_file_data *data = ctx->file_data;
	bool ref_switch = false;
	struct file *file;
	__s32 __user *fds;
	int fd, i, err;
	__u32 done;

	if (check_add_overflow(up->offset, nr_args, &done))
		return -EOVERFLOW;
	if (done > ctx->nr_user_files)
		return -EINVAL;

	done = 0;
	fds = u64_to_user_ptr(up->fds);
	while (nr_args) {
		struct fixed_file_table *table;
		unsigned index;

		err = 0;
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		i = array_index_nospec(up->offset, ctx->nr_user_files);
		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
		index = i & IORING_FILE_TABLE_MASK;
		if (table->files[index]) {
			file = io_file_from_index(ctx, index);
			table->files[index] = NULL;
			if (io_queue_file_removal(data, file))
				ref_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (file->f_op == &io_uring_fops) {
				fput(file);
				err = -EBADF;
				break;
			}
			table->files[index] = file;
			err = io_sqe_file_register(ctx, file, i);
			if (err)
				break;
		}
		nr_args--;
		done++;
		up->offset++;
	}

	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
		percpu_ref_put(&data->refs);
		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
	}

	return done ? done : err;
}

static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned nr_args)
{
	struct io_uring_files_update up;

	if (!ctx->file_data)
		return -ENXIO;
	if (!nr_args)
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (up.resv)
		return -EINVAL;

	return __io_sqe_files_update(ctx, &up, nr_args);
}

static void io_put_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	io_put_req(req);
}

static void io_get_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	refcount_inc(&req->refs);
}

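/*
 * Set up the async offload machinery: grab the submitter's mm, optionally
 * create the SQPOLL kernel thread (CAP_SYS_ADMIN required, optionally pinned
 * to a CPU with IORING_SETUP_SQ_AFF), and create the io-wq used for punted
 * requests.
 */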
static int io_sq_offload_start(struct io_ring_ctx *ctx,
			       struct io_uring_params *p)
{
	struct io_wq_data data;
	unsigned concurrency;
	int ret;

	init_waitqueue_head(&ctx->sqo_wait);
	mmgrab(current->mm);
	ctx->sqo_mm = current->mm;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		ret = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto err;

		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids)
				goto err;
			if (!cpu_online(cpu))
				goto err;

			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
							ctx, cpu, "io_uring-sq");
		} else {
			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
							"io_uring-sq");
		}
		if (IS_ERR(ctx->sqo_thread)) {
			ret = PTR_ERR(ctx->sqo_thread);
			ctx->sqo_thread = NULL;
			goto err;
		}
		wake_up_process(ctx->sqo_thread);
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

	data.mm = ctx->sqo_mm;
	data.user = ctx->user;
	data.creds = ctx->creds;
	data.get_work = io_get_work;
	data.put_work = io_put_work;

	/* Do QD, or 4 * CPUS, whatever is smallest */
	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
	ctx->io_wq = io_wq_create(concurrency, &data);
	if (IS_ERR(ctx->io_wq)) {
		ret = PTR_ERR(ctx->io_wq);
		ctx->io_wq = NULL;
		goto err;
	}

	return 0;
err:
	io_finish_async(ctx);
	mmdrop(ctx->sqo_mm);
	ctx->sqo_mm = NULL;
	return ret;
}

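/*
 * Pinned ring and buffer memory is charged against the user's
 * RLIMIT_MEMLOCK; the cmpxchg loop below allows concurrent chargers
 * without taking a lock.
 */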
static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}

static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	do {
		cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
					new_pages) != cur_pages);

	return 0;
}

static void io_mem_free(void *ptr)
{
	struct page *page;

	if (!ptr)
		return;

	page = virt_to_head_page(ptr);
	if (put_page_testzero(page))
		free_compound_page(page);
}

static void *io_mem_alloc(size_t size)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
				__GFP_NORETRY;

	return (void *) __get_free_pages(gfp_flags, get_order(size));
}

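/*
 * Compute the total allocation size for the rings: the io_rings header plus
 * the CQE array, followed by the SQ index array, which starts at *sq_offset
 * (cache-line aligned).
 */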
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
				size_t *sq_offset)
{
	struct io_rings *rings;
	size_t off, sq_array_size;

	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;

#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif

	if (sq_offset)
		*sq_offset = off;

	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;

	return off;
}

static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
{
	size_t pages;

	pages = (size_t)1 << get_order(
		rings_size(sq_entries, cq_entries, NULL));
	pages += (size_t)1 << get_order(
		array_size(sizeof(struct io_uring_sqe), sq_entries));

	return pages;
}

static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
{
	int i, j;

	if (!ctx->user_bufs)
		return -ENXIO;

	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++)
			put_user_page(imu->bvec[j].bv_page);

		if (ctx->account_mem)
			io_unaccount_mem(ctx->user, imu->nr_bvecs);
		kvfree(imu->bvec);
		imu->nr_bvecs = 0;
	}

	kfree(ctx->user_bufs);
	ctx->user_bufs = NULL;
	ctx->nr_user_bufs = 0;
	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

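/*
 * Register fixed buffers: each iovec is pinned with FOLL_LONGTERM and turned
 * into a bio_vec array so subsequent fixed reads/writes can skip the per-IO
 * get_user_pages() cost. File-backed (non-hugetlb) mappings are rejected.
 */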
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
				  unsigned nr_args)
{
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, j, got_pages = 0;
	int ret = -EINVAL;

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > UIO_MAXIOV)
		return -EINVAL;

	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
					GFP_KERNEL);
	if (!ctx->user_bufs)
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
		unsigned long off, start, end, ubuf;
		int pret, nr_pages;
		struct iovec iov;
		size_t size;

		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			goto err;

		/*
		 * Don't impose further limits on the size and buffer
		 * constraints here, we'll -EINVAL later when IO is
		 * submitted if they are wrong.
		 */
		ret = -EFAULT;
		if (!iov.iov_base || !iov.iov_len)
			goto err;

		/* arbitrary limit, but we need something */
		if (iov.iov_len > SZ_1G)
			goto err;

		ubuf = (unsigned long) iov.iov_base;
		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = ubuf >> PAGE_SHIFT;
		nr_pages = end - start;

		if (ctx->account_mem) {
			ret = io_account_mem(ctx->user, nr_pages);
			if (ret)
				goto err;
		}

		ret = 0;
		if (!pages || nr_pages > got_pages) {
			kvfree(vmas);
			kvfree(pages);
			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
						GFP_KERNEL);
			vmas = kvmalloc_array(nr_pages,
					sizeof(struct vm_area_struct *),
					GFP_KERNEL);
			if (!pages || !vmas) {
				ret = -ENOMEM;
				if (ctx->account_mem)
					io_unaccount_mem(ctx->user, nr_pages);
				goto err;
			}
			got_pages = nr_pages;
		}

		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
						GFP_KERNEL);
		ret = -ENOMEM;
		if (!imu->bvec) {
			if (ctx->account_mem)
				io_unaccount_mem(ctx->user, nr_pages);
			goto err;
		}

		ret = 0;
		down_read(&current->mm->mmap_sem);
		pret = get_user_pages(ubuf, nr_pages,
				      FOLL_WRITE | FOLL_LONGTERM,
				      pages, vmas);
		if (pret == nr_pages) {
			/* don't support file backed memory */
			for (j = 0; j < nr_pages; j++) {
				struct vm_area_struct *vma = vmas[j];

				if (vma->vm_file &&
				    !is_file_hugepages(vma->vm_file)) {
					ret = -EOPNOTSUPP;
					break;
				}
			}
		} else {
			ret = pret < 0 ? pret : -EFAULT;
		}
		up_read(&current->mm->mmap_sem);
		if (ret) {
			/*
			 * if we did partial map, or found file backed vmas,
			 * release any pages we did get
			 */
			if (pret > 0)
				put_user_pages(pages, pret);
			if (ctx->account_mem)
				io_unaccount_mem(ctx->user, nr_pages);
			kvfree(imu->bvec);
			goto err;
		}

		off = ubuf & ~PAGE_MASK;
		size = iov.iov_len;
		for (j = 0; j < nr_pages; j++) {
			size_t vec_len;

			vec_len = min_t(size_t, size, PAGE_SIZE - off);
			imu->bvec[j].bv_page = pages[j];
			imu->bvec[j].bv_len = vec_len;
			imu->bvec[j].bv_offset = off;
			off = 0;
			size -= vec_len;
		}
		/* store original address for later verification */
		imu->ubuf = ubuf;
		imu->len = iov.iov_len;
		imu->nr_bvecs = nr_pages;

		ctx->nr_user_bufs++;
	}
	kvfree(pages);
	kvfree(vmas);
	return 0;
err:
	kvfree(pages);
	kvfree(vmas);
	io_sqe_buffer_unregister(ctx);
	return ret;
}

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
	__s32 __user *fds = arg;
	int fd;

	if (ctx->cq_ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ctx->cq_ev_fd)) {
		int ret = PTR_ERR(ctx->cq_ev_fd);
		ctx->cq_ev_fd = NULL;
		return ret;
	}

	return 0;
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	if (ctx->cq_ev_fd) {
		eventfd_ctx_put(ctx->cq_ev_fd);
		ctx->cq_ev_fd = NULL;
		return 0;
	}

	return -ENXIO;
}

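/*
 * Final teardown of a ring context: stop the offload threads, drop all
 * registered resources, release the ring memory and the associated
 * accounting, uid and creds.
 */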
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_finish_async(ctx);
	if (ctx->sqo_mm)
		mmdrop(ctx->sqo_mm);

	io_iopoll_reap_events(ctx);
	io_sqe_buffer_unregister(ctx);
	io_sqe_files_unregister(ctx);
	io_eventfd_unregister(ctx);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	if (ctx->account_mem)
		io_unaccount_mem(ctx->user,
				ring_pages(ctx->sq_entries, ctx->cq_entries));
	free_uid(ctx->user);
	put_cred(ctx->creds);
	kfree(ctx->completions);
	kfree(ctx->cancel_hash);
	kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
	    ctx->rings->sq_ring_entries)
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static int io_uring_fasync(int fd, struct file *file, int on)
{
	struct io_ring_ctx *ctx = file->private_data;

	return fasync_helper(fd, file, on, &ctx->cq_fasync);
}

static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	mutex_unlock(&ctx->uring_lock);

	io_kill_timeouts(ctx);
	io_poll_remove_all(ctx);

	if (ctx->io_wq)
		io_wq_cancel_all(ctx->io_wq);

	io_iopoll_reap_events(ctx);
	/* if we failed setting up the ctx, we might not have any rings */
	if (ctx->rings)
		io_cqring_overflow_flush(ctx, true);
	wait_for_completion(&ctx->completions[0]);
	io_ring_ctx_free(ctx);
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

static void io_uring_cancel_files(struct io_ring_ctx *ctx,
				  struct files_struct *files)
{
	struct io_kiocb *req;
	DEFINE_WAIT(wait);

	while (!list_empty_careful(&ctx->inflight_list)) {
		struct io_kiocb *cancel_req = NULL;

		spin_lock_irq(&ctx->inflight_lock);
		list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
			if (req->work.files != files)
				continue;
			/* req is being completed, ignore */
			if (!refcount_inc_not_zero(&req->refs))
				continue;
			cancel_req = req;
			break;
		}
		if (cancel_req)
			prepare_to_wait(&ctx->inflight_wait, &wait,
						TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&ctx->inflight_lock);

		/* We need to keep going until we don't find a matching req */
		if (!cancel_req)
			break;

		io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
		io_put_req(cancel_req);
		schedule();
	}
	finish_wait(&ctx->inflight_wait, &wait);
}

static int io_uring_flush(struct file *file, void *data)
{
	struct io_ring_ctx *ctx = file->private_data;

	io_uring_cancel_files(ctx, data);
	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
		io_cqring_overflow_flush(ctx, true);
		io_wq_cancel_all(ctx->io_wq);
	}

	return 0;
}

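/*
 * Translate an mmap offset (IORING_OFF_SQ_RING, IORING_OFF_CQ_RING or
 * IORING_OFF_SQES) into the kernel address backing that region, and make
 * sure the requested length fits within it.
 */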
static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	loff_t offset = pgoff << PAGE_SHIFT;
	struct page *page;
	void *ptr;

	switch (offset) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		ptr = ctx->rings;
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	page = virt_to_head_page(ptr);
	if (sz > page_size(page))
		return ERR_PTR(-EINVAL);

	return ptr;
}

#ifdef CONFIG_MMU

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned long pfn;
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */

SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	int submitted = 0;
	struct fd f;

	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
	if (!percpu_ref_tryget(&ctx->refs))
		goto out_fput;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (!list_empty_careful(&ctx->cq_overflow_list))
			io_cqring_overflow_flush(ctx, false);
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sqo_wait);
		submitted = to_submit;
	} else if (to_submit) {
		struct mm_struct *cur_mm;

		if (current->mm != ctx->sqo_mm ||
		    current_cred() != ctx->creds) {
			ret = -EPERM;
			goto out;
		}

		to_submit = min(to_submit, ctx->sq_entries);
		mutex_lock(&ctx->uring_lock);
		/* already have mm, so io_submit_sqes() won't try to grab it */
		cur_mm = ctx->sqo_mm;
		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
					   &cur_mm, false);
		mutex_unlock(&ctx->uring_lock);

		if (submitted != to_submit)
			goto out;
	}
	if (flags & IORING_ENTER_GETEVENTS) {
		unsigned nr_events = 0;

		min_complete = min(min_complete, ctx->cq_entries);

		if (ctx->flags & IORING_SETUP_IOPOLL) {
			ret = io_iopoll_check(ctx, &nr_events, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}
	}

out:
	percpu_ref_put(&ctx->refs);
out_fput:
	fdput(f);
	return submitted ? submitted : ret;
}

static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.flush		= io_uring_flush,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
	.poll		= io_uring_poll,
	.fasync		= io_uring_fasync,
};

static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
				  struct io_uring_params *p)
{
	struct io_rings *rings;
	size_t size, sq_array_offset;

	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	rings = io_mem_alloc(size);
	if (!rings)
		return -ENOMEM;

	ctx->rings = rings;
	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;
	ctx->sq_mask = rings->sq_ring_mask;
	ctx->cq_mask = rings->cq_ring_mask;
	ctx->sq_entries = rings->sq_ring_entries;
	ctx->cq_entries = rings->cq_ring_entries;

	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
	if (size == SIZE_MAX) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
		return -EOVERFLOW;
	}

	ctx->sq_sqes = io_mem_alloc(size);
	if (!ctx->sq_sqes) {
		io_mem_free(ctx->rings);
		ctx->rings = NULL;
		return -ENOMEM;
	}

	return 0;
}

/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
static int io_uring_get_fd(struct io_ring_ctx *ctx)
{
	struct file *file;
	int ret;

#if defined(CONFIG_UNIX)
	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
		return ret;
#endif

	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto err;

	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
					O_RDWR | O_CLOEXEC);
	if (IS_ERR(file)) {
		put_unused_fd(ret);
		ret = PTR_ERR(file);
		goto err;
	}

#if defined(CONFIG_UNIX)
	ctx->ring_sock->file = file;
#endif
	fd_install(ret, file);
	return ret;
err:
#if defined(CONFIG_UNIX)
	sock_release(ctx->ring_sock);
	ctx->ring_sock = NULL;
#endif
	return ret;
}

static int io_uring_create(unsigned entries, struct io_uring_params *p)
{
	struct user_struct *user = NULL;
	struct io_ring_ctx *ctx;
	bool account_mem;
	int ret;

	if (!entries || entries > IORING_MAX_ENTRIES)
		return -EINVAL;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
			return -EINVAL;
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	user = get_uid(current_user());
	account_mem = !capable(CAP_IPC_LOCK);

	if (account_mem) {
		ret = io_account_mem(user,
				ring_pages(p->sq_entries, p->cq_entries));
		if (ret) {
			free_uid(user);
			return ret;
		}
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx) {
		if (account_mem)
			io_unaccount_mem(user, ring_pages(p->sq_entries,
								p->cq_entries));
		free_uid(user);
		return -ENOMEM;
	}
	ctx->compat = in_compat_syscall();
	ctx->account_mem = account_mem;
	ctx->user = user;
	ctx->creds = get_current_cred();

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_start(ctx, p);
	if (ret)
		goto err;

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_get_fd(ctx);
	if (ret < 0)
		goto err;

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

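/*
 * Illustrative userspace sketch, not part of the kernel: after
 * io_uring_setup() returns, an application typically mmaps the regions at
 * the offsets published above and locates ring fields via sq_off/cq_off,
 * roughly:
 *
 *	void *sq_ptr = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *			    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			    ring_fd, IORING_OFF_SQ_RING);
 *	unsigned *sq_tail = sq_ptr + p.sq_off.tail;
 *
 * With IORING_FEAT_SINGLE_MMAP the CQ ring shares this mapping; liburing
 * wraps all of this for applications.
 */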
 * Sets up an io_uring context, and returns the fd. The application asks for
 * a ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	long ret;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
		return -EINVAL;

	ret = io_uring_create(entries, &p);
	if (ret < 0)
		return ret;

	if (copy_to_user(params, &p, sizeof(p)))
		return -EFAULT;

	return ret;
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (opcode != IORING_UNREGISTER_FILES &&
	    opcode != IORING_REGISTER_FILES_UPDATE) {
		percpu_ref_kill(&ctx->refs);

		/*
		 * Drop uring mutex before waiting for references to exit. If
		 * another thread is currently inside io_uring_enter() it might
		 * need to grab the uring_lock to make progress. If we hold it
		 * here across the drain wait, then we can deadlock. It's safe
		 * to drop the mutex here, since no new references will come in
		 * after we've killed the percpu ref.
		 */
		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&ctx->completions[0]);
		mutex_lock(&ctx->uring_lock);
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffer_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffer_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_sqe_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (opcode != IORING_UNREGISTER_FILES &&
	    opcode != IORING_REGISTER_FILES_UPDATE) {
		/* bring the ctx back to life */
		reinit_completion(&ctx->completions[0]);
		percpu_ref_reinit(&ctx->refs);
	}
	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ctx = f.file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
							ctx->cq_ev_fd != NULL, ret);
out_fput:
	fdput(f);
	return ret;
}

static int __init io_uring_init(void)
{
	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
	return 0;
}
__initcall(io_uring_init);