io_uring: cancel sqpoll via task_work
[mirror_ubuntu-jammy-kernel.git] / fs / io_uring.c
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
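/*
 * To make the pairing above concrete, here is a minimal userspace-side
 * sketch of reaping completions (illustrative only -- the variable names
 * are assumptions, not code taken from liburing). The load-acquire on the
 * CQ tail pairs with the kernel's store-release of the tail, and the
 * store-release of the CQ head pairs with the kernel's check for free CQE
 * slots:
 *
 *	unsigned head = *cq_head;			// only the app writes cq head
 *	unsigned tail = smp_load_acquire(cq_tail);	// kernel released the tail
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);		// publish consumed entries
 *
 * With IORING_SETUP_SQPOLL the application additionally needs a full
 * smp_mb() after bumping the SQ tail and before testing
 * IORING_SQ_NEED_WAKEUP in the SQ flags, as noted above.
 */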
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
2b188cc1
JA
58#include <linux/percpu.h>
59#include <linux/slab.h>
2b188cc1 60#include <linux/blkdev.h>
edafccee 61#include <linux/bvec.h>
2b188cc1
JA
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
6b06314c 65#include <net/scm.h>
2b188cc1
JA
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
edafccee
JA
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
aa4c3967 72#include <linux/highmem.h>
15b71abe
JA
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
4840e418 75#include <linux/fadvise.h>
3e4827b0 76#include <linux/eventpoll.h>
7d67af2c 77#include <linux/splice.h>
b41e9852 78#include <linux/task_work.h>
bcf5a063 79#include <linux/pagemap.h>
0f212204 80#include <linux/io_uring.h>
e4b4a13f 81#include <linux/freezer.h>
2b188cc1 82
c826bd7a
DD
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
2b188cc1
JA
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
561fb04a 89#include "io-wq.h"
2b188cc1 90
5277deaa 91#define IORING_MAX_ENTRIES 32768
33a107f0 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
65e19f54
JA
93
94/*
95 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
96 */
97#define IORING_FILE_TABLE_SHIFT 9
98#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
99#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
100#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
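/* i.e. a two-level table allowing at most 64 * 512 = 32768 fixed files */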
21b55dbc
SG
101#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
102 IORING_REGISTER_LAST + IORING_OP_LAST)
2b188cc1 103
b16fed66
PB
104#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
105 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
106 IOSQE_BUFFER_SELECT)
107
2b188cc1
JA
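/*
 * One side (kernel or application) only writes the head and the other side
 * only writes the tail, so keeping them on separate cachelines avoids false
 * sharing between producer and consumer.
 */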
108struct io_uring {
109 u32 head ____cacheline_aligned_in_smp;
110 u32 tail ____cacheline_aligned_in_smp;
111};
112
1e84b97b 113/*
75b28aff
HV
114 * This data is shared with the application through the mmap at offsets
115 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
116 *
117 * The offsets to the member fields are published through struct
118 * io_sqring_offsets when calling io_uring_setup.
119 */
75b28aff 120struct io_rings {
1e84b97b
SB
121 /*
122 * Head and tail offsets into the ring; the offsets need to be
123 * masked to get valid indices.
124 *
75b28aff
HV
125 * The kernel controls head of the sq ring and the tail of the cq ring,
126 * and the application controls tail of the sq ring and the head of the
127 * cq ring.
1e84b97b 128 */
75b28aff 129 struct io_uring sq, cq;
1e84b97b 130 /*
75b28aff 131 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
132 * ring_entries - 1)
133 */
75b28aff
HV
134 u32 sq_ring_mask, cq_ring_mask;
135 /* Ring sizes (constant, power of 2) */
136 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
137 /*
138 * Number of invalid entries dropped by the kernel due to
139 * invalid index stored in array
140 *
141 * Written by the kernel, shouldn't be modified by the
142 * application (i.e. get number of "new events" by comparing to
143 * cached value).
144 *
145 * After a new SQ head value has been read by the application, this
146 * counter includes all submissions that were dropped reaching
147 * the new SQ head (and possibly more).
148 */
75b28aff 149 u32 sq_dropped;
1e84b97b 150 /*
0d9b5b3a 151 * Runtime SQ flags
1e84b97b
SB
152 *
153 * Written by the kernel, shouldn't be modified by the
154 * application.
155 *
156 * The application needs a full memory barrier before checking
157 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
158 */
75b28aff 159 u32 sq_flags;
0d9b5b3a
SG
160 /*
161 * Runtime CQ flags
162 *
163 * Written by the application, shouldn't be modified by the
164 * kernel.
165 */
166 u32 cq_flags;
1e84b97b
SB
167 /*
168 * Number of completion events lost because the queue was full;
169 * this should be avoided by the application by making sure
0b4295b5 170 * there are not more requests pending than there is space in
1e84b97b
SB
171 * the completion queue.
172 *
173 * Written by the kernel, shouldn't be modified by the
174 * application (i.e. get number of "new events" by comparing to
175 * cached value).
176 *
177 * As completion events come in out of order this counter is not
178 * ordered with any other data.
179 */
75b28aff 180 u32 cq_overflow;
1e84b97b
SB
181 /*
182 * Ring buffer of completion events.
183 *
184 * The kernel writes completion events fresh every time they are
185 * produced, so the application is allowed to modify pending
186 * entries.
187 */
75b28aff 188 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
189};
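/*
 * Example of how the shared fields above combine: the number of completions
 * ready for the application and the submission slots still free both fall
 * out of the head/tail pairs (a userspace-side sketch, assuming a mapped
 * struct io_rings *r; not code from liburing):
 *
 *	unsigned cq_ready = smp_load_acquire(&r->cq.tail) - r->cq.head;
 *	unsigned sq_free  = r->sq_ring_entries -
 *			    (r->sq.tail - smp_load_acquire(&r->sq.head));
 *
 * Dropped submissions and overflowed completions are accounted separately
 * in sq_dropped and cq_overflow, so this arithmetic never includes them.
 */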
190
45d189c6
PB
191enum io_uring_cmd_flags {
192 IO_URING_F_NONBLOCK = 1,
889fca73 193 IO_URING_F_COMPLETE_DEFER = 2,
45d189c6
PB
194};
195
edafccee
JA
196struct io_mapped_ubuf {
197 u64 ubuf;
198 size_t len;
199 struct bio_vec *bvec;
200 unsigned int nr_bvecs;
de293938 201 unsigned long acct_pages;
edafccee
JA
202};
203
50238531
BM
204struct io_ring_ctx;
205
269bbe5f
BM
206struct io_rsrc_put {
207 struct list_head list;
50238531
BM
208 union {
209 void *rsrc;
210 struct file *file;
211 };
269bbe5f
BM
212};
213
214struct fixed_rsrc_table {
65e19f54 215 struct file **files;
31b51510
JA
216};
217
269bbe5f 218struct fixed_rsrc_ref_node {
05589553
XW
219 struct percpu_ref refs;
220 struct list_head node;
269bbe5f
BM
221 struct list_head rsrc_list;
222 struct fixed_rsrc_data *rsrc_data;
50238531
BM
223 void (*rsrc_put)(struct io_ring_ctx *ctx,
224 struct io_rsrc_put *prsrc);
4a38aed2 225 struct llist_node llist;
e297822b 226 bool done;
05589553
XW
227};
228
269bbe5f
BM
229struct fixed_rsrc_data {
230 struct fixed_rsrc_table *table;
05f3fb3c
JA
231 struct io_ring_ctx *ctx;
232
269bbe5f 233 struct fixed_rsrc_ref_node *node;
05f3fb3c 234 struct percpu_ref refs;
05f3fb3c 235 struct completion done;
8bad28d8 236 bool quiesce;
05f3fb3c
JA
237};
238
5a2e745d
JA
239struct io_buffer {
240 struct list_head list;
241 __u64 addr;
242 __s32 len;
243 __u16 bid;
244};
245
21b55dbc
SG
246struct io_restriction {
247 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
248 DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
249 u8 sqe_flags_allowed;
250 u8 sqe_flags_required;
7e84e1c7 251 bool registered;
21b55dbc
SG
252};
253
37d1e2e3
JA
254enum {
255 IO_SQ_THREAD_SHOULD_STOP = 0,
256 IO_SQ_THREAD_SHOULD_PARK,
257};
258
534ca6d6
JA
259struct io_sq_data {
260 refcount_t refs;
05962f95 261 struct rw_semaphore rw_lock;
69fb2131
JA
262
263 /* ctx's that are using this sqd */
264 struct list_head ctx_list;
69fb2131 265
534ca6d6
JA
266 struct task_struct *thread;
267 struct wait_queue_head wait;
08369246
XW
268
269 unsigned sq_thread_idle;
37d1e2e3
JA
270 int sq_cpu;
271 pid_t task_pid;
5c2469e0 272 pid_t task_tgid;
37d1e2e3
JA
273
274 unsigned long state;
37d1e2e3 275 struct completion exited;
534ca6d6
JA
276};
277
258b29a9 278#define IO_IOPOLL_BATCH 8
6dd0be1e 279#define IO_COMPL_BATCH 32
6ff119a6 280#define IO_REQ_CACHE_SIZE 32
bf019da7 281#define IO_REQ_ALLOC_BATCH 8
258b29a9
PB
282
283struct io_comp_state {
6dd0be1e 284 struct io_kiocb *reqs[IO_COMPL_BATCH];
1b4c351f 285 unsigned int nr;
c7dae4ba
JA
286 unsigned int locked_free_nr;
287 /* inline/task_work completion list, under ->uring_lock */
1b4c351f 288 struct list_head free_list;
c7dae4ba
JA
289 /* IRQ completion list, under ->completion_lock */
290 struct list_head locked_free_list;
258b29a9
PB
291};
292
a1ab7b35
PB
293struct io_submit_link {
294 struct io_kiocb *head;
295 struct io_kiocb *last;
296};
297
258b29a9
PB
298struct io_submit_state {
299 struct blk_plug plug;
a1ab7b35 300 struct io_submit_link link;
258b29a9
PB
301
302 /*
303 * io_kiocb alloc cache
304 */
bf019da7 305 void *reqs[IO_REQ_CACHE_SIZE];
258b29a9
PB
306 unsigned int free_reqs;
307
308 bool plug_started;
309
310 /*
311 * Batch completion logic
312 */
313 struct io_comp_state comp;
314
315 /*
316 * File reference cache
317 */
318 struct file *file;
319 unsigned int fd;
320 unsigned int file_refs;
321 unsigned int ios_left;
322};
323
2b188cc1
JA
324struct io_ring_ctx {
325 struct {
326 struct percpu_ref refs;
327 } ____cacheline_aligned_in_smp;
328
329 struct {
330 unsigned int flags;
e1d85334 331 unsigned int compat: 1;
e1d85334
RD
332 unsigned int cq_overflow_flushed: 1;
333 unsigned int drain_next: 1;
334 unsigned int eventfd_async: 1;
21b55dbc 335 unsigned int restricted: 1;
2b188cc1 336
75b28aff
HV
337 /*
338 * Ring buffer of indices into array of io_uring_sqe, which is
339 * mmapped by the application using the IORING_OFF_SQES offset.
340 *
341 * This indirection could e.g. be used to assign fixed
342 * io_uring_sqe entries to operations and only submit them to
343 * the queue when needed.
344 *
345 * The kernel modifies neither the indices array nor the entries
346 * array.
347 */
348 u32 *sq_array;
2b188cc1
JA
349 unsigned cached_sq_head;
350 unsigned sq_entries;
351 unsigned sq_mask;
6c271ce2 352 unsigned sq_thread_idle;
498ccd9e 353 unsigned cached_sq_dropped;
2c3bac6d 354 unsigned cached_cq_overflow;
ad3eb2c8 355 unsigned long sq_check_overflow;
de0617e4 356
e941894e
JA
357 /* hashed buffered write serialization */
358 struct io_wq_hash *hash_map;
359
de0617e4 360 struct list_head defer_list;
5262f567 361 struct list_head timeout_list;
1d7bb1d5 362 struct list_head cq_overflow_list;
fcb323cc 363
ad3eb2c8 364 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
365 } ____cacheline_aligned_in_smp;
366
3c1a2ead
JA
367 struct {
368 struct mutex uring_lock;
369 wait_queue_head_t wait;
370 } ____cacheline_aligned_in_smp;
371
372 struct io_submit_state submit_state;
373
206aefde
JA
374 struct io_rings *rings;
375
2aede0e4
JA
376 /* Only used for accounting purposes */
377 struct mm_struct *mm_account;
378
7c30f36a 379 const struct cred *sq_creds; /* cred used for __io_sq_thread() */
534ca6d6
JA
380 struct io_sq_data *sq_data; /* if using sq thread polling */
381
90554200 382 struct wait_queue_head sqo_sq_wait;
69fb2131 383 struct list_head sqd_list;
75b28aff 384
6b06314c
JA
385 /*
386 * If used, fixed file set. Writers must ensure that ->refs is dead,
387 * readers must ensure that ->refs is alive as long as the file* is
388 * used. Only updated through io_uring_register(2).
389 */
269bbe5f 390 struct fixed_rsrc_data *file_data;
6b06314c
JA
391 unsigned nr_user_files;
392
edafccee
JA
393 /* if used, fixed mapped user buffers */
394 unsigned nr_user_bufs;
395 struct io_mapped_ubuf *user_bufs;
396
2b188cc1
JA
397 struct user_struct *user;
398
0f158b4c 399 struct completion ref_comp;
206aefde
JA
400
401#if defined(CONFIG_UNIX)
402 struct socket *ring_sock;
403#endif
404
5a2e745d
JA
405 struct idr io_buffer_idr;
406
61cf9370
MWO
407 struct xarray personalities;
408 u32 pers_next;
071698e1 409
206aefde
JA
410 struct {
411 unsigned cached_cq_tail;
412 unsigned cq_entries;
413 unsigned cq_mask;
414 atomic_t cq_timeouts;
f010505b 415 unsigned cq_last_tm_flush;
ad3eb2c8 416 unsigned long cq_check_overflow;
206aefde
JA
417 struct wait_queue_head cq_wait;
418 struct fasync_struct *cq_fasync;
419 struct eventfd_ctx *cq_ev_fd;
420 } ____cacheline_aligned_in_smp;
2b188cc1 421
2b188cc1
JA
422 struct {
423 spinlock_t completion_lock;
e94f141b 424
def596e9 425 /*
540e32a0 426 * ->iopoll_list is protected by the ctx->uring_lock for
def596e9
JA
427 * io_uring instances that don't use IORING_SETUP_SQPOLL.
428 * For SQPOLL, only the single threaded io_sq_thread() will
429 * manipulate the list, hence no extra locking is needed there.
430 */
540e32a0 431 struct list_head iopoll_list;
78076bb6
JA
432 struct hlist_head *cancel_hash;
433 unsigned cancel_hash_bits;
e94f141b 434 bool poll_multi_file;
31b51510 435
fcb323cc
JA
436 spinlock_t inflight_lock;
437 struct list_head inflight_list;
2b188cc1 438 } ____cacheline_aligned_in_smp;
85faa7b8 439
269bbe5f
BM
440 struct delayed_work rsrc_put_work;
441 struct llist_head rsrc_put_llist;
d67d2263
BM
442 struct list_head rsrc_ref_list;
443 spinlock_t rsrc_ref_lock;
4a38aed2 444
21b55dbc 445 struct io_restriction restrictions;
3c1a2ead 446
7c25c0d1
JA
447 /* exit task_work */
448 struct callback_head *exit_task_work;
449
e941894e
JA
450 struct wait_queue_head hash_wait;
451
3c1a2ead
JA
452 /* Keep this last, we don't need it for the fast path */
453 struct work_struct exit_work;
13bf43f5 454 struct list_head tctx_list;
2b188cc1
JA
455};
456
09bb8394
JA
457/*
458 * First field must be the file pointer in all the
459 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
460 */
221c5eb2
JA
461struct io_poll_iocb {
462 struct file *file;
018043be 463 struct wait_queue_head *head;
221c5eb2 464 __poll_t events;
8c838788 465 bool done;
221c5eb2 466 bool canceled;
392edb45 467 struct wait_queue_entry wait;
221c5eb2
JA
468};
469
018043be
PB
470struct io_poll_remove {
471 struct file *file;
472 u64 addr;
473};
474
b5dba59e
JA
475struct io_close {
476 struct file *file;
b5dba59e
JA
477 int fd;
478};
479
ad8a48ac
JA
480struct io_timeout_data {
481 struct io_kiocb *req;
482 struct hrtimer timer;
483 struct timespec64 ts;
484 enum hrtimer_mode mode;
485};
486
8ed8d3c3
JA
487struct io_accept {
488 struct file *file;
489 struct sockaddr __user *addr;
490 int __user *addr_len;
491 int flags;
09952e3e 492 unsigned long nofile;
8ed8d3c3
JA
493};
494
495struct io_sync {
496 struct file *file;
497 loff_t len;
498 loff_t off;
499 int flags;
d63d1b5e 500 int mode;
8ed8d3c3
JA
501};
502
fbf23849
JA
503struct io_cancel {
504 struct file *file;
505 u64 addr;
506};
507
b29472ee
JA
508struct io_timeout {
509 struct file *file;
bfe68a22
PB
510 u32 off;
511 u32 target_seq;
135fcde8 512 struct list_head list;
90cd7e42
PB
513 /* head of the link, used by linked timeouts only */
514 struct io_kiocb *head;
b29472ee
JA
515};
516
0bdf7a2d
PB
517struct io_timeout_rem {
518 struct file *file;
519 u64 addr;
9c8e11b3
PB
520
521 /* timeout update */
522 struct timespec64 ts;
523 u32 flags;
0bdf7a2d
PB
524};
525
9adbd45d
JA
526struct io_rw {
527 /* NOTE: kiocb has the file as the first member, so don't do it here */
528 struct kiocb kiocb;
529 u64 addr;
530 u64 len;
531};
532
3fbb51c1
JA
533struct io_connect {
534 struct file *file;
535 struct sockaddr __user *addr;
536 int addr_len;
537};
538
e47293fd
JA
539struct io_sr_msg {
540 struct file *file;
fddaface 541 union {
270a5940 542 struct user_msghdr __user *umsg;
fddaface
JA
543 void __user *buf;
544 };
e47293fd 545 int msg_flags;
bcda7baa 546 int bgid;
fddaface 547 size_t len;
bcda7baa 548 struct io_buffer *kbuf;
e47293fd
JA
549};
550
15b71abe
JA
551struct io_open {
552 struct file *file;
553 int dfd;
15b71abe 554 struct filename *filename;
c12cedf2 555 struct open_how how;
4022e7af 556 unsigned long nofile;
15b71abe
JA
557};
558
269bbe5f 559struct io_rsrc_update {
05f3fb3c
JA
560 struct file *file;
561 u64 arg;
562 u32 nr_args;
563 u32 offset;
564};
565
4840e418
JA
566struct io_fadvise {
567 struct file *file;
568 u64 offset;
569 u32 len;
570 u32 advice;
571};
572
c1ca757b
JA
573struct io_madvise {
574 struct file *file;
575 u64 addr;
576 u32 len;
577 u32 advice;
578};
579
3e4827b0
JA
580struct io_epoll {
581 struct file *file;
582 int epfd;
583 int op;
584 int fd;
585 struct epoll_event event;
e47293fd
JA
586};
587
7d67af2c
PB
588struct io_splice {
589 struct file *file_out;
590 struct file *file_in;
591 loff_t off_out;
592 loff_t off_in;
593 u64 len;
594 unsigned int flags;
595};
596
ddf0322d
JA
597struct io_provide_buf {
598 struct file *file;
599 __u64 addr;
600 __s32 len;
601 __u32 bgid;
602 __u16 nbufs;
603 __u16 bid;
604};
605
1d9e1288
BM
606struct io_statx {
607 struct file *file;
608 int dfd;
609 unsigned int mask;
610 unsigned int flags;
e62753e4 611 const char __user *filename;
1d9e1288
BM
612 struct statx __user *buffer;
613};
614
36f4fa68
JA
615struct io_shutdown {
616 struct file *file;
617 int how;
618};
619
80a261fd
JA
620struct io_rename {
621 struct file *file;
622 int old_dfd;
623 int new_dfd;
624 struct filename *oldpath;
625 struct filename *newpath;
626 int flags;
627};
628
14a1143b
JA
629struct io_unlink {
630 struct file *file;
631 int dfd;
632 int flags;
633 struct filename *filename;
634};
635
3ca405eb
PB
636struct io_completion {
637 struct file *file;
638 struct list_head list;
0f7e466b 639 int cflags;
3ca405eb
PB
640};
641
f499a021
JA
642struct io_async_connect {
643 struct sockaddr_storage address;
644};
645
03b1230c
JA
646struct io_async_msghdr {
647 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
648 /* points to an allocated iov, if NULL we use fast_iov instead */
649 struct iovec *free_iov;
03b1230c
JA
650 struct sockaddr __user *uaddr;
651 struct msghdr msg;
b537916c 652 struct sockaddr_storage addr;
03b1230c
JA
653};
654
f67676d1
JA
655struct io_async_rw {
656 struct iovec fast_iov[UIO_FASTIOV];
ff6165b2
JA
657 const struct iovec *free_iovec;
658 struct iov_iter iter;
227c0c96 659 size_t bytes_done;
bcf5a063 660 struct wait_page_queue wpq;
f67676d1
JA
661};
662
6b47ee6e
PB
663enum {
664 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
665 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
666 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
667 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
668 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 669 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
6b47ee6e 670
6b47ee6e
PB
671 REQ_F_FAIL_LINK_BIT,
672 REQ_F_INFLIGHT_BIT,
673 REQ_F_CUR_POS_BIT,
674 REQ_F_NOWAIT_BIT,
6b47ee6e 675 REQ_F_LINK_TIMEOUT_BIT,
6b47ee6e 676 REQ_F_ISREG_BIT,
99bc4c38 677 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 678 REQ_F_POLLED_BIT,
bcda7baa 679 REQ_F_BUFFER_SELECTED_BIT,
5b0bbee4 680 REQ_F_NO_FILE_TABLE_BIT,
900fad45 681 REQ_F_LTIMEOUT_ACTIVE_BIT,
e342c807 682 REQ_F_COMPLETE_INLINE_BIT,
84557871
JA
683
684 /* not a real bit, just to check we're not overflowing the space */
685 __REQ_F_LAST_BIT,
6b47ee6e
PB
686};
687
688enum {
689 /* ctx owns file */
690 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
691 /* drain existing IO first */
692 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
693 /* linked sqes */
694 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
695 /* doesn't sever on completion < 0 */
696 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
697 /* IOSQE_ASYNC */
698 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
699 /* IOSQE_BUFFER_SELECT */
700 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
6b47ee6e 701
6b47ee6e
PB
702 /* fail rest of links */
703 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
b05a1bcd 704 /* on inflight list, should be cancelled and reliably waited on at exit */
6b47ee6e
PB
705 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
706 /* read/write uses file position */
707 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
708 /* must not punt to workers */
709 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 710 /* has or had linked timeout */
6b47ee6e 711 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
6b47ee6e
PB
712 /* regular file */
713 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
99bc4c38
PB
714 /* needs cleanup */
715 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
716 /* already went through poll handler */
717 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
718 /* buffer already selected */
719 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
5b0bbee4
JA
720 /* doesn't need file table for this request */
721 REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
900fad45
PB
722 /* linked timeout is active, i.e. prepared by link's head */
723 REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
e342c807
PB
724 /* completion is deferred through io_comp_state */
725 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
d7718a9d
JA
726};
727
728struct async_poll {
729 struct io_poll_iocb poll;
807abcb0 730 struct io_poll_iocb *double_poll;
6b47ee6e
PB
731};
732
7cbf1722
JA
733struct io_task_work {
734 struct io_wq_work_node node;
735 task_work_func_t func;
736};
737
09bb8394
JA
738/*
739 * NOTE! Each of the iocb union members has the file pointer
740 * as the first entry in their struct definition. So you can
741 * access the file pointer through any of the sub-structs,
742 * or directly as just 'ki_filp' in this struct.
743 */
2b188cc1 744struct io_kiocb {
221c5eb2 745 union {
09bb8394 746 struct file *file;
9adbd45d 747 struct io_rw rw;
221c5eb2 748 struct io_poll_iocb poll;
018043be 749 struct io_poll_remove poll_remove;
8ed8d3c3
JA
750 struct io_accept accept;
751 struct io_sync sync;
fbf23849 752 struct io_cancel cancel;
b29472ee 753 struct io_timeout timeout;
0bdf7a2d 754 struct io_timeout_rem timeout_rem;
3fbb51c1 755 struct io_connect connect;
e47293fd 756 struct io_sr_msg sr_msg;
15b71abe 757 struct io_open open;
b5dba59e 758 struct io_close close;
269bbe5f 759 struct io_rsrc_update rsrc_update;
4840e418 760 struct io_fadvise fadvise;
c1ca757b 761 struct io_madvise madvise;
3e4827b0 762 struct io_epoll epoll;
7d67af2c 763 struct io_splice splice;
ddf0322d 764 struct io_provide_buf pbuf;
1d9e1288 765 struct io_statx statx;
36f4fa68 766 struct io_shutdown shutdown;
80a261fd 767 struct io_rename rename;
14a1143b 768 struct io_unlink unlink;
3ca405eb
PB
769 /* use only after cleaning per-op data, see io_clean_op() */
770 struct io_completion compl;
221c5eb2 771 };
2b188cc1 772
e8c2bc1f
JA
773 /* opcode allocated if it needs to store data for async defer */
774 void *async_data;
d625c6ee 775 u8 opcode;
65a6543d
XW
776 /* polled IO has completed */
777 u8 iopoll_completed;
2b188cc1 778
4f4eeba8 779 u16 buf_index;
9cf7c104 780 u32 result;
4f4eeba8 781
010e8e6b
PB
782 struct io_ring_ctx *ctx;
783 unsigned int flags;
784 refcount_t refs;
785 struct task_struct *task;
786 u64 user_data;
d7718a9d 787
f2f87370 788 struct io_kiocb *link;
269bbe5f 789 struct percpu_ref *fixed_rsrc_refs;
fcb323cc 790
d21ffe7e
PB
791 /*
792 * 1. used with ctx->iopoll_list with reads/writes
793 * 2. to track reqs with ->files (see io_op_def::file_table)
794 */
010e8e6b 795 struct list_head inflight_entry;
7cbf1722
JA
796 union {
797 struct io_task_work io_task_work;
798 struct callback_head task_work;
799 };
010e8e6b
PB
800 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
801 struct hlist_node hash_node;
802 struct async_poll *apoll;
803 struct io_wq_work work;
2b188cc1 804};
05589553 805
13bf43f5
PB
806struct io_tctx_node {
807 struct list_head ctx_node;
808 struct task_struct *task;
13bf43f5
PB
809 struct io_ring_ctx *ctx;
810};
811
27dc8338
PB
812struct io_defer_entry {
813 struct list_head list;
814 struct io_kiocb *req;
9cf7c104 815 u32 seq;
2b188cc1
JA
816};
817
d3656344 818struct io_op_def {
d3656344
JA
819 /* needs req->file assigned */
820 unsigned needs_file : 1;
d3656344
JA
821 /* hash wq insertion if file is a regular file */
822 unsigned hash_reg_file : 1;
823 /* unbound wq insertion if file is a non-regular file */
824 unsigned unbound_nonreg_file : 1;
66f4af93
JA
825 /* opcode is not supported by this kernel */
826 unsigned not_supported : 1;
8a72758c
JA
827 /* set if opcode supports polled "wait" */
828 unsigned pollin : 1;
829 unsigned pollout : 1;
bcda7baa
JA
830 /* op supports buffer selection */
831 unsigned buffer_select : 1;
e8c2bc1f
JA
832 /* must always have async data allocated */
833 unsigned needs_async_data : 1;
27926b68
JA
834 /* should block plug */
835 unsigned plug : 1;
e8c2bc1f
JA
836 /* size of async data needed, if any */
837 unsigned short async_size;
d3656344
JA
838};
839
0918682b 840static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
841 [IORING_OP_NOP] = {},
842 [IORING_OP_READV] = {
d3656344
JA
843 .needs_file = 1,
844 .unbound_nonreg_file = 1,
8a72758c 845 .pollin = 1,
4d954c25 846 .buffer_select = 1,
e8c2bc1f 847 .needs_async_data = 1,
27926b68 848 .plug = 1,
e8c2bc1f 849 .async_size = sizeof(struct io_async_rw),
d3656344 850 },
0463b6c5 851 [IORING_OP_WRITEV] = {
d3656344
JA
852 .needs_file = 1,
853 .hash_reg_file = 1,
854 .unbound_nonreg_file = 1,
8a72758c 855 .pollout = 1,
e8c2bc1f 856 .needs_async_data = 1,
27926b68 857 .plug = 1,
e8c2bc1f 858 .async_size = sizeof(struct io_async_rw),
d3656344 859 },
0463b6c5 860 [IORING_OP_FSYNC] = {
d3656344
JA
861 .needs_file = 1,
862 },
0463b6c5 863 [IORING_OP_READ_FIXED] = {
d3656344
JA
864 .needs_file = 1,
865 .unbound_nonreg_file = 1,
8a72758c 866 .pollin = 1,
27926b68 867 .plug = 1,
e8c2bc1f 868 .async_size = sizeof(struct io_async_rw),
d3656344 869 },
0463b6c5 870 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
871 .needs_file = 1,
872 .hash_reg_file = 1,
873 .unbound_nonreg_file = 1,
8a72758c 874 .pollout = 1,
27926b68 875 .plug = 1,
e8c2bc1f 876 .async_size = sizeof(struct io_async_rw),
d3656344 877 },
0463b6c5 878 [IORING_OP_POLL_ADD] = {
d3656344
JA
879 .needs_file = 1,
880 .unbound_nonreg_file = 1,
881 },
0463b6c5
PB
882 [IORING_OP_POLL_REMOVE] = {},
883 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
884 .needs_file = 1,
885 },
0463b6c5 886 [IORING_OP_SENDMSG] = {
d3656344
JA
887 .needs_file = 1,
888 .unbound_nonreg_file = 1,
8a72758c 889 .pollout = 1,
e8c2bc1f
JA
890 .needs_async_data = 1,
891 .async_size = sizeof(struct io_async_msghdr),
d3656344 892 },
0463b6c5 893 [IORING_OP_RECVMSG] = {
d3656344
JA
894 .needs_file = 1,
895 .unbound_nonreg_file = 1,
8a72758c 896 .pollin = 1,
52de1fe1 897 .buffer_select = 1,
e8c2bc1f
JA
898 .needs_async_data = 1,
899 .async_size = sizeof(struct io_async_msghdr),
d3656344 900 },
0463b6c5 901 [IORING_OP_TIMEOUT] = {
e8c2bc1f
JA
902 .needs_async_data = 1,
903 .async_size = sizeof(struct io_timeout_data),
d3656344 904 },
9c8e11b3
PB
905 [IORING_OP_TIMEOUT_REMOVE] = {
906 /* used by timeout updates' prep() */
9c8e11b3 907 },
0463b6c5 908 [IORING_OP_ACCEPT] = {
d3656344
JA
909 .needs_file = 1,
910 .unbound_nonreg_file = 1,
8a72758c 911 .pollin = 1,
d3656344 912 },
0463b6c5
PB
913 [IORING_OP_ASYNC_CANCEL] = {},
914 [IORING_OP_LINK_TIMEOUT] = {
e8c2bc1f
JA
915 .needs_async_data = 1,
916 .async_size = sizeof(struct io_timeout_data),
d3656344 917 },
0463b6c5 918 [IORING_OP_CONNECT] = {
d3656344
JA
919 .needs_file = 1,
920 .unbound_nonreg_file = 1,
8a72758c 921 .pollout = 1,
e8c2bc1f
JA
922 .needs_async_data = 1,
923 .async_size = sizeof(struct io_async_connect),
d3656344 924 },
0463b6c5 925 [IORING_OP_FALLOCATE] = {
d3656344 926 .needs_file = 1,
d3656344 927 },
44526bed
JA
928 [IORING_OP_OPENAT] = {},
929 [IORING_OP_CLOSE] = {},
930 [IORING_OP_FILES_UPDATE] = {},
931 [IORING_OP_STATX] = {},
0463b6c5 932 [IORING_OP_READ] = {
3a6820f2
JA
933 .needs_file = 1,
934 .unbound_nonreg_file = 1,
8a72758c 935 .pollin = 1,
bcda7baa 936 .buffer_select = 1,
27926b68 937 .plug = 1,
e8c2bc1f 938 .async_size = sizeof(struct io_async_rw),
3a6820f2 939 },
0463b6c5 940 [IORING_OP_WRITE] = {
3a6820f2
JA
941 .needs_file = 1,
942 .unbound_nonreg_file = 1,
8a72758c 943 .pollout = 1,
27926b68 944 .plug = 1,
e8c2bc1f 945 .async_size = sizeof(struct io_async_rw),
3a6820f2 946 },
0463b6c5 947 [IORING_OP_FADVISE] = {
4840e418 948 .needs_file = 1,
c1ca757b 949 },
44526bed 950 [IORING_OP_MADVISE] = {},
0463b6c5 951 [IORING_OP_SEND] = {
fddaface
JA
952 .needs_file = 1,
953 .unbound_nonreg_file = 1,
8a72758c 954 .pollout = 1,
fddaface 955 },
0463b6c5 956 [IORING_OP_RECV] = {
fddaface
JA
957 .needs_file = 1,
958 .unbound_nonreg_file = 1,
8a72758c 959 .pollin = 1,
bcda7baa 960 .buffer_select = 1,
fddaface 961 },
0463b6c5 962 [IORING_OP_OPENAT2] = {
cebdb986 963 },
3e4827b0
JA
964 [IORING_OP_EPOLL_CTL] = {
965 .unbound_nonreg_file = 1,
3e4827b0 966 },
7d67af2c
PB
967 [IORING_OP_SPLICE] = {
968 .needs_file = 1,
969 .hash_reg_file = 1,
970 .unbound_nonreg_file = 1,
ddf0322d
JA
971 },
972 [IORING_OP_PROVIDE_BUFFERS] = {},
067524e9 973 [IORING_OP_REMOVE_BUFFERS] = {},
f2a8d5c7
PB
974 [IORING_OP_TEE] = {
975 .needs_file = 1,
976 .hash_reg_file = 1,
977 .unbound_nonreg_file = 1,
978 },
36f4fa68
JA
979 [IORING_OP_SHUTDOWN] = {
980 .needs_file = 1,
981 },
44526bed
JA
982 [IORING_OP_RENAMEAT] = {},
983 [IORING_OP_UNLINKAT] = {},
d3656344
JA
984};
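/*
 * The table is indexed directly by opcode, so a request's static properties
 * are a single array lookup away. For example, a hypothetical helper (not
 * part of this file) checking whether an opcode may use IOSQE_BUFFER_SELECT
 * could be written as:
 *
 *	static bool io_op_may_buffer_select(u8 opcode)
 *	{
 *		return opcode < IORING_OP_LAST &&
 *		       io_op_defs[opcode].buffer_select;
 *	}
 */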
985
7a612350 986static bool io_disarm_next(struct io_kiocb *req);
d56d938b 987static void io_uring_del_task_file(unsigned long index);
9936c7c2
PB
988static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
989 struct task_struct *task,
990 struct files_struct *files);
37d1e2e3 991static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
269bbe5f 992static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
bc9744cd 993static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
1ffc5422 994 struct io_ring_ctx *ctx);
f2303b1f 995static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
1ffc5422 996
23faba36 997static bool io_rw_reissue(struct io_kiocb *req);
78e19bbe 998static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 999static void io_put_req(struct io_kiocb *req);
216578e5 1000static void io_put_req_deferred(struct io_kiocb *req, int nr);
c40f6379 1001static void io_double_put_req(struct io_kiocb *req);
c7dae4ba
JA
1002static void io_dismantle_req(struct io_kiocb *req);
1003static void io_put_task(struct task_struct *task, int nr);
1004static void io_queue_next(struct io_kiocb *req);
94ae5e77 1005static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
7271ef3a 1006static void __io_queue_linked_timeout(struct io_kiocb *req);
94ae5e77 1007static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c 1008static int __io_sqe_files_update(struct io_ring_ctx *ctx,
269bbe5f 1009 struct io_uring_rsrc_update *ip,
05f3fb3c 1010 unsigned nr_args);
3ca405eb 1011static void __io_clean_op(struct io_kiocb *req);
8371adf5
PB
1012static struct file *io_file_get(struct io_submit_state *state,
1013 struct io_kiocb *req, int fd, bool fixed);
c5eef2b9 1014static void __io_queue_sqe(struct io_kiocb *req);
269bbe5f 1015static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1016
847595de
PB
1017static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
1018 struct iov_iter *iter, bool needs_lock);
ff6165b2
JA
1019static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
1020 const struct iovec *fast_iov,
227c0c96 1021 struct iov_iter *iter, bool force);
907d1df3 1022static void io_req_task_queue(struct io_kiocb *req);
65453d1e
JA
1023static void io_submit_flush_completions(struct io_comp_state *cs,
1024 struct io_ring_ctx *ctx);
de0617e4 1025
2b188cc1
JA
1026static struct kmem_cache *req_cachep;
1027
0918682b 1028static const struct file_operations io_uring_fops;
2b188cc1
JA
1029
1030struct sock *io_uring_get_socket(struct file *file)
1031{
1032#if defined(CONFIG_UNIX)
1033 if (file->f_op == &io_uring_fops) {
1034 struct io_ring_ctx *ctx = file->private_data;
1035
1036 return ctx->ring_sock->sk;
1037 }
1038#endif
1039 return NULL;
1040}
1041EXPORT_SYMBOL(io_uring_get_socket);
1042
f2f87370
PB
1043#define io_for_each_link(pos, head) \
1044 for (pos = (head); pos; pos = pos->link)
1045
3ca405eb
PB
1046static inline void io_clean_op(struct io_kiocb *req)
1047{
9d5c8190 1048 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
3ca405eb
PB
1049 __io_clean_op(req);
1050}
1051
36f72fe2
PB
1052static inline void io_set_resource_node(struct io_kiocb *req)
1053{
1054 struct io_ring_ctx *ctx = req->ctx;
1055
269bbe5f
BM
1056 if (!req->fixed_rsrc_refs) {
1057 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1058 percpu_ref_get(req->fixed_rsrc_refs);
36f72fe2
PB
1059 }
1060}
1061
08d23634
PB
1062static bool io_match_task(struct io_kiocb *head,
1063 struct task_struct *task,
1064 struct files_struct *files)
1065{
1066 struct io_kiocb *req;
1067
84965ff8
JA
1068 if (task && head->task != task) {
1069 /* in terms of cancelation, always match if req task is dead */
1070 if (head->task->flags & PF_EXITING)
1071 return true;
08d23634 1072 return false;
84965ff8 1073 }
08d23634
PB
1074 if (!files)
1075 return true;
1076
1077 io_for_each_link(req, head) {
b05a1bcd 1078 if (req->flags & REQ_F_INFLIGHT)
02a13674 1079 return true;
4379bf8b 1080 if (req->task->files == files)
08d23634
PB
1081 return true;
1082 }
1083 return false;
1084}
1085
c40f6379
JA
1086static inline void req_set_fail_links(struct io_kiocb *req)
1087{
1088 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1089 req->flags |= REQ_F_FAIL_LINK;
1090}
4a38aed2 1091
2b188cc1
JA
1092static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1093{
1094 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1095
0f158b4c 1096 complete(&ctx->ref_comp);
2b188cc1
JA
1097}
1098
8eb7e2d0
PB
1099static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1100{
1101 return !req->timeout.off;
1102}
1103
2b188cc1
JA
1104static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1105{
1106 struct io_ring_ctx *ctx;
78076bb6 1107 int hash_bits;
2b188cc1
JA
1108
1109 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1110 if (!ctx)
1111 return NULL;
1112
78076bb6
JA
1113 /*
1114 * Use 5 bits less than the max cq entries; that should give us around
1115 * 32 entries per hash list if totally full and uniformly spread.
1116 */
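	/*
	 * Worked example: with cq_entries == 4096, ilog2() gives 12, so
	 * hash_bits becomes 7 and the cancel hash gets 128 buckets --
	 * 4096 / 128 = 32 entries per bucket when completely full.
	 */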
1117 hash_bits = ilog2(p->cq_entries);
1118 hash_bits -= 5;
1119 if (hash_bits <= 0)
1120 hash_bits = 1;
1121 ctx->cancel_hash_bits = hash_bits;
1122 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1123 GFP_KERNEL);
1124 if (!ctx->cancel_hash)
1125 goto err;
1126 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1127
21482896 1128 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1129 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1130 goto err;
2b188cc1
JA
1131
1132 ctx->flags = p->flags;
90554200 1133 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1134 INIT_LIST_HEAD(&ctx->sqd_list);
2b188cc1 1135 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 1136 INIT_LIST_HEAD(&ctx->cq_overflow_list);
0f158b4c 1137 init_completion(&ctx->ref_comp);
5a2e745d 1138 idr_init(&ctx->io_buffer_idr);
61cf9370 1139 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
2b188cc1
JA
1140 mutex_init(&ctx->uring_lock);
1141 init_waitqueue_head(&ctx->wait);
1142 spin_lock_init(&ctx->completion_lock);
540e32a0 1143 INIT_LIST_HEAD(&ctx->iopoll_list);
de0617e4 1144 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1145 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
1146 spin_lock_init(&ctx->inflight_lock);
1147 INIT_LIST_HEAD(&ctx->inflight_list);
d67d2263
BM
1148 spin_lock_init(&ctx->rsrc_ref_lock);
1149 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1150 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1151 init_llist_head(&ctx->rsrc_put_llist);
13bf43f5 1152 INIT_LIST_HEAD(&ctx->tctx_list);
1b4c351f 1153 INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
c7dae4ba 1154 INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
2b188cc1 1155 return ctx;
206aefde 1156err:
78076bb6 1157 kfree(ctx->cancel_hash);
206aefde
JA
1158 kfree(ctx);
1159 return NULL;
2b188cc1
JA
1160}
1161
9cf7c104 1162static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1163{
2bc9930e
JA
1164 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1165 struct io_ring_ctx *ctx = req->ctx;
a197f664 1166
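		/*
		 * Keep deferring until the completions posted so far (the
		 * cached CQ tail plus anything pushed to the overflow side)
		 * have caught up with this request's submission sequence,
		 * i.e. everything submitted before it has completed.
		 */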
9cf7c104 1167 return seq != ctx->cached_cq_tail
2c3bac6d 1168 + READ_ONCE(ctx->cached_cq_overflow);
2bc9930e 1169 }
de0617e4 1170
9d858b21 1171 return false;
de0617e4
JA
1172}
1173
ce3d5aae
PB
1174static void io_req_track_inflight(struct io_kiocb *req)
1175{
1176 struct io_ring_ctx *ctx = req->ctx;
1177
1178 if (!(req->flags & REQ_F_INFLIGHT)) {
ce3d5aae
PB
1179 req->flags |= REQ_F_INFLIGHT;
1180
1181 spin_lock_irq(&ctx->inflight_lock);
1182 list_add(&req->inflight_entry, &ctx->inflight_list);
1183 spin_unlock_irq(&ctx->inflight_lock);
1184 }
1185}
1186
1e6fa521
JA
1187static void io_prep_async_work(struct io_kiocb *req)
1188{
1189 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1190 struct io_ring_ctx *ctx = req->ctx;
1191
003e8dcc
JA
1192 if (!req->work.creds)
1193 req->work.creds = get_current_cred();
1194
feaadc4f
PB
1195 if (req->flags & REQ_F_FORCE_ASYNC)
1196 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1197
1e6fa521
JA
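	/*
	 * Work on regular files that must be serialized (e.g. buffered
	 * writes to the same file) is hashed on the inode; work targeting
	 * non-regular files can go to the unbounded worker pool instead.
	 */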
1198 if (req->flags & REQ_F_ISREG) {
1199 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1200 io_wq_hash_work(&req->work, file_inode(req->file));
1201 } else {
1202 if (def->unbound_nonreg_file)
1203 req->work.flags |= IO_WQ_WORK_UNBOUND;
1204 }
561fb04a 1205}
cccf0ee8 1206
cbdcb435 1207static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1208{
cbdcb435 1209 struct io_kiocb *cur;
54a91f3b 1210
f2f87370
PB
1211 io_for_each_link(cur, req)
1212 io_prep_async_work(cur);
561fb04a
JA
1213}
1214
ebf93667 1215static void io_queue_async_work(struct io_kiocb *req)
561fb04a 1216{
a197f664 1217 struct io_ring_ctx *ctx = req->ctx;
cbdcb435 1218 struct io_kiocb *link = io_prep_linked_timeout(req);
5aa75ed5 1219 struct io_uring_task *tctx = req->task->io_uring;
561fb04a 1220
3bfe6106
JA
1221 BUG_ON(!tctx);
1222 BUG_ON(!tctx->io_wq);
561fb04a 1223
8766dd51
PB
1224 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1225 &req->work, req->flags);
cbdcb435
PB
1226 /* init ->work of the whole link before punting */
1227 io_prep_async_link(req);
ebf93667 1228 io_wq_enqueue(tctx->io_wq, &req->work);
7271ef3a
JA
1229 if (link)
1230 io_queue_linked_timeout(link);
cbdcb435
PB
1231}
1232
5262f567
JA
1233static void io_kill_timeout(struct io_kiocb *req)
1234{
e8c2bc1f 1235 struct io_timeout_data *io = req->async_data;
5262f567
JA
1236 int ret;
1237
e8c2bc1f 1238 ret = hrtimer_try_to_cancel(&io->timer);
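	/*
	 * hrtimer_try_to_cancel() returns -1 only if the timer callback is
	 * already running (and will complete the request itself); for any
	 * other return value the timer is dead and we complete it here.
	 */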
5262f567 1239 if (ret != -1) {
01cec8c1
PB
1240 atomic_set(&req->ctx->cq_timeouts,
1241 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1242 list_del_init(&req->timeout.list);
78e19bbe 1243 io_cqring_fill_event(req, 0);
216578e5 1244 io_put_req_deferred(req, 1);
5262f567
JA
1245 }
1246}
1247
76e1b642
JA
1248/*
1249 * Returns true if we found and killed one or more timeouts
1250 */
6b81928d
PB
1251static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1252 struct files_struct *files)
5262f567
JA
1253{
1254 struct io_kiocb *req, *tmp;
76e1b642 1255 int canceled = 0;
5262f567
JA
1256
1257 spin_lock_irq(&ctx->completion_lock);
f3606e3a 1258 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
6b81928d 1259 if (io_match_task(req, tsk, files)) {
f3606e3a 1260 io_kill_timeout(req);
76e1b642
JA
1261 canceled++;
1262 }
f3606e3a 1263 }
5262f567 1264 spin_unlock_irq(&ctx->completion_lock);
76e1b642 1265 return canceled != 0;
5262f567
JA
1266}
1267
04518945 1268static void __io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 1269{
04518945 1270 do {
27dc8338
PB
1271 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1272 struct io_defer_entry, list);
de0617e4 1273
9cf7c104 1274 if (req_need_defer(de->req, de->seq))
04518945 1275 break;
27dc8338 1276 list_del_init(&de->list);
907d1df3 1277 io_req_task_queue(de->req);
27dc8338 1278 kfree(de);
04518945
PB
1279 } while (!list_empty(&ctx->defer_list));
1280}
1281
360428f8 1282static void io_flush_timeouts(struct io_ring_ctx *ctx)
de0617e4 1283{
f010505b
MDG
1284 u32 seq;
1285
1286 if (list_empty(&ctx->timeout_list))
1287 return;
1288
1289 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1290
1291 do {
1292 u32 events_needed, events_got;
360428f8 1293 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
135fcde8 1294 struct io_kiocb, timeout.list);
de0617e4 1295
8eb7e2d0 1296 if (io_is_timeout_noseq(req))
360428f8 1297 break;
f010505b
MDG
1298
1299 /*
1300 * Since seq can easily wrap around over time, subtract
1301 * the last seq at which timeouts were flushed before comparing.
1302 * Assuming not more than 2^31-1 events have happened since,
1303 * these subtractions won't have wrapped, so we can check if
1304 * target is in [last_seq, current_seq] by comparing the two.
1305 */
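		/*
		 * Worked example: cq_last_tm_flush == 0xfffffff0, seq == 0x10
		 * and target_seq == 0x8 gives events_needed == 0x18 and
		 * events_got == 0x20, so the timeout is still flushed
		 * correctly even though the raw counters wrapped past zero.
		 */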
1306 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1307 events_got = seq - ctx->cq_last_tm_flush;
1308 if (events_got < events_needed)
360428f8 1309 break;
bfe68a22 1310
135fcde8 1311 list_del_init(&req->timeout.list);
5262f567 1312 io_kill_timeout(req);
f010505b
MDG
1313 } while (!list_empty(&ctx->timeout_list));
1314
1315 ctx->cq_last_tm_flush = seq;
360428f8 1316}
5262f567 1317
360428f8
PB
1318static void io_commit_cqring(struct io_ring_ctx *ctx)
1319{
1320 io_flush_timeouts(ctx);
ec30e04b
PB
1321
1322 /* order cqe stores with ring update */
1323 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
de0617e4 1324
04518945
PB
1325 if (unlikely(!list_empty(&ctx->defer_list)))
1326 __io_queue_deferred(ctx);
de0617e4
JA
1327}
1328
90554200
JA
1329static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1330{
1331 struct io_rings *r = ctx->rings;
1332
1333 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1334}
1335
888aae2e
PB
1336static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1337{
1338 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1339}
1340
2b188cc1
JA
1341static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1342{
75b28aff 1343 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1344 unsigned tail;
1345
115e12e5
SB
1346 /*
1347 * writes to the cq entry need to come after reading head; the
1348 * control dependency is enough as we're using WRITE_ONCE to
1349 * fill the cq entry
1350 */
888aae2e 1351 if (__io_cqring_events(ctx) == rings->cq_ring_entries)
2b188cc1
JA
1352 return NULL;
1353
888aae2e 1354 tail = ctx->cached_cq_tail++;
75b28aff 1355 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1356}
1357
f2842ab5
JA
1358static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1359{
f0b493e6
JA
1360 if (!ctx->cq_ev_fd)
1361 return false;
7e55a19c
SG
1362 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1363 return false;
f2842ab5
JA
1364 if (!ctx->eventfd_async)
1365 return true;
b41e9852 1366 return io_wq_current_is_worker();
f2842ab5
JA
1367}
1368
b41e9852 1369static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1370{
b1445e59
PB
1371 /* see waitqueue_active() comment */
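	/*
	 * The full barrier orders the CQ tail update before the
	 * waitqueue_active() checks below and pairs with the barrier implied
	 * by set_current_state() on the waiter's side, so a task that is in
	 * the middle of going to sleep cannot miss this event.
	 */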
1372 smp_mb();
1373
1d7bb1d5
JA
1374 if (waitqueue_active(&ctx->wait))
1375 wake_up(&ctx->wait);
534ca6d6
JA
1376 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1377 wake_up(&ctx->sq_data->wait);
b41e9852 1378 if (io_should_trigger_evfd(ctx))
1d7bb1d5 1379 eventfd_signal(ctx->cq_ev_fd, 1);
b1445e59 1380 if (waitqueue_active(&ctx->cq_wait)) {
4aa84f2f
PB
1381 wake_up_interruptible(&ctx->cq_wait);
1382 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1383 }
1d7bb1d5
JA
1384}
1385
80c18e4a
PB
1386static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1387{
b1445e59
PB
1388 /* see waitqueue_active() comment */
1389 smp_mb();
1390
80c18e4a
PB
1391 if (ctx->flags & IORING_SETUP_SQPOLL) {
1392 if (waitqueue_active(&ctx->wait))
1393 wake_up(&ctx->wait);
1394 }
1395 if (io_should_trigger_evfd(ctx))
1396 eventfd_signal(ctx->cq_ev_fd, 1);
b1445e59 1397 if (waitqueue_active(&ctx->cq_wait)) {
4aa84f2f
PB
1398 wake_up_interruptible(&ctx->cq_wait);
1399 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1400 }
80c18e4a
PB
1401}
1402
c4a2ed72 1403/* Returns true if there are no backlogged entries after the flush */
6c503150
PB
1404static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1405 struct task_struct *tsk,
1406 struct files_struct *files)
1d7bb1d5
JA
1407{
1408 struct io_rings *rings = ctx->rings;
e6c8aa9a 1409 struct io_kiocb *req, *tmp;
1d7bb1d5 1410 struct io_uring_cqe *cqe;
1d7bb1d5 1411 unsigned long flags;
b18032bb 1412 bool all_flushed, posted;
1d7bb1d5
JA
1413 LIST_HEAD(list);
1414
e23de15f
PB
1415 if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
1416 return false;
1d7bb1d5 1417
b18032bb 1418 posted = false;
1d7bb1d5 1419 spin_lock_irqsave(&ctx->completion_lock, flags);
e6c8aa9a 1420 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
08d23634 1421 if (!io_match_task(req, tsk, files))
e6c8aa9a
JA
1422 continue;
1423
1d7bb1d5
JA
1424 cqe = io_get_cqring(ctx);
1425 if (!cqe && !force)
1426 break;
1427
40d8ddd4 1428 list_move(&req->compl.list, &list);
1d7bb1d5
JA
1429 if (cqe) {
1430 WRITE_ONCE(cqe->user_data, req->user_data);
1431 WRITE_ONCE(cqe->res, req->result);
0f7e466b 1432 WRITE_ONCE(cqe->flags, req->compl.cflags);
1d7bb1d5 1433 } else {
2c3bac6d 1434 ctx->cached_cq_overflow++;
1d7bb1d5 1435 WRITE_ONCE(ctx->rings->cq_overflow,
2c3bac6d 1436 ctx->cached_cq_overflow);
1d7bb1d5 1437 }
b18032bb 1438 posted = true;
1d7bb1d5
JA
1439 }
1440
09e88404
PB
1441 all_flushed = list_empty(&ctx->cq_overflow_list);
1442 if (all_flushed) {
1443 clear_bit(0, &ctx->sq_check_overflow);
1444 clear_bit(0, &ctx->cq_check_overflow);
1445 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1446 }
46930143 1447
b18032bb
JA
1448 if (posted)
1449 io_commit_cqring(ctx);
1d7bb1d5 1450 spin_unlock_irqrestore(&ctx->completion_lock, flags);
b18032bb
JA
1451 if (posted)
1452 io_cqring_ev_posted(ctx);
1d7bb1d5
JA
1453
1454 while (!list_empty(&list)) {
40d8ddd4
PB
1455 req = list_first_entry(&list, struct io_kiocb, compl.list);
1456 list_del(&req->compl.list);
ec9c02ad 1457 io_put_req(req);
1d7bb1d5 1458 }
c4a2ed72 1459
09e88404 1460 return all_flushed;
1d7bb1d5
JA
1461}
1462
ca0a2651 1463static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
6c503150
PB
1464 struct task_struct *tsk,
1465 struct files_struct *files)
1466{
ca0a2651
JA
1467 bool ret = true;
1468
6c503150
PB
1469 if (test_bit(0, &ctx->cq_check_overflow)) {
1470 /* iopoll syncs against uring_lock, not completion_lock */
1471 if (ctx->flags & IORING_SETUP_IOPOLL)
1472 mutex_lock(&ctx->uring_lock);
ca0a2651 1473 ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
6c503150
PB
1474 if (ctx->flags & IORING_SETUP_IOPOLL)
1475 mutex_unlock(&ctx->uring_lock);
1476 }
ca0a2651
JA
1477
1478 return ret;
6c503150
PB
1479}
1480
bcda7baa 1481static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
2b188cc1 1482{
78e19bbe 1483 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1484 struct io_uring_cqe *cqe;
1485
78e19bbe 1486 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1487
2b188cc1
JA
1488 /*
1489 * If we can't get a cq entry, userspace overflowed the
1490 * submission (by quite a lot). Increment the overflow count in
1491 * the ring.
1492 */
1493 cqe = io_get_cqring(ctx);
1d7bb1d5 1494 if (likely(cqe)) {
78e19bbe 1495 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1496 WRITE_ONCE(cqe->res, res);
bcda7baa 1497 WRITE_ONCE(cqe->flags, cflags);
fdaf083c
JA
1498 } else if (ctx->cq_overflow_flushed ||
1499 atomic_read(&req->task->io_uring->in_idle)) {
0f212204
JA
1500 /*
1501 * If we're in ring overflow flush mode, or in task cancel mode,
1502 * then we cannot store the request for later flushing; we need
1503 * to drop it on the floor.
1504 */
2c3bac6d
PB
1505 ctx->cached_cq_overflow++;
1506 WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1d7bb1d5 1507 } else {
ad3eb2c8
JA
1508 if (list_empty(&ctx->cq_overflow_list)) {
1509 set_bit(0, &ctx->sq_check_overflow);
1510 set_bit(0, &ctx->cq_check_overflow);
6d5f9049 1511 ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
ad3eb2c8 1512 }
40d8ddd4 1513 io_clean_op(req);
1d7bb1d5 1514 req->result = res;
0f7e466b 1515 req->compl.cflags = cflags;
40d8ddd4
PB
1516 refcount_inc(&req->refs);
1517 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
2b188cc1
JA
1518 }
1519}
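/*
 * Either fallback above is visible to userspace: a bumped cq_overflow
 * counter means completions were dropped outright, while the
 * IORING_SQ_CQ_OVERFLOW bit in sq_flags means completions are parked on the
 * overflow list and will surface on a later flush. A userspace check might
 * look like (illustrative sketch only, not liburing code):
 *
 *	if (READ_ONCE(rings->sq_flags) & IORING_SQ_CQ_OVERFLOW)
 *		enter_kernel_to_flush();	// hypothetical helper
 *	if (READ_ONCE(rings->cq_overflow) != last_seen_overflow)
 *		account_lost_completions();	// hypothetical helper
 */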
1520
bcda7baa
JA
1521static void io_cqring_fill_event(struct io_kiocb *req, long res)
1522{
1523 __io_cqring_fill_event(req, res, 0);
1524}
1525
7a612350
PB
1526static void io_req_complete_post(struct io_kiocb *req, long res,
1527 unsigned int cflags)
2b188cc1 1528{
78e19bbe 1529 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1530 unsigned long flags;
1531
1532 spin_lock_irqsave(&ctx->completion_lock, flags);
bcda7baa 1533 __io_cqring_fill_event(req, res, cflags);
c7dae4ba
JA
1534 /*
1535 * If we're the last reference to this request, add to our locked
1536 * free_list cache.
1537 */
1538 if (refcount_dec_and_test(&req->refs)) {
1539 struct io_comp_state *cs = &ctx->submit_state.comp;
1540
7a612350
PB
1541 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1542 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
1543 io_disarm_next(req);
1544 if (req->link) {
1545 io_req_task_queue(req->link);
1546 req->link = NULL;
1547 }
1548 }
c7dae4ba
JA
1549 io_dismantle_req(req);
1550 io_put_task(req->task, 1);
1551 list_add(&req->compl.list, &cs->locked_free_list);
1552 cs->locked_free_nr++;
1553 } else
1554 req = NULL;
7a612350 1555 io_commit_cqring(ctx);
2b188cc1 1556 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8c838788 1557 io_cqring_ev_posted(ctx);
7a612350
PB
1558
1559 if (req)
c7dae4ba 1560 percpu_ref_put(&ctx->refs);
229a7b63
JA
1561}
1562
a38d68db 1563static void io_req_complete_state(struct io_kiocb *req, long res,
889fca73 1564 unsigned int cflags)
229a7b63 1565{
a38d68db
PB
1566 io_clean_op(req);
1567 req->result = res;
1568 req->compl.cflags = cflags;
e342c807 1569 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
1570}
1571
889fca73
PB
1572static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1573 long res, unsigned cflags)
bcda7baa 1574{
889fca73
PB
1575 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1576 io_req_complete_state(req, res, cflags);
a38d68db 1577 else
c7dae4ba 1578 io_req_complete_post(req, res, cflags);
bcda7baa
JA
1579}
1580
a38d68db 1581static inline void io_req_complete(struct io_kiocb *req, long res)
0ddf92e8 1582{
889fca73 1583 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
1584}
1585
c7dae4ba 1586static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
0ddf92e8 1587{
c7dae4ba
JA
1588 struct io_submit_state *state = &ctx->submit_state;
1589 struct io_comp_state *cs = &state->comp;
e5d1bc0a 1590 struct io_kiocb *req = NULL;
0ddf92e8 1591
c7dae4ba
JA
1592 /*
1593 * If we have more than a batch's worth of requests in our IRQ side
1594 * locked cache, grab the lock and move them over to our submission
1595 * side cache.
1596 */
1597 if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
1598 spin_lock_irq(&ctx->completion_lock);
1599 list_splice_init(&cs->locked_free_list, &cs->free_list);
1600 cs->locked_free_nr = 0;
1601 spin_unlock_irq(&ctx->completion_lock);
1602 }
0ddf92e8 1603
c7dae4ba
JA
1604 while (!list_empty(&cs->free_list)) {
1605 req = list_first_entry(&cs->free_list, struct io_kiocb,
1b4c351f
JA
1606 compl.list);
1607 list_del(&req->compl.list);
e5d1bc0a
PB
1608 state->reqs[state->free_reqs++] = req;
1609 if (state->free_reqs == ARRAY_SIZE(state->reqs))
1610 break;
1b4c351f
JA
1611 }
1612
e5d1bc0a 1613 return req != NULL;
0ddf92e8
JA
1614}
1615
e5d1bc0a 1616static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2b188cc1 1617{
e5d1bc0a
PB
1618 struct io_submit_state *state = &ctx->submit_state;
1619
1620 BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
1621
f6b6c7d6 1622 if (!state->free_reqs) {
291b2821 1623 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2579f913
JA
1624 int ret;
1625
c7dae4ba 1626 if (io_flush_cached_reqs(ctx))
e5d1bc0a
PB
1627 goto got_req;
1628
bf019da7
PB
1629 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1630 state->reqs);
fd6fab2c
JA
1631
1632 /*
1633 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1634 * retry single alloc to be on the safe side.
1635 */
1636 if (unlikely(ret <= 0)) {
1637 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1638 if (!state->reqs[0])
3893f39f 1639 return NULL;
fd6fab2c
JA
1640 ret = 1;
1641 }
291b2821 1642 state->free_reqs = ret;
2b188cc1 1643 }
e5d1bc0a 1644got_req:
291b2821
PB
1645 state->free_reqs--;
1646 return state->reqs[state->free_reqs];
2b188cc1
JA
1647}
1648
8da11c19
PB
1649static inline void io_put_file(struct io_kiocb *req, struct file *file,
1650 bool fixed)
1651{
36f72fe2 1652 if (!fixed)
8da11c19
PB
1653 fput(file);
1654}
1655
4edf20f9 1656static void io_dismantle_req(struct io_kiocb *req)
2b188cc1 1657{
3ca405eb 1658 io_clean_op(req);
929a3af9 1659
e8c2bc1f
JA
1660 if (req->async_data)
1661 kfree(req->async_data);
8da11c19
PB
1662 if (req->file)
1663 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
269bbe5f
BM
1664 if (req->fixed_rsrc_refs)
1665 percpu_ref_put(req->fixed_rsrc_refs);
003e8dcc
JA
1666 if (req->work.creds) {
1667 put_cred(req->work.creds);
1668 req->work.creds = NULL;
1669 }
f85c310a
PB
1670
1671 if (req->flags & REQ_F_INFLIGHT) {
1672 struct io_ring_ctx *ctx = req->ctx;
f85c310a
PB
1673 unsigned long flags;
1674
1675 spin_lock_irqsave(&ctx->inflight_lock, flags);
1676 list_del(&req->inflight_entry);
1677 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1678 req->flags &= ~REQ_F_INFLIGHT;
f85c310a 1679 }
e65ef56d
JA
1680}
1681
b23fcf47 1682/* must be called somewhat shortly after putting a request */
7c660731
PB
1683static inline void io_put_task(struct task_struct *task, int nr)
1684{
1685 struct io_uring_task *tctx = task->io_uring;
1686
1687 percpu_counter_sub(&tctx->inflight, nr);
1688 if (unlikely(atomic_read(&tctx->in_idle)))
1689 wake_up(&tctx->wait);
1690 put_task_struct_many(task, nr);
1691}
1692
216578e5 1693static void __io_free_req(struct io_kiocb *req)
c6ca97b3 1694{
51a4cc11 1695 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 1696
216578e5 1697 io_dismantle_req(req);
7c660731 1698 io_put_task(req->task, 1);
c6ca97b3 1699
3893f39f 1700 kmem_cache_free(req_cachep, req);
ecfc5177 1701 percpu_ref_put(&ctx->refs);
e65ef56d
JA
1702}
1703
f2f87370
PB
1704static inline void io_remove_next_linked(struct io_kiocb *req)
1705{
1706 struct io_kiocb *nxt = req->link;
1707
1708 req->link = nxt->link;
1709 nxt->link = NULL;
1710}
1711
33cc89a9
PB
1712static bool io_kill_linked_timeout(struct io_kiocb *req)
1713 __must_hold(&req->ctx->completion_lock)
2665abfd 1714{
33cc89a9 1715 struct io_kiocb *link = req->link;
c9abd7ad 1716 bool cancelled = false;
f2f87370 1717
900fad45
PB
1718 /*
1719 * Can happen if a linked timeout fired and the link chain looked like
1720 * req -> link t-out -> link t-out [-> ...]
1721 */
c9abd7ad
PB
1722 if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1723 struct io_timeout_data *io = link->async_data;
1724 int ret;
7c86ffee 1725
f2f87370 1726 io_remove_next_linked(req);
90cd7e42 1727 link->timeout.head = NULL;
c9abd7ad
PB
1728 ret = hrtimer_try_to_cancel(&io->timer);
1729 if (ret != -1) {
1730 io_cqring_fill_event(link, -ECANCELED);
33cc89a9 1731 io_put_req_deferred(link, 1);
c9abd7ad
PB
1732 cancelled = true;
1733 }
1734 }
7c86ffee 1735 req->flags &= ~REQ_F_LINK_TIMEOUT;
33cc89a9 1736 return cancelled;
7c86ffee
PB
1737}
1738
d148ca4b 1739static void io_fail_links(struct io_kiocb *req)
33cc89a9 1740 __must_hold(&req->ctx->completion_lock)
9e645e11 1741{
33cc89a9 1742 struct io_kiocb *nxt, *link = req->link;
9e645e11 1743
f2f87370 1744 req->link = NULL;
f2f87370
PB
1745 while (link) {
1746 nxt = link->link;
1747 link->link = NULL;
2665abfd 1748
f2f87370 1749 trace_io_uring_fail_link(req, link);
7c86ffee 1750 io_cqring_fill_event(link, -ECANCELED);
1575f21a 1751 io_put_req_deferred(link, 2);
f2f87370 1752 link = nxt;
9e645e11 1753 }
33cc89a9 1754}
9e645e11 1755
33cc89a9
PB
1756static bool io_disarm_next(struct io_kiocb *req)
1757 __must_hold(&req->ctx->completion_lock)
1758{
1759 bool posted = false;
1760
1761 if (likely(req->flags & REQ_F_LINK_TIMEOUT))
1762 posted = io_kill_linked_timeout(req);
1763 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1764 posted |= (req->link != NULL);
1765 io_fail_links(req);
1766 }
1767 return posted;
9e645e11
JA
1768}
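
The teardown above (io_kill_linked_timeout()/io_disarm_next()) is the kernel side of a request submitted with a linked timeout attached. A liburing-based sketch of how an application typically arms such a pair; 'ring', 'fd' and 'buf' are assumed to exist already:

#include <liburing.h>

static void submit_read_with_timeout(struct io_uring *ring, int fd,
				     void *buf, unsigned len)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->flags |= IOSQE_IO_LINK;		/* link the next SQE to this read */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);	/* cancels the read if it takes >1s */

	io_uring_submit(ring);			/* ts is consumed during submit */
}
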
1769
3fa5e0f3 1770static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
c69f8dbe 1771{
33cc89a9 1772 struct io_kiocb *nxt;
944e58bf 1773
9e645e11
JA
1774 /*
1775 * If LINK is set, we have dependent requests in this chain. If we
1776 * didn't fail this request, queue the first one up, moving any other
1777 * dependencies to the next request. In case of failure, fail the rest
1778 * of the chain.
1779 */
33cc89a9
PB
1780 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
1781 struct io_ring_ctx *ctx = req->ctx;
1782 unsigned long flags;
1783 bool posted;
1784
1785 spin_lock_irqsave(&ctx->completion_lock, flags);
1786 posted = io_disarm_next(req);
1787 if (posted)
1788 io_commit_cqring(req->ctx);
1789 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1790 if (posted)
1791 io_cqring_ev_posted(ctx);
f2f87370 1792 }
33cc89a9
PB
1793 nxt = req->link;
1794 req->link = NULL;
1795 return nxt;
4d7dd462 1796}
9e645e11 1797
f2f87370 1798static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
3fa5e0f3 1799{
cdbff982 1800 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
3fa5e0f3
PB
1801 return NULL;
1802 return __io_req_find_next(req);
1803}
1804
2c32395d
PB
1805static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1806{
1807 if (!ctx)
1808 return;
1809 if (ctx->submit_state.comp.nr) {
1810 mutex_lock(&ctx->uring_lock);
1811 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1812 mutex_unlock(&ctx->uring_lock);
1813 }
1814 percpu_ref_put(&ctx->refs);
1815}
1816
7cbf1722 1817static bool __tctx_task_work(struct io_uring_task *tctx)
c2c4c83c 1818{
65453d1e 1819 struct io_ring_ctx *ctx = NULL;
7cbf1722
JA
1820 struct io_wq_work_list list;
1821 struct io_wq_work_node *node;
c2c4c83c 1822
7cbf1722
JA
1823 if (wq_list_empty(&tctx->task_list))
1824 return false;
6200b0ae 1825
0b81e80c 1826 spin_lock_irq(&tctx->task_lock);
7cbf1722
JA
1827 list = tctx->task_list;
1828 INIT_WQ_LIST(&tctx->task_list);
0b81e80c 1829 spin_unlock_irq(&tctx->task_lock);
c2c4c83c 1830
7cbf1722
JA
1831 node = list.first;
1832 while (node) {
1833 struct io_wq_work_node *next = node->next;
1834 struct io_kiocb *req;
0ba9c9ed 1835
7cbf1722 1836 req = container_of(node, struct io_kiocb, io_task_work.node);
2c32395d
PB
1837 if (req->ctx != ctx) {
1838 ctx_flush_and_put(ctx);
1839 ctx = req->ctx;
1840 percpu_ref_get(&ctx->refs);
65453d1e 1841 }
65453d1e 1842
2c32395d
PB
1843 req->task_work.func(&req->task_work);
1844 node = next;
7cbf1722
JA
1845 }
1846
2c32395d 1847 ctx_flush_and_put(ctx);
7cbf1722 1848 return list.first != NULL;
c2c4c83c
JA
1849}
1850
7cbf1722 1851static void tctx_task_work(struct callback_head *cb)
c40f6379 1852{
7cbf1722 1853 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
c40f6379 1854
1d5f360d
JA
1855 clear_bit(0, &tctx->task_state);
1856
7cbf1722
JA
1857 while (__tctx_task_work(tctx))
1858 cond_resched();
7cbf1722
JA
1859}
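
__tctx_task_work() above splices the whole pending list out while holding the spinlock and only then runs the callbacks, so the lock is never held across user-supplied work. A minimal userspace sketch of the same splice-then-run pattern, with pthread primitives and illustrative names (work_item, work_queue):

#include <pthread.h>
#include <stddef.h>

struct work_item {
	struct work_item *next;
	void (*func)(struct work_item *);
};

struct work_queue {
	pthread_mutex_t lock;
	struct work_item *head;
};

static int run_pending(struct work_queue *q)
{
	struct work_item *node, *next;
	int did_work;

	pthread_mutex_lock(&q->lock);
	node = q->head;		/* splice out the whole list ...      */
	q->head = NULL;		/* ... leaving an empty one behind    */
	pthread_mutex_unlock(&q->lock);

	did_work = node != NULL;

	/* callbacks run without the lock held, so they may queue more work */
	while (node) {
		next = node->next;
		node->func(node);
		node = next;
	}
	return did_work;
}
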
1860
1861static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
1862 enum task_work_notify_mode notify)
1863{
1864 struct io_uring_task *tctx = tsk->io_uring;
1865 struct io_wq_work_node *node, *prev;
0b81e80c 1866 unsigned long flags;
7cbf1722
JA
1867 int ret;
1868
1869 WARN_ON_ONCE(!tctx);
1870
0b81e80c 1871 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722 1872 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
0b81e80c 1873 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
1874
1875 /* task_work already pending, we're done */
1876 if (test_bit(0, &tctx->task_state) ||
1877 test_and_set_bit(0, &tctx->task_state))
1878 return 0;
1879
1880 if (!task_work_add(tsk, &tctx->task_work, notify))
1881 return 0;
1882
1883 /*
1884 * Slow path - we failed, find and delete the work. If the work is not
1885 * in the list, it got run and we're fine.
1886 */
1887 ret = 0;
0b81e80c 1888 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722
JA
1889 wq_list_for_each(node, prev, &tctx->task_list) {
1890 if (&req->io_task_work.node == node) {
1891 wq_list_del(&tctx->task_list, node, prev);
1892 ret = 1;
1893 break;
1894 }
1895 }
0b81e80c 1896 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
1897 clear_bit(0, &tctx->task_state);
1898 return ret;
1899}
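
The test_bit()-before-test_and_set_bit() pair above is a cheap way to skip the atomic read-modify-write when the "task_work pending" bit is already set. The same idea expressed with C11 atomics (names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool pending;

static bool claim_notification(void)
{
	/* cheap plain read first: if someone already claimed it, back off */
	if (atomic_load_explicit(&pending, memory_order_relaxed))
		return false;
	/* otherwise try to claim it with a single atomic RMW */
	return !atomic_exchange(&pending, true);
}
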
1900
355fb9e2 1901static int io_req_task_work_add(struct io_kiocb *req)
c2c4c83c
JA
1902{
1903 struct task_struct *tsk = req->task;
1904 struct io_ring_ctx *ctx = req->ctx;
91989c70
JA
1905 enum task_work_notify_mode notify;
1906 int ret;
c2c4c83c 1907
6200b0ae
JA
1908 if (tsk->flags & PF_EXITING)
1909 return -ESRCH;
1910
c2c4c83c 1911 /*
0ba9c9ed
JA
1912 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1913 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1914 * processing task_work. There's no reliable way to tell if TWA_RESUME
1915 * will do the job.
c2c4c83c 1916 */
91989c70 1917 notify = TWA_NONE;
355fb9e2 1918 if (!(ctx->flags & IORING_SETUP_SQPOLL))
c2c4c83c
JA
1919 notify = TWA_SIGNAL;
1920
7cbf1722 1921 ret = io_task_work_add(tsk, req, notify);
c2c4c83c
JA
1922 if (!ret)
1923 wake_up_process(tsk);
0ba9c9ed 1924
c2c4c83c
JA
1925 return ret;
1926}
1927
eab30c4d 1928static void io_req_task_work_add_fallback(struct io_kiocb *req,
7cbf1722 1929 task_work_func_t cb)
eab30c4d 1930{
7c25c0d1
JA
1931 struct io_ring_ctx *ctx = req->ctx;
1932 struct callback_head *head;
eab30c4d
PB
1933
1934 init_task_work(&req->task_work, cb);
7c25c0d1
JA
1935 do {
1936 head = READ_ONCE(ctx->exit_task_work);
1937 req->task_work.next = head;
1938 } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
eab30c4d
PB
1939}
1940
c40f6379
JA
1941static void __io_req_task_cancel(struct io_kiocb *req, int error)
1942{
1943 struct io_ring_ctx *ctx = req->ctx;
1944
1945 spin_lock_irq(&ctx->completion_lock);
1946 io_cqring_fill_event(req, error);
1947 io_commit_cqring(ctx);
1948 spin_unlock_irq(&ctx->completion_lock);
1949
1950 io_cqring_ev_posted(ctx);
1951 req_set_fail_links(req);
1952 io_double_put_req(req);
1953}
1954
1955static void io_req_task_cancel(struct callback_head *cb)
1956{
1957 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
87ceb6a6 1958 struct io_ring_ctx *ctx = req->ctx;
c40f6379 1959
792bb6eb 1960 mutex_lock(&ctx->uring_lock);
a3df7698 1961 __io_req_task_cancel(req, req->result);
792bb6eb 1962 mutex_unlock(&ctx->uring_lock);
87ceb6a6 1963 percpu_ref_put(&ctx->refs);
c40f6379
JA
1964}
1965
1966static void __io_req_task_submit(struct io_kiocb *req)
1967{
1968 struct io_ring_ctx *ctx = req->ctx;
1969
04fc6c80 1970 /* ctx stays valid until unlock, even if we drop all our ctx->refs */
81b6d05c 1971 mutex_lock(&ctx->uring_lock);
70aacfe6 1972 if (!(current->flags & PF_EXITING) && !current->in_execve)
c5eef2b9 1973 __io_queue_sqe(req);
81b6d05c 1974 else
c40f6379 1975 __io_req_task_cancel(req, -EFAULT);
81b6d05c 1976 mutex_unlock(&ctx->uring_lock);
c40f6379
JA
1977}
1978
1979static void io_req_task_submit(struct callback_head *cb)
1980{
1981 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1982
1983 __io_req_task_submit(req);
1984}
1985
1986static void io_req_task_queue(struct io_kiocb *req)
1987{
c40f6379
JA
1988 int ret;
1989
7cbf1722 1990 req->task_work.func = io_req_task_submit;
355fb9e2 1991 ret = io_req_task_work_add(req);
c40f6379 1992 if (unlikely(ret)) {
a3df7698 1993 req->result = -ECANCELED;
04fc6c80 1994 percpu_ref_get(&req->ctx->refs);
eab30c4d 1995 io_req_task_work_add_fallback(req, io_req_task_cancel);
c40f6379 1996 }
c40f6379
JA
1997}
1998
a3df7698
PB
1999static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2000{
2001 percpu_ref_get(&req->ctx->refs);
2002 req->result = ret;
2003 req->task_work.func = io_req_task_cancel;
2004
2005 if (unlikely(io_req_task_work_add(req)))
2006 io_req_task_work_add_fallback(req, io_req_task_cancel);
2007}
2008
f2f87370 2009static inline void io_queue_next(struct io_kiocb *req)
c69f8dbe 2010{
9b5f7bd9 2011 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
2012
2013 if (nxt)
906a8c3f 2014 io_req_task_queue(nxt);
c69f8dbe
JL
2015}
2016
c3524383 2017static void io_free_req(struct io_kiocb *req)
7a743e22 2018{
c3524383
PB
2019 io_queue_next(req);
2020 __io_free_req(req);
2021}
8766dd51 2022
2d6500d4 2023struct req_batch {
5af1d13e
PB
2024 struct task_struct *task;
2025 int task_refs;
1b4c351f 2026 int ctx_refs;
2d6500d4
PB
2027};
2028
5af1d13e
PB
2029static inline void io_init_req_batch(struct req_batch *rb)
2030{
5af1d13e 2031 rb->task_refs = 0;
9ae72463 2032 rb->ctx_refs = 0;
5af1d13e
PB
2033 rb->task = NULL;
2034}
2035
2d6500d4
PB
2036static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2037 struct req_batch *rb)
2038{
6e833d53 2039 if (rb->task)
7c660731 2040 io_put_task(rb->task, rb->task_refs);
9ae72463
PB
2041 if (rb->ctx_refs)
2042 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2d6500d4
PB
2043}
2044
6ff119a6
PB
2045static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2046 struct io_submit_state *state)
2d6500d4 2047{
f2f87370 2048 io_queue_next(req);
2d6500d4 2049
e3bc8e9d 2050 if (req->task != rb->task) {
7c660731
PB
2051 if (rb->task)
2052 io_put_task(rb->task, rb->task_refs);
e3bc8e9d
JA
2053 rb->task = req->task;
2054 rb->task_refs = 0;
5af1d13e 2055 }
e3bc8e9d 2056 rb->task_refs++;
9ae72463 2057 rb->ctx_refs++;
5af1d13e 2058
4edf20f9 2059 io_dismantle_req(req);
bd759045 2060 if (state->free_reqs != ARRAY_SIZE(state->reqs))
6ff119a6 2061 state->reqs[state->free_reqs++] = req;
bd759045
PB
2062 else
2063 list_add(&req->compl.list, &state->comp.free_list);
7a743e22
PB
2064}
2065
905c172f
PB
2066static void io_submit_flush_completions(struct io_comp_state *cs,
2067 struct io_ring_ctx *ctx)
2068{
2069 int i, nr = cs->nr;
2070 struct io_kiocb *req;
2071 struct req_batch rb;
2072
2073 io_init_req_batch(&rb);
2074 spin_lock_irq(&ctx->completion_lock);
2075 for (i = 0; i < nr; i++) {
2076 req = cs->reqs[i];
2077 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2078 }
2079 io_commit_cqring(ctx);
2080 spin_unlock_irq(&ctx->completion_lock);
2081
2082 io_cqring_ev_posted(ctx);
2083 for (i = 0; i < nr; i++) {
2084 req = cs->reqs[i];
2085
2086 /* submission and completion refs */
2087 if (refcount_sub_and_test(2, &req->refs))
6ff119a6 2088 io_req_free_batch(&rb, req, &ctx->submit_state);
905c172f
PB
2089 }
2090
2091 io_req_free_batch_finish(ctx, &rb);
2092 cs->nr = 0;
7a743e22
PB
2093}
2094
ba816ad6
JA
2095/*
2096 * Drop reference to request, return next in chain (if there is one) if this
2097 * was the last reference to this request.
2098 */
9b5f7bd9 2099static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2100{
9b5f7bd9
PB
2101 struct io_kiocb *nxt = NULL;
2102
2a44f467 2103 if (refcount_dec_and_test(&req->refs)) {
9b5f7bd9 2104 nxt = io_req_find_next(req);
4d7dd462 2105 __io_free_req(req);
2a44f467 2106 }
9b5f7bd9 2107 return nxt;
2b188cc1
JA
2108}
2109
e65ef56d
JA
2110static void io_put_req(struct io_kiocb *req)
2111{
2112 if (refcount_dec_and_test(&req->refs))
2113 io_free_req(req);
2b188cc1
JA
2114}
2115
216578e5
PB
2116static void io_put_req_deferred_cb(struct callback_head *cb)
2117{
2118 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2119
2120 io_free_req(req);
2121}
2122
2123static void io_free_req_deferred(struct io_kiocb *req)
2124{
2125 int ret;
2126
7cbf1722 2127 req->task_work.func = io_put_req_deferred_cb;
355fb9e2 2128 ret = io_req_task_work_add(req);
eab30c4d
PB
2129 if (unlikely(ret))
2130 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
216578e5
PB
2131}
2132
2133static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2134{
2135 if (refcount_sub_and_test(refs, &req->refs))
2136 io_free_req_deferred(req);
2137}
2138
978db57e
JA
2139static void io_double_put_req(struct io_kiocb *req)
2140{
2141 /* drop both submit and complete references */
2142 if (refcount_sub_and_test(2, &req->refs))
2143 io_free_req(req);
2144}
2145
6c503150 2146static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2147{
2148 /* See comment at the top of this file */
2149 smp_rmb();
e23de15f 2150 return __io_cqring_events(ctx);
a3a0e43f
JA
2151}
2152
fb5ccc98
PB
2153static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2154{
2155 struct io_rings *rings = ctx->rings;
2156
2157 /* make sure SQ entry isn't read before tail */
2158 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2159}
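
io_sqring_entries() reads the SQ tail with acquire semantics; the matching release is the store the application performs when it publishes newly written SQEs. A simplified userspace-side sketch of that publish step using C11 atomics; the ring layout here is illustrative and real applications should rely on liburing:

#include <stdatomic.h>

struct sq_shadow {
	_Atomic unsigned *ktail;	/* points at the mapped SQ ring tail      */
	unsigned sqe_tail;		/* local tail, advanced as SQEs are filled */
};

/* publish all SQEs written so far: the release store orders the SQE
 * writes before the new tail value that the kernel will acquire-load */
static void sq_publish(struct sq_shadow *sq)
{
	atomic_store_explicit(sq->ktail, sq->sqe_tail, memory_order_release);
}
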
2160
8ff069bf 2161static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
e94f141b 2162{
8ff069bf 2163 unsigned int cflags;
e94f141b 2164
bcda7baa
JA
2165 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2166 cflags |= IORING_CQE_F_BUFFER;
0e1b6fe3 2167 req->flags &= ~REQ_F_BUFFER_SELECTED;
bcda7baa
JA
2168 kfree(kbuf);
2169 return cflags;
e94f141b
JA
2170}
2171
8ff069bf 2172static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
bcda7baa 2173{
4d954c25 2174 struct io_buffer *kbuf;
bcda7baa 2175
4d954c25 2176 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
8ff069bf
PB
2177 return io_put_kbuf(req, kbuf);
2178}
2179
4c6e277c
JA
2180static inline bool io_run_task_work(void)
2181{
6200b0ae
JA
2182 /*
2183 * Not safe to run on exiting task, and the task_work handling will
2184 * not add work to such a task.
2185 */
2186 if (unlikely(current->flags & PF_EXITING))
2187 return false;
4c6e277c
JA
2188 if (current->task_works) {
2189 __set_current_state(TASK_RUNNING);
2190 task_work_run();
2191 return true;
2192 }
2193
2194 return false;
bcda7baa
JA
2195}
2196
def596e9
JA
2197/*
2198 * Find and free completed poll iocbs
2199 */
2200static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2201 struct list_head *done)
2202{
8237e045 2203 struct req_batch rb;
def596e9 2204 struct io_kiocb *req;
bbde017a
XW
2205
2206 /* order with ->result store in io_complete_rw_iopoll() */
2207 smp_rmb();
def596e9 2208
5af1d13e 2209 io_init_req_batch(&rb);
def596e9 2210 while (!list_empty(done)) {
bcda7baa
JA
2211 int cflags = 0;
2212
d21ffe7e 2213 req = list_first_entry(done, struct io_kiocb, inflight_entry);
f161340d
PB
2214 list_del(&req->inflight_entry);
2215
bbde017a
XW
2216 if (READ_ONCE(req->result) == -EAGAIN) {
2217 req->iopoll_completed = 0;
23faba36 2218 if (io_rw_reissue(req))
f161340d 2219 continue;
bbde017a 2220 }
def596e9 2221
bcda7baa 2222 if (req->flags & REQ_F_BUFFER_SELECTED)
8ff069bf 2223 cflags = io_put_rw_kbuf(req);
bcda7baa
JA
2224
2225 __io_cqring_fill_event(req, req->result, cflags);
def596e9
JA
2226 (*nr_events)++;
2227
c3524383 2228 if (refcount_dec_and_test(&req->refs))
6ff119a6 2229 io_req_free_batch(&rb, req, &ctx->submit_state);
def596e9 2230 }
def596e9 2231
09bb8394 2232 io_commit_cqring(ctx);
80c18e4a 2233 io_cqring_ev_posted_iopoll(ctx);
2d6500d4 2234 io_req_free_batch_finish(ctx, &rb);
581f9810
BM
2235}
2236
def596e9
JA
2237static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2238 long min)
2239{
2240 struct io_kiocb *req, *tmp;
2241 LIST_HEAD(done);
2242 bool spin;
2243 int ret;
2244
2245 /*
2246 * Only spin for completions if we don't have multiple devices hanging
2247 * off our complete list, and we're under the requested amount.
2248 */
2249 spin = !ctx->poll_multi_file && *nr_events < min;
2250
2251 ret = 0;
d21ffe7e 2252 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
9adbd45d 2253 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
2254
2255 /*
581f9810
BM
2256 * Move completed and retryable entries to our local lists.
2257 * If we find a request that requires polling, break out
2258 * and complete those lists first, if we have entries there.
def596e9 2259 */
65a6543d 2260 if (READ_ONCE(req->iopoll_completed)) {
d21ffe7e 2261 list_move_tail(&req->inflight_entry, &done);
def596e9
JA
2262 continue;
2263 }
2264 if (!list_empty(&done))
2265 break;
2266
2267 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2268 if (ret < 0)
2269 break;
2270
3aadc23e
PB
2271 /* iopoll may have completed current req */
2272 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2273 list_move_tail(&req->inflight_entry, &done);
3aadc23e 2274
def596e9
JA
2275 if (ret && spin)
2276 spin = false;
2277 ret = 0;
2278 }
2279
2280 if (!list_empty(&done))
2281 io_iopoll_complete(ctx, nr_events, &done);
2282
2283 return ret;
2284}
2285
2286/*
d195a66e 2287 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
2288 * non-spinning poll check - we'll still enter the driver poll loop, but only
2289 * as a non-spinning completion check.
2290 */
2291static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2292 long min)
2293{
540e32a0 2294 while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
def596e9
JA
2295 int ret;
2296
2297 ret = io_do_iopoll(ctx, nr_events, min);
2298 if (ret < 0)
2299 return ret;
eba0a4dd 2300 if (*nr_events >= min)
def596e9
JA
2301 return 0;
2302 }
2303
2304 return 1;
2305}
2306
2307/*
2308 * We can't just wait for polled events to come to us, we have to actively
2309 * find and complete them.
2310 */
b2edc0a7 2311static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2312{
2313 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2314 return;
2315
2316 mutex_lock(&ctx->uring_lock);
540e32a0 2317 while (!list_empty(&ctx->iopoll_list)) {
def596e9
JA
2318 unsigned int nr_events = 0;
2319
b2edc0a7 2320 io_do_iopoll(ctx, &nr_events, 0);
08f5439f 2321
b2edc0a7
PB
2322 /* let it sleep and repeat later if can't complete a request */
2323 if (nr_events == 0)
2324 break;
08f5439f
JA
2325 /*
2326 * Ensure we allow local-to-the-cpu processing to take place,
2327 * in this case we need to ensure that we reap all events.
3fcee5a6 2328 * Also let task_work, etc. progress by releasing the mutex
08f5439f 2329 */
3fcee5a6
PB
2330 if (need_resched()) {
2331 mutex_unlock(&ctx->uring_lock);
2332 cond_resched();
2333 mutex_lock(&ctx->uring_lock);
2334 }
def596e9
JA
2335 }
2336 mutex_unlock(&ctx->uring_lock);
2337}
2338
7668b92a 2339static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2340{
7668b92a 2341 unsigned int nr_events = 0;
2b2ed975 2342 int iters = 0, ret = 0;
500f9fba 2343
c7849be9
XW
2344 /*
2345 * We disallow the app entering submit/complete with polling, but we
2346 * still need to lock the ring to prevent racing with polled issue
2347 * that got punted to a workqueue.
2348 */
2349 mutex_lock(&ctx->uring_lock);
def596e9 2350 do {
a3a0e43f
JA
2351 /*
2352 * Don't enter poll loop if we already have events pending.
2353 * If we do, we can potentially be spinning for commands that
2354 * already triggered a CQE (eg in error).
2355 */
6c503150
PB
2356 if (test_bit(0, &ctx->cq_check_overflow))
2357 __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2358 if (io_cqring_events(ctx))
a3a0e43f
JA
2359 break;
2360
500f9fba
JA
2361 /*
2362 * If a submit got punted to a workqueue, we can have the
2363 * application entering polling for a command before it gets
2364 * issued. That app will hold the uring_lock for the duration
2365 * of the poll right here, so we need to take a breather every
2366 * now and then to ensure that the issue has a chance to add
2367 * the poll to the issued list. Otherwise we can spin here
2368 * forever, while the workqueue is stuck trying to acquire the
2369 * very same mutex.
2370 */
2371 if (!(++iters & 7)) {
2372 mutex_unlock(&ctx->uring_lock);
4c6e277c 2373 io_run_task_work();
500f9fba
JA
2374 mutex_lock(&ctx->uring_lock);
2375 }
2376
7668b92a 2377 ret = io_iopoll_getevents(ctx, &nr_events, min);
def596e9
JA
2378 if (ret <= 0)
2379 break;
2380 ret = 0;
7668b92a 2381 } while (min && !nr_events && !need_resched());
def596e9 2382
500f9fba 2383 mutex_unlock(&ctx->uring_lock);
def596e9
JA
2384 return ret;
2385}
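
The io_iopoll_*() helpers above are only exercised when the ring was created with IORING_SETUP_IOPOLL and the I/O goes to a file that supports ->iopoll() (typically O_DIRECT on a suitable block device). A hedged liburing sketch of the userspace side, with error handling and cleanup trimmed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <liburing.h>

static int iopoll_read(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd, ret;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL))
		return -1;
	fd = open(path, O_RDONLY | O_DIRECT);	/* IOPOLL wants O_DIRECT I/O */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return -1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	/* waiting for the CQE is what drives the kernel's polling loop */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret)
		io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return ret;
}
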
2386
491381ce 2387static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2388{
491381ce
JA
2389 /*
2390 * Tell lockdep we inherited freeze protection from submission
2391 * thread.
2392 */
2393 if (req->flags & REQ_F_ISREG) {
2394 struct inode *inode = file_inode(req->file);
2b188cc1 2395
491381ce 2396 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 2397 }
491381ce 2398 file_end_write(req->file);
2b188cc1
JA
2399}
2400
b63534c4 2401#ifdef CONFIG_BLOCK
dc2a6e9a 2402static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4
JA
2403{
2404 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
4a245479 2405 int rw, ret;
b63534c4 2406 struct iov_iter iter;
b63534c4 2407
dc2a6e9a
PB
2408 /* already prepared */
2409 if (req->async_data)
2410 return true;
b63534c4
JA
2411
2412 switch (req->opcode) {
2413 case IORING_OP_READV:
2414 case IORING_OP_READ_FIXED:
2415 case IORING_OP_READ:
2416 rw = READ;
2417 break;
2418 case IORING_OP_WRITEV:
2419 case IORING_OP_WRITE_FIXED:
2420 case IORING_OP_WRITE:
2421 rw = WRITE;
2422 break;
2423 default:
2424 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2425 req->opcode);
dc2a6e9a 2426 return false;
b63534c4
JA
2427 }
2428
dc2a6e9a
PB
2429 ret = io_import_iovec(rw, req, &iovec, &iter, false);
2430 if (ret < 0)
2431 return false;
6bf985dc 2432 return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
b63534c4 2433}
b63534c4 2434
3e6a0d3c 2435static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2436{
355afaeb 2437 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2438 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2439
355afaeb
JA
2440 if (!S_ISBLK(mode) && !S_ISREG(mode))
2441 return false;
3e6a0d3c
JA
2442 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2443 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2444 return false;
7c977a58
JA
2445 /*
2446 * If ref is dying, we might be running poll reap from the exit work.
2447 * Don't attempt to reissue from that path, just let it fail with
2448 * -EAGAIN.
2449 */
3e6a0d3c
JA
2450 if (percpu_ref_is_dying(&ctx->refs))
2451 return false;
2452 return true;
2453}
2454#endif
2455
2456static bool io_rw_reissue(struct io_kiocb *req)
2457{
2458#ifdef CONFIG_BLOCK
2459 if (!io_rw_should_reissue(req))
7c977a58 2460 return false;
b63534c4 2461
55e6ac1e
PB
2462 lockdep_assert_held(&req->ctx->uring_lock);
2463
37d1e2e3 2464 if (io_resubmit_prep(req)) {
fdee946d
JA
2465 refcount_inc(&req->refs);
2466 io_queue_async_work(req);
b63534c4 2467 return true;
fdee946d 2468 }
dc2a6e9a 2469 req_set_fail_links(req);
b63534c4
JA
2470#endif
2471 return false;
2472}
2473
a1d7c393 2474static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
889fca73 2475 unsigned int issue_flags)
a1d7c393 2476{
2f8e45f1
PB
2477 int cflags = 0;
2478
23faba36
PB
2479 if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2480 return;
2f8e45f1
PB
2481 if (res != req->result)
2482 req_set_fail_links(req);
23faba36 2483
2f8e45f1
PB
2484 if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2485 kiocb_end_write(req);
2486 if (req->flags & REQ_F_BUFFER_SELECTED)
2487 cflags = io_put_rw_kbuf(req);
2488 __io_req_complete(req, issue_flags, res, cflags);
ba816ad6
JA
2489}
2490
2491static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2492{
9adbd45d 2493 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 2494
889fca73 2495 __io_complete_rw(req, res, res2, 0);
2b188cc1
JA
2496}
2497
def596e9
JA
2498static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2499{
9adbd45d 2500 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 2501
3e6a0d3c
JA
2502#ifdef CONFIG_BLOCK
2503 /* Rewind iter, if we have one. iopoll path resubmits as usual */
2504 if (res == -EAGAIN && io_rw_should_reissue(req)) {
2505 struct io_async_rw *rw = req->async_data;
2506
2507 if (rw)
2508 iov_iter_revert(&rw->iter,
2509 req->result - iov_iter_count(&rw->iter));
2510 else if (!io_resubmit_prep(req))
2511 res = -EIO;
2512 }
2513#endif
2514
491381ce
JA
2515 if (kiocb->ki_flags & IOCB_WRITE)
2516 kiocb_end_write(req);
def596e9 2517
2d7d6792 2518 if (res != -EAGAIN && res != req->result)
4e88d6e7 2519 req_set_fail_links(req);
bbde017a
XW
2520
2521 WRITE_ONCE(req->result, res);
2522 /* order with io_poll_complete() checking ->result */
cd664b0e
PB
2523 smp_wmb();
2524 WRITE_ONCE(req->iopoll_completed, 1);
def596e9
JA
2525}
2526
2527/*
2528 * After the iocb has been issued, it's safe to be found on the poll list.
2529 * Adding the kiocb to the list AFTER submission ensures that we don't
2530 * find it from a io_iopoll_getevents() thread before the issuer is done
2531 * accessing the kiocb cookie.
2532 */
2e9dbe90 2533static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
def596e9
JA
2534{
2535 struct io_ring_ctx *ctx = req->ctx;
2536
2537 /*
2538 * Track whether we have multiple files in our lists. This will impact
2539 * how we do polling eventually, not spinning if we're on potentially
2540 * different devices.
2541 */
540e32a0 2542 if (list_empty(&ctx->iopoll_list)) {
def596e9
JA
2543 ctx->poll_multi_file = false;
2544 } else if (!ctx->poll_multi_file) {
2545 struct io_kiocb *list_req;
2546
540e32a0 2547 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
d21ffe7e 2548 inflight_entry);
9adbd45d 2549 if (list_req->file != req->file)
def596e9
JA
2550 ctx->poll_multi_file = true;
2551 }
2552
2553 /*
2554 * For fast devices, IO may have already completed. If it has, add
2555 * it to the front so we find it first.
2556 */
65a6543d 2557 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2558 list_add(&req->inflight_entry, &ctx->iopoll_list);
def596e9 2559 else
d21ffe7e 2560 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
bdcd3eab 2561
2e9dbe90
XW
2562 /*
2563 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2564 * task context or in io worker task context. If current task context is
2565 * sq thread, we don't need to check whether we should wake up the sq thread.
2566 */
2567 if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
534ca6d6
JA
2568 wq_has_sleeper(&ctx->sq_data->wait))
2569 wake_up(&ctx->sq_data->wait);
def596e9
JA
2570}
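
The wake_up() on sq_data->wait above is what an idle SQPOLL thread sleeps on. The application's counterpart is to check the NEED_WAKEUP flag after publishing new SQEs and only then call io_uring_enter() with IORING_ENTER_SQ_WAKEUP. A raw-syscall sketch; it assumes kernel headers that define __NR_io_uring_enter, and 'sq_flags' is the mmap'ed SQ ring flags word set up elsewhere:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static void sqpoll_submit(int ring_fd, unsigned *sq_flags, unsigned to_submit)
{
	/* pairs with the kernel's store to the SQ ring flags word */
	if (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
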
2571
9f13c35b
PB
2572static inline void io_state_file_put(struct io_submit_state *state)
2573{
02b23a9a
PB
2574 if (state->file_refs) {
2575 fput_many(state->file, state->file_refs);
2576 state->file_refs = 0;
2577 }
9a56a232
JA
2578}
2579
2580/*
2581 * Get as many references to a file as we have IOs left in this submission,
2582 * assuming most submissions are for one file, or at least that each file
2583 * has more than one submission.
2584 */
8da11c19 2585static struct file *__io_file_get(struct io_submit_state *state, int fd)
9a56a232
JA
2586{
2587 if (!state)
2588 return fget(fd);
2589
6e1271e6 2590 if (state->file_refs) {
9a56a232 2591 if (state->fd == fd) {
6e1271e6 2592 state->file_refs--;
9a56a232
JA
2593 return state->file;
2594 }
02b23a9a 2595 io_state_file_put(state);
9a56a232
JA
2596 }
2597 state->file = fget_many(fd, state->ios_left);
6e1271e6 2598 if (unlikely(!state->file))
9a56a232
JA
2599 return NULL;
2600
2601 state->fd = fd;
6e1271e6 2602 state->file_refs = state->ios_left - 1;
9a56a232
JA
2603 return state->file;
2604}
2605
4503b767
JA
2606static bool io_bdev_nowait(struct block_device *bdev)
2607{
9ba0d0c8 2608 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
2609}
2610
2b188cc1
JA
2611/*
2612 * If we tracked the file through the SCM inflight mechanism, we could support
2613 * any file. For now, just ensure that anything potentially problematic is done
2614 * inline.
2615 */
af197f50 2616static bool io_file_supports_async(struct file *file, int rw)
2b188cc1
JA
2617{
2618 umode_t mode = file_inode(file)->i_mode;
2619
4503b767 2620 if (S_ISBLK(mode)) {
4e7b5671
CH
2621 if (IS_ENABLED(CONFIG_BLOCK) &&
2622 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
2623 return true;
2624 return false;
2625 }
2626 if (S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1 2627 return true;
4503b767 2628 if (S_ISREG(mode)) {
4e7b5671
CH
2629 if (IS_ENABLED(CONFIG_BLOCK) &&
2630 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
2631 file->f_op != &io_uring_fops)
2632 return true;
2633 return false;
2634 }
2b188cc1 2635
c5b85625
JA
2636 /* any ->read/write should understand O_NONBLOCK */
2637 if (file->f_flags & O_NONBLOCK)
2638 return true;
2639
af197f50
JA
2640 if (!(file->f_mode & FMODE_NOWAIT))
2641 return false;
2642
2643 if (rw == READ)
2644 return file->f_op->read_iter != NULL;
2645
2646 return file->f_op->write_iter != NULL;
2b188cc1
JA
2647}
2648
a88fc400 2649static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 2650{
def596e9 2651 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 2652 struct kiocb *kiocb = &req->rw.kiocb;
75c668cd 2653 struct file *file = req->file;
09bb8394
JA
2654 unsigned ioprio;
2655 int ret;
2b188cc1 2656
75c668cd 2657 if (S_ISREG(file_inode(file)->i_mode))
491381ce
JA
2658 req->flags |= REQ_F_ISREG;
2659
2b188cc1 2660 kiocb->ki_pos = READ_ONCE(sqe->off);
75c668cd 2661 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
ba04291e 2662 req->flags |= REQ_F_CUR_POS;
75c668cd 2663 kiocb->ki_pos = file->f_pos;
ba04291e 2664 }
2b188cc1 2665 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
2666 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2667 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2668 if (unlikely(ret))
2669 return ret;
2b188cc1 2670
75c668cd
PB
2671 /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2672 if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2673 req->flags |= REQ_F_NOWAIT;
2674
2b188cc1
JA
2675 ioprio = READ_ONCE(sqe->ioprio);
2676 if (ioprio) {
2677 ret = ioprio_check_cap(ioprio);
2678 if (ret)
09bb8394 2679 return ret;
2b188cc1
JA
2680
2681 kiocb->ki_ioprio = ioprio;
2682 } else
2683 kiocb->ki_ioprio = get_current_ioprio();
2684
def596e9 2685 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2686 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2687 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2688 return -EOPNOTSUPP;
2b188cc1 2689
def596e9
JA
2690 kiocb->ki_flags |= IOCB_HIPRI;
2691 kiocb->ki_complete = io_complete_rw_iopoll;
65a6543d 2692 req->iopoll_completed = 0;
def596e9 2693 } else {
09bb8394
JA
2694 if (kiocb->ki_flags & IOCB_HIPRI)
2695 return -EINVAL;
def596e9
JA
2696 kiocb->ki_complete = io_complete_rw;
2697 }
9adbd45d 2698
3529d8c2
JA
2699 req->rw.addr = READ_ONCE(sqe->addr);
2700 req->rw.len = READ_ONCE(sqe->len);
4f4eeba8 2701 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 2702 return 0;
2b188cc1
JA
2703}
2704
2705static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2706{
2707 switch (ret) {
2708 case -EIOCBQUEUED:
2709 break;
2710 case -ERESTARTSYS:
2711 case -ERESTARTNOINTR:
2712 case -ERESTARTNOHAND:
2713 case -ERESTART_RESTARTBLOCK:
2714 /*
2715 * We can't just restart the syscall, since previously
2716 * submitted sqes may already be in progress. Just fail this
2717 * IO with EINTR.
2718 */
2719 ret = -EINTR;
df561f66 2720 fallthrough;
2b188cc1
JA
2721 default:
2722 kiocb->ki_complete(kiocb, ret, 0);
2723 }
2724}
2725
a1d7c393 2726static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
889fca73 2727 unsigned int issue_flags)
ba816ad6 2728{
ba04291e 2729 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
e8c2bc1f 2730 struct io_async_rw *io = req->async_data;
ba04291e 2731
227c0c96 2732 /* add previously done IO, if any */
e8c2bc1f 2733 if (io && io->bytes_done > 0) {
227c0c96 2734 if (ret < 0)
e8c2bc1f 2735 ret = io->bytes_done;
227c0c96 2736 else
e8c2bc1f 2737 ret += io->bytes_done;
227c0c96
JA
2738 }
2739
ba04291e
JA
2740 if (req->flags & REQ_F_CUR_POS)
2741 req->file->f_pos = kiocb->ki_pos;
bcaec089 2742 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
889fca73 2743 __io_complete_rw(req, ret, 0, issue_flags);
ba816ad6
JA
2744 else
2745 io_rw_done(kiocb, ret);
2746}
2747
847595de 2748static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
edafccee 2749{
9adbd45d
JA
2750 struct io_ring_ctx *ctx = req->ctx;
2751 size_t len = req->rw.len;
edafccee 2752 struct io_mapped_ubuf *imu;
4be1c615 2753 u16 index, buf_index = req->buf_index;
edafccee
JA
2754 size_t offset;
2755 u64 buf_addr;
2756
edafccee
JA
2757 if (unlikely(buf_index >= ctx->nr_user_bufs))
2758 return -EFAULT;
edafccee
JA
2759 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2760 imu = &ctx->user_bufs[index];
9adbd45d 2761 buf_addr = req->rw.addr;
edafccee
JA
2762
2763 /* overflow */
2764 if (buf_addr + len < buf_addr)
2765 return -EFAULT;
2766 /* not inside the mapped region */
2767 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2768 return -EFAULT;
2769
2770 /*
2771 * May not be a start of buffer, set size appropriately
2772 * and advance us to the beginning.
2773 */
2774 offset = buf_addr - imu->ubuf;
2775 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2776
2777 if (offset) {
2778 /*
2779 * Don't use iov_iter_advance() here, as it's really slow for
2780 * using the latter parts of a big fixed buffer - it iterates
2781 * over each segment manually. We can cheat a bit here, because
2782 * we know that:
2783 *
2784 * 1) it's a BVEC iter, we set it up
2785 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2786 * first and last bvec
2787 *
2788 * So just find our index, and adjust the iterator afterwards.
2789 * If the offset is within the first bvec (or the whole first
2790 * bvec), just use iov_iter_advance(). This makes it easier
2791 * since we can just skip the first segment, which may not
2792 * be PAGE_SIZE aligned.
2793 */
2794 const struct bio_vec *bvec = imu->bvec;
2795
2796 if (offset <= bvec->bv_len) {
2797 iov_iter_advance(iter, offset);
2798 } else {
2799 unsigned long seg_skip;
2800
2801 /* skip first vec */
2802 offset -= bvec->bv_len;
2803 seg_skip = 1 + (offset >> PAGE_SHIFT);
2804
2805 iter->bvec = bvec + seg_skip;
2806 iter->nr_segs -= seg_skip;
99c79f66 2807 iter->count -= bvec->bv_len + offset;
bd11b3a3 2808 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2809 }
2810 }
2811
847595de 2812 return 0;
edafccee
JA
2813}
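
The buf_index that io_import_fixed() resolves above refers to a buffer the application registered once, up front. A minimal liburing sketch of that registration and a READ_FIXED that uses index 0 (error handling trimmed):

#include <sys/uio.h>
#include <liburing.h>

static int read_fixed_example(struct io_uring *ring, int fd)
{
	static char buffer[4096];
	struct iovec iov = { .iov_base = buffer, .iov_len = sizeof(buffer) };
	struct io_uring_sqe *sqe;

	/* pin the buffer once; buf_index 0 refers to it from now on */
	if (io_uring_register_buffers(ring, &iov, 1))
		return -1;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read_fixed(sqe, fd, buffer, sizeof(buffer), 0, 0);
	return io_uring_submit(ring);
}
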
2814
bcda7baa
JA
2815static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2816{
2817 if (needs_lock)
2818 mutex_unlock(&ctx->uring_lock);
2819}
2820
2821static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2822{
2823 /*
2824 * "Normal" inline submissions always hold the uring_lock, since we
2825 * grab it from the system call. Same is true for the SQPOLL offload.
2826 * The only exception is when we've detached the request and issue it
2827 * from an async worker thread, grab the lock for that case.
2828 */
2829 if (needs_lock)
2830 mutex_lock(&ctx->uring_lock);
2831}
2832
2833static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2834 int bgid, struct io_buffer *kbuf,
2835 bool needs_lock)
2836{
2837 struct io_buffer *head;
2838
2839 if (req->flags & REQ_F_BUFFER_SELECTED)
2840 return kbuf;
2841
2842 io_ring_submit_lock(req->ctx, needs_lock);
2843
2844 lockdep_assert_held(&req->ctx->uring_lock);
2845
2846 head = idr_find(&req->ctx->io_buffer_idr, bgid);
2847 if (head) {
2848 if (!list_empty(&head->list)) {
2849 kbuf = list_last_entry(&head->list, struct io_buffer,
2850 list);
2851 list_del(&kbuf->list);
2852 } else {
2853 kbuf = head;
2854 idr_remove(&req->ctx->io_buffer_idr, bgid);
2855 }
2856 if (*len > kbuf->len)
2857 *len = kbuf->len;
2858 } else {
2859 kbuf = ERR_PTR(-ENOBUFS);
2860 }
2861
2862 io_ring_submit_unlock(req->ctx, needs_lock);
2863
2864 return kbuf;
2865}
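
The buffer group that io_buffer_select() picks from is filled by the application with an IORING_OP_PROVIDE_BUFFERS request. A rough liburing sketch of providing four buffers to group 7 and issuing a read that lets the kernel choose one; the link flag is only there to keep the two SQEs ordered, and the group/size numbers are illustrative:

#include <liburing.h>

static int provide_and_read(struct io_uring *ring, int fd, void *pool)
{
	struct io_uring_sqe *sqe;

	/* hand four 4096-byte buffers (carved out of 'pool') to group 7, bids 0..3 */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_provide_buffers(sqe, pool, 4096, 4, 7, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* make sure the read runs after this */

	/* read with BUFFER_SELECT: the kernel picks a buffer from group 7
	 * and reports the chosen bid via IORING_CQE_F_BUFFER in cqe->flags */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, NULL, 4096, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 7;

	return io_uring_submit(ring);
}
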
2866
4d954c25
JA
2867static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2868 bool needs_lock)
2869{
2870 struct io_buffer *kbuf;
4f4eeba8 2871 u16 bgid;
4d954c25
JA
2872
2873 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
4f4eeba8 2874 bgid = req->buf_index;
4d954c25
JA
2875 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2876 if (IS_ERR(kbuf))
2877 return kbuf;
2878 req->rw.addr = (u64) (unsigned long) kbuf;
2879 req->flags |= REQ_F_BUFFER_SELECTED;
2880 return u64_to_user_ptr(kbuf->addr);
2881}
2882
2883#ifdef CONFIG_COMPAT
2884static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2885 bool needs_lock)
2886{
2887 struct compat_iovec __user *uiov;
2888 compat_ssize_t clen;
2889 void __user *buf;
2890 ssize_t len;
2891
2892 uiov = u64_to_user_ptr(req->rw.addr);
2893 if (!access_ok(uiov, sizeof(*uiov)))
2894 return -EFAULT;
2895 if (__get_user(clen, &uiov->iov_len))
2896 return -EFAULT;
2897 if (clen < 0)
2898 return -EINVAL;
2899
2900 len = clen;
2901 buf = io_rw_buffer_select(req, &len, needs_lock);
2902 if (IS_ERR(buf))
2903 return PTR_ERR(buf);
2904 iov[0].iov_base = buf;
2905 iov[0].iov_len = (compat_size_t) len;
2906 return 0;
2907}
2908#endif
2909
2910static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2911 bool needs_lock)
2912{
2913 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2914 void __user *buf;
2915 ssize_t len;
2916
2917 if (copy_from_user(iov, uiov, sizeof(*uiov)))
2918 return -EFAULT;
2919
2920 len = iov[0].iov_len;
2921 if (len < 0)
2922 return -EINVAL;
2923 buf = io_rw_buffer_select(req, &len, needs_lock);
2924 if (IS_ERR(buf))
2925 return PTR_ERR(buf);
2926 iov[0].iov_base = buf;
2927 iov[0].iov_len = len;
2928 return 0;
2929}
2930
2931static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2932 bool needs_lock)
2933{
dddb3e26
JA
2934 if (req->flags & REQ_F_BUFFER_SELECTED) {
2935 struct io_buffer *kbuf;
2936
2937 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2938 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2939 iov[0].iov_len = kbuf->len;
4d954c25 2940 return 0;
dddb3e26 2941 }
dd201662 2942 if (req->rw.len != 1)
4d954c25
JA
2943 return -EINVAL;
2944
2945#ifdef CONFIG_COMPAT
2946 if (req->ctx->compat)
2947 return io_compat_import(req, iov, needs_lock);
2948#endif
2949
2950 return __io_iov_buffer_select(req, iov, needs_lock);
2951}
2952
847595de
PB
2953static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2954 struct iov_iter *iter, bool needs_lock)
2b188cc1 2955{
9adbd45d
JA
2956 void __user *buf = u64_to_user_ptr(req->rw.addr);
2957 size_t sqe_len = req->rw.len;
847595de 2958 u8 opcode = req->opcode;
4d954c25 2959 ssize_t ret;
edafccee 2960
7d009165 2961 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2962 *iovec = NULL;
9adbd45d 2963 return io_import_fixed(req, rw, iter);
edafccee 2964 }
2b188cc1 2965
bcda7baa 2966 /* buffer index only valid with fixed read/write, or buffer select */
4f4eeba8 2967 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
2968 return -EINVAL;
2969
3a6820f2 2970 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 2971 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25 2972 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
867a23ea 2973 if (IS_ERR(buf))
4d954c25 2974 return PTR_ERR(buf);
3f9d6441 2975 req->rw.len = sqe_len;
bcda7baa
JA
2976 }
2977
3a6820f2
JA
2978 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2979 *iovec = NULL;
10fc72e4 2980 return ret;
3a6820f2
JA
2981 }
2982
4d954c25
JA
2983 if (req->flags & REQ_F_BUFFER_SELECT) {
2984 ret = io_iov_buffer_select(req, *iovec, needs_lock);
847595de
PB
2985 if (!ret)
2986 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
4d954c25
JA
2987 *iovec = NULL;
2988 return ret;
2989 }
2990
89cd35c5
CH
2991 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
2992 req->ctx->compat);
2b188cc1
JA
2993}
2994
0fef9483
JA
2995static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
2996{
5b09e37e 2997 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
2998}
2999
31b51510 3000/*
32960613
JA
3001 * For files that don't have ->read_iter() and ->write_iter(), handle them
3002 * by looping over ->read() or ->write() manually.
31b51510 3003 */
4017eb91 3004static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3005{
4017eb91
JA
3006 struct kiocb *kiocb = &req->rw.kiocb;
3007 struct file *file = req->file;
32960613
JA
3008 ssize_t ret = 0;
3009
3010 /*
3011 * Don't support polled IO through this interface, and we can't
3012 * support non-blocking either. For the latter, this just causes
3013 * the kiocb to be handled from an async context.
3014 */
3015 if (kiocb->ki_flags & IOCB_HIPRI)
3016 return -EOPNOTSUPP;
3017 if (kiocb->ki_flags & IOCB_NOWAIT)
3018 return -EAGAIN;
3019
3020 while (iov_iter_count(iter)) {
311ae9e1 3021 struct iovec iovec;
32960613
JA
3022 ssize_t nr;
3023
311ae9e1
PB
3024 if (!iov_iter_is_bvec(iter)) {
3025 iovec = iov_iter_iovec(iter);
3026 } else {
4017eb91
JA
3027 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3028 iovec.iov_len = req->rw.len;
311ae9e1
PB
3029 }
3030
32960613
JA
3031 if (rw == READ) {
3032 nr = file->f_op->read(file, iovec.iov_base,
0fef9483 3033 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3034 } else {
3035 nr = file->f_op->write(file, iovec.iov_base,
0fef9483 3036 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3037 }
3038
3039 if (nr < 0) {
3040 if (!ret)
3041 ret = nr;
3042 break;
3043 }
3044 ret += nr;
3045 if (nr != iovec.iov_len)
3046 break;
4017eb91
JA
3047 req->rw.len -= nr;
3048 req->rw.addr += nr;
32960613
JA
3049 iov_iter_advance(iter, nr);
3050 }
3051
3052 return ret;
3053}
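
loop_rw_iter() above emulates vectored I/O for files whose ->read()/->write() can't take an iterator. The same idea in userspace terms, emulating readv() by looping plain read() over each segment and stopping on a short read:

#include <unistd.h>
#include <sys/uio.h>

static ssize_t readv_by_loop(int fd, const struct iovec *iov, int iovcnt)
{
	ssize_t total = 0;
	int i;

	for (i = 0; i < iovcnt; i++) {
		ssize_t nr = read(fd, iov[i].iov_base, iov[i].iov_len);

		if (nr < 0)
			return total ? total : nr;
		total += nr;
		if ((size_t)nr != iov[i].iov_len)
			break;		/* short read: stop, like the kernel loop */
	}
	return total;
}
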
3054
ff6165b2
JA
3055static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3056 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3057{
e8c2bc1f 3058 struct io_async_rw *rw = req->async_data;
b64e3444 3059
ff6165b2 3060 memcpy(&rw->iter, iter, sizeof(*iter));
afb87658 3061 rw->free_iovec = iovec;
227c0c96 3062 rw->bytes_done = 0;
ff6165b2 3063 /* can only be fixed buffers, no need to do anything */
9c3a205c 3064 if (iov_iter_is_bvec(iter))
ff6165b2 3065 return;
b64e3444 3066 if (!iovec) {
ff6165b2
JA
3067 unsigned iov_off = 0;
3068
3069 rw->iter.iov = rw->fast_iov;
3070 if (iter->iov != fast_iov) {
3071 iov_off = iter->iov - fast_iov;
3072 rw->iter.iov += iov_off;
3073 }
3074 if (rw->fast_iov != fast_iov)
3075 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
45097dae 3076 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3077 } else {
3078 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3079 }
3080}
3081
e8c2bc1f 3082static inline int __io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3083{
e8c2bc1f
JA
3084 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3085 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3086 return req->async_data == NULL;
3d9932a8
XW
3087}
3088
e8c2bc1f 3089static int io_alloc_async_data(struct io_kiocb *req)
f67676d1 3090{
e8c2bc1f 3091 if (!io_op_defs[req->opcode].needs_async_data)
d3656344 3092 return 0;
3d9932a8 3093
e8c2bc1f 3094 return __io_alloc_async_data(req);
b7bb4f7d
JA
3095}
3096
ff6165b2
JA
3097static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3098 const struct iovec *fast_iov,
227c0c96 3099 struct iov_iter *iter, bool force)
b7bb4f7d 3100{
e8c2bc1f 3101 if (!force && !io_op_defs[req->opcode].needs_async_data)
74566df3 3102 return 0;
e8c2bc1f 3103 if (!req->async_data) {
6bf985dc
PB
3104 if (__io_alloc_async_data(req)) {
3105 kfree(iovec);
5d204bcf 3106 return -ENOMEM;
6bf985dc 3107 }
b7bb4f7d 3108
ff6165b2 3109 io_req_map_rw(req, iovec, fast_iov, iter);
5d204bcf 3110 }
b7bb4f7d 3111 return 0;
f67676d1
JA
3112}
3113
73debe68 3114static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3115{
e8c2bc1f 3116 struct io_async_rw *iorw = req->async_data;
f4bff104 3117 struct iovec *iov = iorw->fast_iov;
847595de 3118 int ret;
c3e330a4 3119
2846c481 3120 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
c3e330a4
PB
3121 if (unlikely(ret < 0))
3122 return ret;
3123
ab0b196c
PB
3124 iorw->bytes_done = 0;
3125 iorw->free_iovec = iov;
3126 if (iov)
3127 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
3128 return 0;
3129}
3130
73debe68 3131static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3132{
3529d8c2
JA
3133 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3134 return -EBADF;
93642ef8 3135 return io_prep_rw(req, sqe);
f67676d1
JA
3136}
3137
c1dd91d1
JA
3138/*
3139 * This is our waitqueue callback handler, registered through lock_page_async()
3140 * when we initially tried to do the IO and armed our waitqueue in the iocb.
3141 * This gets called when the page is unlocked, and we generally expect that to
3142 * happen when the page IO is completed and the page is now uptodate. This will
3143 * queue a task_work based retry of the operation, attempting to copy the data
3144 * again. If the latter fails because the page was NOT uptodate, then we will
3145 * do a thread based blocking retry of the operation. That's the unexpected
3146 * slow path.
3147 */
bcf5a063
JA
3148static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3149 int sync, void *arg)
3150{
3151 struct wait_page_queue *wpq;
3152 struct io_kiocb *req = wait->private;
bcf5a063 3153 struct wait_page_key *key = arg;
bcf5a063
JA
3154
3155 wpq = container_of(wait, struct wait_page_queue, wait);
3156
cdc8fcb4
LT
3157 if (!wake_page_match(wpq, key))
3158 return 0;
3159
c8d317aa 3160 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063
JA
3161 list_del_init(&wait->entry);
3162
bcf5a063
JA
3163 /* submit ref gets dropped, acquire a new one */
3164 refcount_inc(&req->refs);
921b9054 3165 io_req_task_queue(req);
bcf5a063
JA
3166 return 1;
3167}
3168
c1dd91d1
JA
3169/*
3170 * This controls whether a given IO request should be armed for async page
3171 * based retry. If we return false here, the request is handed to the async
3172 * worker threads for retry. If we're doing buffered reads on a regular file,
3173 * we prepare a private wait_page_queue entry and retry the operation. This
3174 * will either succeed because the page is now uptodate and unlocked, or it
3175 * will register a callback when the page is unlocked at IO completion. Through
3176 * that callback, io_uring uses task_work to setup a retry of the operation.
3177 * That retry will attempt the buffered read again. The retry will generally
3178 * succeed, or in rare cases where it fails, we then fall back to using the
3179 * async worker threads for a blocking retry.
3180 */
227c0c96 3181static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3182{
e8c2bc1f
JA
3183 struct io_async_rw *rw = req->async_data;
3184 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3185 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3186
bcf5a063
JA
3187 /* never retry for NOWAIT, we just complete with -EAGAIN */
3188 if (req->flags & REQ_F_NOWAIT)
3189 return false;
f67676d1 3190
227c0c96 3191 /* Only for buffered IO */
3b2a4439 3192 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3193 return false;
3b2a4439 3194
bcf5a063
JA
3195 /*
3196 * Just use poll if we can, and don't attempt if the fs doesn't
3197 * support callback-based unlocks
3198 */
3199 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3200 return false;
f67676d1 3201
3b2a4439
JA
3202 wait->wait.func = io_async_buf_func;
3203 wait->wait.private = req;
3204 wait->wait.flags = 0;
3205 INIT_LIST_HEAD(&wait->wait.entry);
3206 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3207 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3208 kiocb->ki_waitq = wait;
3b2a4439 3209 return true;
bcf5a063
JA
3210}
3211
3212static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3213{
3214 if (req->file->f_op->read_iter)
3215 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3216 else if (req->file->f_op->read)
4017eb91 3217 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3218 else
3219 return -EINVAL;
f67676d1
JA
3220}
3221
889fca73 3222static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3223{
3224 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3225 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3226 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3227 struct io_async_rw *rw = req->async_data;
227c0c96 3228 ssize_t io_size, ret, ret2;
45d189c6 3229 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ff6165b2 3230
2846c481 3231 if (rw) {
e8c2bc1f 3232 iter = &rw->iter;
2846c481
PB
3233 iovec = NULL;
3234 } else {
3235 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3236 if (ret < 0)
3237 return ret;
3238 }
632546c4 3239 io_size = iov_iter_count(iter);
fa15bafb 3240 req->result = io_size;
2b188cc1 3241
fd6c2e4c
JA
3242 /* Ensure we clear previously set non-block flag */
3243 if (!force_nonblock)
29de5f6a 3244 kiocb->ki_flags &= ~IOCB_NOWAIT;
a88fc400
PB
3245 else
3246 kiocb->ki_flags |= IOCB_NOWAIT;
3247
24c74678 3248 /* If the file doesn't support async, just async punt */
6713e7a6
PB
3249 if (force_nonblock && !io_file_supports_async(req->file, READ)) {
3250 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc 3251 return ret ?: -EAGAIN;
6713e7a6 3252 }
9e645e11 3253
632546c4 3254 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
5ea5dd45
PB
3255 if (unlikely(ret)) {
3256 kfree(iovec);
3257 return ret;
3258 }
2b188cc1 3259
227c0c96 3260 ret = io_iter_do_read(req, iter);
32960613 3261
57cd657b 3262 if (ret == -EIOCBQUEUED) {
3e6a0d3c
JA
3263 if (req->async_data)
3264 iov_iter_revert(iter, io_size - iov_iter_count(iter));
fe1cdd55 3265 goto out_free;
227c0c96 3266 } else if (ret == -EAGAIN) {
eefdf30f
JA
3267 /* IOPOLL retry should happen for io-wq threads */
3268 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3269 goto done;
75c668cd
PB
3270 /* no retry on NONBLOCK nor RWF_NOWAIT */
3271 if (req->flags & REQ_F_NOWAIT)
355afaeb 3272 goto done;
84216315 3273 /* some cases will consume bytes even on error returns */
632546c4 3274 iov_iter_revert(iter, io_size - iov_iter_count(iter));
f38c7e3a 3275 ret = 0;
7335e3bf 3276 } else if (ret <= 0 || ret == io_size || !force_nonblock ||
75c668cd 3277 (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
7335e3bf 3278 /* read all, failed, already did sync or don't want to retry */
00d23d51 3279 goto done;
227c0c96
JA
3280 }
3281
227c0c96 3282 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc
PB
3283 if (ret2)
3284 return ret2;
3285
fe1cdd55 3286 iovec = NULL;
e8c2bc1f 3287 rw = req->async_data;
227c0c96 3288 /* now use our persistent iterator, if we aren't already */
e8c2bc1f 3289 iter = &rw->iter;
227c0c96 3290
b23df91b
PB
3291 do {
3292 io_size -= ret;
3293 rw->bytes_done += ret;
3294 /* if we can retry, do so with the callbacks armed */
3295 if (!io_rw_should_retry(req)) {
3296 kiocb->ki_flags &= ~IOCB_WAITQ;
3297 return -EAGAIN;
3298 }
3299
3300 /*
3301 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3302 * we get -EIOCBQUEUED, then we'll get a notification when the
3303 * desired page gets unlocked. We can also get a partial read
3304 * here, and if we do, then just retry at the new offset.
3305 */
3306 ret = io_iter_do_read(req, iter);
3307 if (ret == -EIOCBQUEUED)
3308 return 0;
227c0c96 3309 /* we got some bytes, but not all. retry. */
b5b0ecb7 3310 kiocb->ki_flags &= ~IOCB_WAITQ;
b23df91b 3311 } while (ret > 0 && ret < io_size);
227c0c96 3312done:
889fca73 3313 kiocb_done(kiocb, ret, issue_flags);
fe1cdd55
PB
3314out_free:
3315 /* it's faster to check here than delegate to kfree */
3316 if (iovec)
3317 kfree(iovec);
5ea5dd45 3318 return 0;
2b188cc1
JA
3319}
3320
73debe68 3321static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3322{
3529d8c2
JA
3323 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3324 return -EBADF;
93642ef8 3325 return io_prep_rw(req, sqe);
f67676d1
JA
3326}
3327
889fca73 3328static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3329{
3330 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3331 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3332 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3333 struct io_async_rw *rw = req->async_data;
fa15bafb 3334 ssize_t ret, ret2, io_size;
45d189c6 3335 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
2b188cc1 3336
2846c481 3337 if (rw) {
e8c2bc1f 3338 iter = &rw->iter;
2846c481
PB
3339 iovec = NULL;
3340 } else {
3341 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3342 if (ret < 0)
3343 return ret;
3344 }
632546c4 3345 io_size = iov_iter_count(iter);
fa15bafb 3346 req->result = io_size;
2b188cc1 3347
fd6c2e4c
JA
3348 /* Ensure we clear previously set non-block flag */
3349 if (!force_nonblock)
a88fc400
PB
3350 kiocb->ki_flags &= ~IOCB_NOWAIT;
3351 else
3352 kiocb->ki_flags |= IOCB_NOWAIT;
fd6c2e4c 3353
24c74678 3354 /* If the file doesn't support async, just async punt */
af197f50 3355 if (force_nonblock && !io_file_supports_async(req->file, WRITE))
f67676d1 3356 goto copy_iov;
31b51510 3357
10d59345
JA
3358 /* file path doesn't support NOWAIT for non-direct_IO */
3359 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3360 (req->flags & REQ_F_ISREG))
f67676d1 3361 goto copy_iov;
31b51510 3362
632546c4 3363 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
fa15bafb
PB
3364 if (unlikely(ret))
3365 goto out_free;
4ed734b0 3366
fa15bafb
PB
3367 /*
3368 * Open-code file_start_write here to grab freeze protection,
3369 * which will be released by another thread in
3370 * io_complete_rw(). Fool lockdep by telling it the lock got
3371 * released so that it doesn't complain about the held lock when
3372 * we return to userspace.
3373 */
3374 if (req->flags & REQ_F_ISREG) {
8a3c84b6 3375 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
3376 __sb_writers_release(file_inode(req->file)->i_sb,
3377 SB_FREEZE_WRITE);
3378 }
3379 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 3380
fa15bafb 3381 if (req->file->f_op->write_iter)
ff6165b2 3382 ret2 = call_write_iter(req->file, kiocb, iter);
2dd2111d 3383 else if (req->file->f_op->write)
4017eb91 3384 ret2 = loop_rw_iter(WRITE, req, iter);
2dd2111d
GH
3385 else
3386 ret2 = -EINVAL;
4ed734b0 3387
fa15bafb
PB
3388 /*
3389 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3390 * retry them without IOCB_NOWAIT.
3391 */
3392 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3393 ret2 = -EAGAIN;
75c668cd
PB
3394 /* no retry on NONBLOCK nor RWF_NOWAIT */
3395 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 3396 goto done;
3e6a0d3c
JA
3397 if (ret2 == -EIOCBQUEUED && req->async_data)
3398 iov_iter_revert(iter, io_size - iov_iter_count(iter));
fa15bafb 3399 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f
JA
3400 /* IOPOLL retry should happen for io-wq threads */
3401 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3402 goto copy_iov;
355afaeb 3403done:
889fca73 3404 kiocb_done(kiocb, ret2, issue_flags);
fa15bafb 3405 } else {
f67676d1 3406copy_iov:
84216315 3407 /* some cases will consume bytes even on error returns */
632546c4 3408 iov_iter_revert(iter, io_size - iov_iter_count(iter));
227c0c96 3409 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
6bf985dc 3410 return ret ?: -EAGAIN;
2b188cc1 3411 }
31b51510 3412out_free:
f261c168 3413 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 3414 if (iovec)
6f2cc166 3415 kfree(iovec);
2b188cc1
JA
3416 return ret;
3417}
3418
80a261fd
JA
3419static int io_renameat_prep(struct io_kiocb *req,
3420 const struct io_uring_sqe *sqe)
3421{
3422 struct io_rename *ren = &req->rename;
3423 const char __user *oldf, *newf;
3424
3425 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3426 return -EBADF;
3427
3428 ren->old_dfd = READ_ONCE(sqe->fd);
3429 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3430 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3431 ren->new_dfd = READ_ONCE(sqe->len);
3432 ren->flags = READ_ONCE(sqe->rename_flags);
3433
3434 ren->oldpath = getname(oldf);
3435 if (IS_ERR(ren->oldpath))
3436 return PTR_ERR(ren->oldpath);
3437
3438 ren->newpath = getname(newf);
3439 if (IS_ERR(ren->newpath)) {
3440 putname(ren->oldpath);
3441 return PTR_ERR(ren->newpath);
3442 }
3443
3444 req->flags |= REQ_F_NEED_CLEANUP;
3445 return 0;
3446}
3447
45d189c6 3448static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
3449{
3450 struct io_rename *ren = &req->rename;
3451 int ret;
3452
45d189c6 3453 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
3454 return -EAGAIN;
3455
3456 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3457 ren->newpath, ren->flags);
3458
3459 req->flags &= ~REQ_F_NEED_CLEANUP;
3460 if (ret < 0)
3461 req_set_fail_links(req);
3462 io_req_complete(req, ret);
3463 return 0;
3464}
3465
14a1143b
JA
3466static int io_unlinkat_prep(struct io_kiocb *req,
3467 const struct io_uring_sqe *sqe)
3468{
3469 struct io_unlink *un = &req->unlink;
3470 const char __user *fname;
3471
3472 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3473 return -EBADF;
3474
3475 un->dfd = READ_ONCE(sqe->fd);
3476
3477 un->flags = READ_ONCE(sqe->unlink_flags);
3478 if (un->flags & ~AT_REMOVEDIR)
3479 return -EINVAL;
3480
3481 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3482 un->filename = getname(fname);
3483 if (IS_ERR(un->filename))
3484 return PTR_ERR(un->filename);
3485
3486 req->flags |= REQ_F_NEED_CLEANUP;
3487 return 0;
3488}
3489
45d189c6 3490static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
3491{
3492 struct io_unlink *un = &req->unlink;
3493 int ret;
3494
45d189c6 3495 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
3496 return -EAGAIN;
3497
3498 if (un->flags & AT_REMOVEDIR)
3499 ret = do_rmdir(un->dfd, un->filename);
3500 else
3501 ret = do_unlinkat(un->dfd, un->filename);
3502
3503 req->flags &= ~REQ_F_NEED_CLEANUP;
3504 if (ret < 0)
3505 req_set_fail_links(req);
3506 io_req_complete(req, ret);
3507 return 0;
3508}
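
Both handlers above bail out with -EAGAIN on a non-blocking issue, so they always run from a blocking context; userspace can still batch them in one submission. A rough sketch, assuming a liburing recent enough to ship io_uring_prep_renameat() and io_uring_prep_unlinkat(); the wrapper name and the ordering note are illustrative.

#include <liburing.h>
#include <errno.h>
#include <fcntl.h>	/* AT_FDCWD */

/* Queue a rename and an unlink together, then reap both completions. */
static int rename_then_unlink(struct io_uring *ring,
			      const char *oldp, const char *newp,
			      const char *victim)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret, err = 0;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_renameat(sqe, AT_FDCWD, oldp, AT_FDCWD, newp, 0);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* note: without IOSQE_IO_LINK these two may execute in any order */
	io_uring_prep_unlinkat(sqe, AT_FDCWD, victim, 0);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		if (cqe->res < 0 && !err)
			err = cqe->res;	/* -errno from do_renameat2()/do_unlinkat() */
		io_uring_cqe_seen(ring, cqe);
	}
	return err;
}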
3509
36f4fa68
JA
3510static int io_shutdown_prep(struct io_kiocb *req,
3511 const struct io_uring_sqe *sqe)
3512{
3513#if defined(CONFIG_NET)
3514 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3515 return -EINVAL;
3516 if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3517 sqe->buf_index)
3518 return -EINVAL;
3519
3520 req->shutdown.how = READ_ONCE(sqe->len);
3521 return 0;
3522#else
3523 return -EOPNOTSUPP;
3524#endif
3525}
3526
45d189c6 3527static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
3528{
3529#if defined(CONFIG_NET)
3530 struct socket *sock;
3531 int ret;
3532
45d189c6 3533 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
3534 return -EAGAIN;
3535
48aba79b 3536 sock = sock_from_file(req->file);
36f4fa68 3537 if (unlikely(!sock))
48aba79b 3538 return -ENOTSOCK;
36f4fa68
JA
3539
3540 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d
JA
3541 if (ret < 0)
3542 req_set_fail_links(req);
36f4fa68
JA
3543 io_req_complete(req, ret);
3544 return 0;
3545#else
3546 return -EOPNOTSUPP;
3547#endif
3548}
3549
f2a8d5c7
PB
3550static int __io_splice_prep(struct io_kiocb *req,
3551 const struct io_uring_sqe *sqe)
7d67af2c
PB
3552{
3553 struct io_splice* sp = &req->splice;
3554 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 3555
3232dd02
PB
3556 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3557 return -EINVAL;
7d67af2c
PB
3558
3559 sp->file_in = NULL;
7d67af2c
PB
3560 sp->len = READ_ONCE(sqe->len);
3561 sp->flags = READ_ONCE(sqe->splice_flags);
3562
3563 if (unlikely(sp->flags & ~valid_flags))
3564 return -EINVAL;
3565
8371adf5
PB
3566 sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3567 (sp->flags & SPLICE_F_FD_IN_FIXED));
3568 if (!sp->file_in)
3569 return -EBADF;
7d67af2c
PB
3570 req->flags |= REQ_F_NEED_CLEANUP;
3571
7cdaf587
XW
3572 if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3573 /*
 3574		 * Splice operations will be punted async, and we need to
 3575		 * modify io_wq_work.flags here, so initialize io_wq_work first.
3576 */
7d67af2c 3577 req->work.flags |= IO_WQ_WORK_UNBOUND;
7cdaf587 3578 }
7d67af2c
PB
3579
3580 return 0;
3581}
3582
f2a8d5c7
PB
3583static int io_tee_prep(struct io_kiocb *req,
3584 const struct io_uring_sqe *sqe)
3585{
3586 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3587 return -EINVAL;
3588 return __io_splice_prep(req, sqe);
3589}
3590
45d189c6 3591static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
3592{
3593 struct io_splice *sp = &req->splice;
3594 struct file *in = sp->file_in;
3595 struct file *out = sp->file_out;
3596 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3597 long ret = 0;
3598
45d189c6 3599 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7
PB
3600 return -EAGAIN;
3601 if (sp->len)
3602 ret = do_tee(in, out, sp->len, flags);
3603
3604 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3605 req->flags &= ~REQ_F_NEED_CLEANUP;
3606
f2a8d5c7
PB
3607 if (ret != sp->len)
3608 req_set_fail_links(req);
e1e16097 3609 io_req_complete(req, ret);
f2a8d5c7
PB
3610 return 0;
3611}
3612
3613static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3614{
3615 struct io_splice* sp = &req->splice;
3616
3617 sp->off_in = READ_ONCE(sqe->splice_off_in);
3618 sp->off_out = READ_ONCE(sqe->off);
3619 return __io_splice_prep(req, sqe);
3620}
3621
45d189c6 3622static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
3623{
3624 struct io_splice *sp = &req->splice;
3625 struct file *in = sp->file_in;
3626 struct file *out = sp->file_out;
3627 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3628 loff_t *poff_in, *poff_out;
c9687426 3629 long ret = 0;
7d67af2c 3630
45d189c6 3631 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 3632 return -EAGAIN;
7d67af2c
PB
3633
3634 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3635 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 3636
948a7749 3637 if (sp->len)
c9687426 3638 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c
PB
3639
3640 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3641 req->flags &= ~REQ_F_NEED_CLEANUP;
3642
7d67af2c
PB
3643 if (ret != sp->len)
3644 req_set_fail_links(req);
e1e16097 3645 io_req_complete(req, ret);
7d67af2c
PB
3646 return 0;
3647}
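
io_splice() above never runs a non-blocking attempt, so every splice SQE executes from a blocking context; the submission side looks roughly like the sketch below, assuming liburing's io_uring_prep_splice(). An offset of -1 means "no offset, use the file position", matching the poff_in/poff_out handling above; ring_splice() itself is just an illustrative wrapper.

#include <liburing.h>
#include <errno.h>

/* Move up to 'len' bytes from a pipe into 'out_fd' at offset 'off'. */
static int ring_splice(struct io_uring *ring, int pipe_rd, int out_fd,
		       unsigned len, long long off)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	/* -1 on the pipe side: pipes have no file offset */
	io_uring_prep_splice(sqe, pipe_rd, -1, out_fd, off, len, 0);
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* bytes spliced, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}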
3648
2b188cc1
JA
3649/*
3650 * IORING_OP_NOP just posts a completion event, nothing else.
3651 */
889fca73 3652static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3653{
3654 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 3655
def596e9
JA
3656 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3657 return -EINVAL;
3658
889fca73 3659 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
3660 return 0;
3661}
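
Since IORING_OP_NOP does nothing except post a completion, it makes a convenient smoke test for the whole submit/complete cycle. A self-contained sketch using standard liburing entry points; the 8-entry ring size is arbitrary.

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/* get_sqe cannot fail on a freshly initialised 8-entry ring */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		/* io_nop() always completes with res == 0 */
		printf("nop done, res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret ? 1 : 0;
}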
3662
1155c76a 3663static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 3664{
6b06314c 3665 struct io_ring_ctx *ctx = req->ctx;
c992fe29 3666
09bb8394
JA
3667 if (!req->file)
3668 return -EBADF;
c992fe29 3669
6b06314c 3670 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 3671 return -EINVAL;
edafccee 3672 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
3673 return -EINVAL;
3674
8ed8d3c3
JA
3675 req->sync.flags = READ_ONCE(sqe->fsync_flags);
3676 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3677 return -EINVAL;
3678
3679 req->sync.off = READ_ONCE(sqe->off);
3680 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
3681 return 0;
3682}
3683
45d189c6 3684static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 3685{
8ed8d3c3 3686 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
3687 int ret;
3688
ac45abc0 3689 /* fsync always requires a blocking context */
45d189c6 3690 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
3691 return -EAGAIN;
3692
9adbd45d 3693 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
3694 end > 0 ? end : LLONG_MAX,
3695 req->sync.flags & IORING_FSYNC_DATASYNC);
3696 if (ret < 0)
3697 req_set_fail_links(req);
e1e16097 3698 io_req_complete(req, ret);
c992fe29
CH
3699 return 0;
3700}
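
For reference, the matching userspace submission, as a rough sketch assuming liburing's io_uring_prep_fsync(); IORING_FSYNC_DATASYNC ends up as the datasync argument to vfs_fsync_range() above. The ring_fdatasync() wrapper name is made up for the example.

#include <liburing.h>
#include <errno.h>

/* Roughly the fdatasync(2) equivalent, issued through the ring. */
static int ring_fdatasync(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}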
3701
d63d1b5e
JA
3702static int io_fallocate_prep(struct io_kiocb *req,
3703 const struct io_uring_sqe *sqe)
3704{
3705 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3706 return -EINVAL;
3232dd02
PB
3707 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3708 return -EINVAL;
d63d1b5e
JA
3709
3710 req->sync.off = READ_ONCE(sqe->off);
3711 req->sync.len = READ_ONCE(sqe->addr);
3712 req->sync.mode = READ_ONCE(sqe->len);
3713 return 0;
3714}
3715
45d189c6 3716static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 3717{
ac45abc0
PB
3718 int ret;
3719
d63d1b5e 3720	/* fallocate always requires a blocking context */
45d189c6 3721 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 3722 return -EAGAIN;
ac45abc0
PB
3723 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3724 req->sync.len);
ac45abc0
PB
3725 if (ret < 0)
3726 req_set_fail_links(req);
e1e16097 3727 io_req_complete(req, ret);
5d17b4a4
JA
3728 return 0;
3729}
3730
ec65fea5 3731static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 3732{
f8748881 3733 const char __user *fname;
15b71abe 3734 int ret;
b7bb4f7d 3735
ec65fea5 3736 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 3737 return -EINVAL;
ec65fea5 3738 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 3739 return -EBADF;
03b1230c 3740
ec65fea5
PB
 3741	/* open.how should already be initialised */
3742 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 3743 req->open.how.flags |= O_LARGEFILE;
3529d8c2 3744
25e72d10
PB
3745 req->open.dfd = READ_ONCE(sqe->fd);
3746 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 3747 req->open.filename = getname(fname);
15b71abe
JA
3748 if (IS_ERR(req->open.filename)) {
3749 ret = PTR_ERR(req->open.filename);
3750 req->open.filename = NULL;
3751 return ret;
3752 }
4022e7af 3753 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 3754 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 3755 return 0;
03b1230c
JA
3756}
3757
ec65fea5
PB
3758static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3759{
3760 u64 flags, mode;
3761
14587a46 3762 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 3763 return -EINVAL;
ec65fea5
PB
3764 mode = READ_ONCE(sqe->len);
3765 flags = READ_ONCE(sqe->open_flags);
3766 req->open.how = build_open_how(flags, mode);
3767 return __io_openat_prep(req, sqe);
3768}
3769
cebdb986 3770static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 3771{
cebdb986 3772 struct open_how __user *how;
cebdb986 3773 size_t len;
0fa03c62
JA
3774 int ret;
3775
14587a46 3776 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 3777 return -EINVAL;
cebdb986
JA
3778 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3779 len = READ_ONCE(sqe->len);
cebdb986
JA
3780 if (len < OPEN_HOW_SIZE_VER0)
3781 return -EINVAL;
3529d8c2 3782
cebdb986
JA
3783 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3784 len);
3785 if (ret)
3786 return ret;
3529d8c2 3787
ec65fea5 3788 return __io_openat_prep(req, sqe);
cebdb986
JA
3789}
3790
45d189c6 3791static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
3792{
3793 struct open_flags op;
15b71abe 3794 struct file *file;
3a81fd02
JA
3795 bool nonblock_set;
3796 bool resolve_nonblock;
15b71abe
JA
3797 int ret;
3798
cebdb986 3799 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
3800 if (ret)
3801 goto err;
3a81fd02
JA
3802 nonblock_set = op.open_flag & O_NONBLOCK;
3803 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 3804 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
3805 /*
3806 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
 3807		 * it'll always return -EAGAIN
3808 */
3809 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3810 return -EAGAIN;
3811 op.lookup_flags |= LOOKUP_CACHED;
3812 op.open_flag |= O_NONBLOCK;
3813 }
15b71abe 3814
4022e7af 3815 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
15b71abe
JA
3816 if (ret < 0)
3817 goto err;
3818
3819 file = do_filp_open(req->open.dfd, req->open.filename, &op);
3a81fd02 3820 /* only retry if RESOLVE_CACHED wasn't already set by application */
45d189c6
PB
3821 if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
3822 file == ERR_PTR(-EAGAIN)) {
944d1444 3823 /*
3a81fd02
JA
 3824		 * We could hang on to this 'fd', but it seems like marginal
3825 * gain for something that is now known to be a slower path.
3826 * So just put it, and we'll get a new one when we retry.
944d1444 3827 */
3a81fd02
JA
3828 put_unused_fd(ret);
3829 return -EAGAIN;
3830 }
3831
15b71abe
JA
3832 if (IS_ERR(file)) {
3833 put_unused_fd(ret);
3834 ret = PTR_ERR(file);
3835 } else {
45d189c6 3836 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3a81fd02 3837 file->f_flags &= ~O_NONBLOCK;
15b71abe
JA
3838 fsnotify_open(file);
3839 fd_install(ret, file);
3840 }
3841err:
3842 putname(req->open.filename);
8fef80bf 3843 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe
JA
3844 if (ret < 0)
3845 req_set_fail_links(req);
e1e16097 3846 io_req_complete(req, ret);
15b71abe
JA
3847 return 0;
3848}
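
The RESOLVE_CACHED/LOOKUP_CACHED handling above is visible to applications: if userspace itself sets RESOLVE_CACHED, io_openat2() does not fall back to a blocking lookup, and the completion may legitimately carry -EAGAIN on a dcache miss. A hedged sketch, assuming liburing's io_uring_prep_openat2() and headers that define RESOLVE_CACHED; the retry policy shown is just one option.

#include <liburing.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <linux/openat2.h>

/* Try a cached-only open first; retry with a normal lookup on -EAGAIN. */
static int ring_open_cached(struct io_uring *ring, const char *path)
{
	struct open_how how;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	memset(&how, 0, sizeof(how));
	how.flags = O_RDONLY;
	how.resolve = RESOLVE_CACHED;

retry:
	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_openat2(sqe, AT_FDCWD, path, &how);
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;			/* new fd, or -errno */
	io_uring_cqe_seen(ring, cqe);
	if (ret == -EAGAIN && (how.resolve & RESOLVE_CACHED)) {
		how.resolve &= ~RESOLVE_CACHED;	/* dcache miss: do the slow lookup */
		goto retry;
	}
	return ret;
}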
3849
45d189c6 3850static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 3851{
e45cff58 3852 return io_openat2(req, issue_flags);
cebdb986
JA
3853}
3854
067524e9
JA
3855static int io_remove_buffers_prep(struct io_kiocb *req,
3856 const struct io_uring_sqe *sqe)
3857{
3858 struct io_provide_buf *p = &req->pbuf;
3859 u64 tmp;
3860
3861 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3862 return -EINVAL;
3863
3864 tmp = READ_ONCE(sqe->fd);
3865 if (!tmp || tmp > USHRT_MAX)
3866 return -EINVAL;
3867
3868 memset(p, 0, sizeof(*p));
3869 p->nbufs = tmp;
3870 p->bgid = READ_ONCE(sqe->buf_group);
3871 return 0;
3872}
3873
3874static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3875 int bgid, unsigned nbufs)
3876{
3877 unsigned i = 0;
3878
3879 /* shouldn't happen */
3880 if (!nbufs)
3881 return 0;
3882
3883 /* the head kbuf is the list itself */
3884 while (!list_empty(&buf->list)) {
3885 struct io_buffer *nxt;
3886
3887 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3888 list_del(&nxt->list);
3889 kfree(nxt);
3890 if (++i == nbufs)
3891 return i;
3892 }
3893 i++;
3894 kfree(buf);
3895 idr_remove(&ctx->io_buffer_idr, bgid);
3896
3897 return i;
3898}
3899
889fca73 3900static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
3901{
3902 struct io_provide_buf *p = &req->pbuf;
3903 struct io_ring_ctx *ctx = req->ctx;
3904 struct io_buffer *head;
3905 int ret = 0;
45d189c6 3906 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
067524e9
JA
3907
3908 io_ring_submit_lock(ctx, !force_nonblock);
3909
3910 lockdep_assert_held(&ctx->uring_lock);
3911
3912 ret = -ENOENT;
3913 head = idr_find(&ctx->io_buffer_idr, p->bgid);
3914 if (head)
3915 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
067524e9
JA
3916 if (ret < 0)
3917 req_set_fail_links(req);
067524e9 3918
31bff9a5
PB
3919 /* need to hold the lock to complete IOPOLL requests */
3920 if (ctx->flags & IORING_SETUP_IOPOLL) {
889fca73 3921 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5
PB
3922 io_ring_submit_unlock(ctx, !force_nonblock);
3923 } else {
3924 io_ring_submit_unlock(ctx, !force_nonblock);
889fca73 3925 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5 3926 }
067524e9
JA
3927 return 0;
3928}
3929
ddf0322d
JA
3930static int io_provide_buffers_prep(struct io_kiocb *req,
3931 const struct io_uring_sqe *sqe)
3932{
3933 struct io_provide_buf *p = &req->pbuf;
3934 u64 tmp;
3935
3936 if (sqe->ioprio || sqe->rw_flags)
3937 return -EINVAL;
3938
3939 tmp = READ_ONCE(sqe->fd);
3940 if (!tmp || tmp > USHRT_MAX)
3941 return -E2BIG;
3942 p->nbufs = tmp;
3943 p->addr = READ_ONCE(sqe->addr);
3944 p->len = READ_ONCE(sqe->len);
3945
efe68c1c 3946 if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
ddf0322d
JA
3947 return -EFAULT;
3948
3949 p->bgid = READ_ONCE(sqe->buf_group);
3950 tmp = READ_ONCE(sqe->off);
3951 if (tmp > USHRT_MAX)
3952 return -E2BIG;
3953 p->bid = tmp;
3954 return 0;
3955}
3956
3957static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3958{
3959 struct io_buffer *buf;
3960 u64 addr = pbuf->addr;
3961 int i, bid = pbuf->bid;
3962
3963 for (i = 0; i < pbuf->nbufs; i++) {
3964 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3965 if (!buf)
3966 break;
3967
3968 buf->addr = addr;
3969 buf->len = pbuf->len;
3970 buf->bid = bid;
3971 addr += pbuf->len;
3972 bid++;
3973 if (!*head) {
3974 INIT_LIST_HEAD(&buf->list);
3975 *head = buf;
3976 } else {
3977 list_add_tail(&buf->list, &(*head)->list);
3978 }
3979 }
3980
3981 return i ? i : -ENOMEM;
3982}
3983
889fca73 3984static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
3985{
3986 struct io_provide_buf *p = &req->pbuf;
3987 struct io_ring_ctx *ctx = req->ctx;
3988 struct io_buffer *head, *list;
3989 int ret = 0;
45d189c6 3990 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ddf0322d
JA
3991
3992 io_ring_submit_lock(ctx, !force_nonblock);
3993
3994 lockdep_assert_held(&ctx->uring_lock);
3995
3996 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3997
3998 ret = io_add_buffers(p, &head);
3999 if (ret < 0)
4000 goto out;
4001
4002 if (!list) {
4003 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
4004 GFP_KERNEL);
4005 if (ret < 0) {
067524e9 4006 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d
JA
4007 goto out;
4008 }
4009 }
4010out:
ddf0322d
JA
4011 if (ret < 0)
4012 req_set_fail_links(req);
31bff9a5
PB
4013
4014 /* need to hold the lock to complete IOPOLL requests */
4015 if (ctx->flags & IORING_SETUP_IOPOLL) {
889fca73 4016 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5
PB
4017 io_ring_submit_unlock(ctx, !force_nonblock);
4018 } else {
4019 io_ring_submit_unlock(ctx, !force_nonblock);
889fca73 4020 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5 4021 }
ddf0322d 4022 return 0;
cebdb986
JA
4023}
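
To make the buffer-group flow above concrete: one SQE publishes a group of buffers (handled by io_add_buffers()), and a later recv selects from that group via IOSQE_BUFFER_SELECT, with the chosen buffer id coming back in the CQE flags. A sketch assuming liburing plus the uapi sqe->buf_group field, IORING_CQE_F_BUFFER and IORING_CQE_BUFFER_SHIFT; group id 7 and the wrapper name are arbitrary.

#include <liburing.h>
#include <errno.h>

#define GROUP_ID	7
#define NR_BUFS		16
#define BUF_LEN		4096

static char bufs[NR_BUFS][BUF_LEN];

/* Hand NR_BUFS buffers to the kernel, then arm one buffer-selecting recv. */
static int recv_with_buffers(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret, bid = -1;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* bids 0..NR_BUFS-1 in group GROUP_ID */
	io_uring_prep_provide_buffers(sqe, bufs, BUF_LEN, NR_BUFS, GROUP_ID, 0);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_recv(sqe, sockfd, NULL, BUF_LEN, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
	sqe->buf_group = GROUP_ID;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		if (cqe->flags & IORING_CQE_F_BUFFER)
			/* cqe->res bytes landed in bufs[bid], cf. io_put_recv_kbuf() */
			bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
		io_uring_cqe_seen(ring, cqe);
	}
	return bid;
}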
4024
3e4827b0
JA
4025static int io_epoll_ctl_prep(struct io_kiocb *req,
4026 const struct io_uring_sqe *sqe)
4027{
4028#if defined(CONFIG_EPOLL)
4029 if (sqe->ioprio || sqe->buf_index)
4030 return -EINVAL;
6ca56f84 4031 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3232dd02 4032 return -EINVAL;
3e4827b0
JA
4033
4034 req->epoll.epfd = READ_ONCE(sqe->fd);
4035 req->epoll.op = READ_ONCE(sqe->len);
4036 req->epoll.fd = READ_ONCE(sqe->off);
4037
4038 if (ep_op_has_event(req->epoll.op)) {
4039 struct epoll_event __user *ev;
4040
4041 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4042 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4043 return -EFAULT;
4044 }
4045
4046 return 0;
4047#else
4048 return -EOPNOTSUPP;
4049#endif
4050}
4051
889fca73 4052static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4053{
4054#if defined(CONFIG_EPOLL)
4055 struct io_epoll *ie = &req->epoll;
4056 int ret;
45d189c6 4057 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4058
4059 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4060 if (force_nonblock && ret == -EAGAIN)
4061 return -EAGAIN;
4062
4063 if (ret < 0)
4064 req_set_fail_links(req);
889fca73 4065 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4066 return 0;
4067#else
4068 return -EOPNOTSUPP;
4069#endif
4070}
4071
c1ca757b
JA
4072static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4073{
4074#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4075 if (sqe->ioprio || sqe->buf_index || sqe->off)
4076 return -EINVAL;
3232dd02
PB
4077 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4078 return -EINVAL;
c1ca757b
JA
4079
4080 req->madvise.addr = READ_ONCE(sqe->addr);
4081 req->madvise.len = READ_ONCE(sqe->len);
4082 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4083 return 0;
4084#else
4085 return -EOPNOTSUPP;
4086#endif
4087}
4088
45d189c6 4089static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4090{
4091#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4092 struct io_madvise *ma = &req->madvise;
4093 int ret;
4094
45d189c6 4095 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4096 return -EAGAIN;
4097
0726b01e 4098 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b
JA
4099 if (ret < 0)
4100 req_set_fail_links(req);
e1e16097 4101 io_req_complete(req, ret);
c1ca757b
JA
4102 return 0;
4103#else
4104 return -EOPNOTSUPP;
4105#endif
4106}
4107
4840e418
JA
4108static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4109{
4110 if (sqe->ioprio || sqe->buf_index || sqe->addr)
4111 return -EINVAL;
3232dd02
PB
4112 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4113 return -EINVAL;
4840e418
JA
4114
4115 req->fadvise.offset = READ_ONCE(sqe->off);
4116 req->fadvise.len = READ_ONCE(sqe->len);
4117 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4118 return 0;
4119}
4120
45d189c6 4121static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
4122{
4123 struct io_fadvise *fa = &req->fadvise;
4124 int ret;
4125
45d189c6 4126 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
4127 switch (fa->advice) {
4128 case POSIX_FADV_NORMAL:
4129 case POSIX_FADV_RANDOM:
4130 case POSIX_FADV_SEQUENTIAL:
4131 break;
4132 default:
4133 return -EAGAIN;
4134 }
4135 }
4840e418
JA
4136
4137 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4138 if (ret < 0)
4139 req_set_fail_links(req);
e1e16097 4140 io_req_complete(req, ret);
4840e418
JA
4141 return 0;
4142}
4143
eddc7ef5
JA
4144static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4145{
6ca56f84 4146 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3232dd02 4147 return -EINVAL;
eddc7ef5
JA
4148 if (sqe->ioprio || sqe->buf_index)
4149 return -EINVAL;
9c280f90 4150 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4151 return -EBADF;
eddc7ef5 4152
1d9e1288
BM
4153 req->statx.dfd = READ_ONCE(sqe->fd);
4154 req->statx.mask = READ_ONCE(sqe->len);
e62753e4 4155 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
4156 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4157 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5
JA
4158
4159 return 0;
4160}
4161
45d189c6 4162static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 4163{
1d9e1288 4164 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
4165 int ret;
4166
45d189c6 4167 if (issue_flags & IO_URING_F_NONBLOCK) {
5b0bbee4
JA
4168 /* only need file table for an actual valid fd */
4169 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4170 req->flags |= REQ_F_NO_FILE_TABLE;
eddc7ef5 4171 return -EAGAIN;
5b0bbee4 4172 }
eddc7ef5 4173
e62753e4
BM
4174 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4175 ctx->buffer);
eddc7ef5 4176
eddc7ef5
JA
4177 if (ret < 0)
4178 req_set_fail_links(req);
e1e16097 4179 io_req_complete(req, ret);
eddc7ef5
JA
4180 return 0;
4181}
4182
b5dba59e
JA
4183static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4184{
14587a46 4185 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4186 return -EINVAL;
b5dba59e
JA
4187 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4188 sqe->rw_flags || sqe->buf_index)
4189 return -EINVAL;
9c280f90 4190 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4191 return -EBADF;
b5dba59e
JA
4192
4193 req->close.fd = READ_ONCE(sqe->fd);
b5dba59e 4194 return 0;
b5dba59e
JA
4195}
4196
889fca73 4197static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 4198{
9eac1904 4199 struct files_struct *files = current->files;
3af73b28 4200 struct io_close *close = &req->close;
9eac1904
JA
4201 struct fdtable *fdt;
4202 struct file *file;
b5dba59e
JA
4203 int ret;
4204
9eac1904
JA
4205 file = NULL;
4206 ret = -EBADF;
4207 spin_lock(&files->file_lock);
4208 fdt = files_fdtable(files);
4209 if (close->fd >= fdt->max_fds) {
4210 spin_unlock(&files->file_lock);
4211 goto err;
4212 }
4213 file = fdt->fd[close->fd];
4214 if (!file) {
4215 spin_unlock(&files->file_lock);
4216 goto err;
4217 }
4218
4219 if (file->f_op == &io_uring_fops) {
4220 spin_unlock(&files->file_lock);
4221 file = NULL;
4222 goto err;
3af73b28 4223 }
b5dba59e
JA
4224
4225 /* if the file has a flush method, be safe and punt to async */
45d189c6 4226 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 4227 spin_unlock(&files->file_lock);
0bf0eefd 4228 return -EAGAIN;
a2100672 4229 }
b5dba59e 4230
9eac1904
JA
4231 ret = __close_fd_get_file(close->fd, &file);
4232 spin_unlock(&files->file_lock);
4233 if (ret < 0) {
4234 if (ret == -ENOENT)
4235 ret = -EBADF;
4236 goto err;
4237 }
4238
3af73b28 4239 /* No ->flush() or already async, safely close from here */
9eac1904
JA
4240 ret = filp_close(file, current->files);
4241err:
3af73b28
PB
4242 if (ret < 0)
4243 req_set_fail_links(req);
9eac1904
JA
4244 if (file)
4245 fput(file);
889fca73 4246 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 4247 return 0;
b5dba59e
JA
4248}
4249
1155c76a 4250static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
4251{
4252 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 4253
5d17b4a4
JA
4254 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4255 return -EINVAL;
4256 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4257 return -EINVAL;
4258
8ed8d3c3
JA
4259 req->sync.off = READ_ONCE(sqe->off);
4260 req->sync.len = READ_ONCE(sqe->len);
4261 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
4262 return 0;
4263}
4264
45d189c6 4265static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4266{
8ed8d3c3
JA
4267 int ret;
4268
ac45abc0 4269 /* sync_file_range always requires a blocking context */
45d189c6 4270 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4271 return -EAGAIN;
4272
9adbd45d 4273 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
4274 req->sync.flags);
4275 if (ret < 0)
4276 req_set_fail_links(req);
e1e16097 4277 io_req_complete(req, ret);
5d17b4a4
JA
4278 return 0;
4279}
4280
469956e8 4281#if defined(CONFIG_NET)
02d27d89
PB
4282static int io_setup_async_msg(struct io_kiocb *req,
4283 struct io_async_msghdr *kmsg)
4284{
e8c2bc1f
JA
4285 struct io_async_msghdr *async_msg = req->async_data;
4286
4287 if (async_msg)
02d27d89 4288 return -EAGAIN;
e8c2bc1f 4289 if (io_alloc_async_data(req)) {
257e84a5 4290 kfree(kmsg->free_iov);
02d27d89
PB
4291 return -ENOMEM;
4292 }
e8c2bc1f 4293 async_msg = req->async_data;
02d27d89 4294 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 4295 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 4296 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
 4297	/* if we were using fast_iov, set it to the new one */
4298 if (!async_msg->free_iov)
4299 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4300
02d27d89
PB
4301 return -EAGAIN;
4302}
4303
2ae523ed
PB
4304static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4305 struct io_async_msghdr *iomsg)
4306{
2ae523ed 4307 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 4308 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 4309 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 4310 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
4311}
4312
93642ef8
PB
4313static int io_sendmsg_prep_async(struct io_kiocb *req)
4314{
4315 int ret;
4316
4317 if (!io_op_defs[req->opcode].needs_async_data)
4318 return 0;
4319 ret = io_sendmsg_copy_hdr(req, req->async_data);
4320 if (!ret)
4321 req->flags |= REQ_F_NEED_CLEANUP;
4322 return ret;
4323}
4324
3529d8c2 4325static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 4326{
e47293fd 4327 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 4328
d2b6f48b
PB
4329 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4330 return -EINVAL;
4331
e47293fd 4332 sr->msg_flags = READ_ONCE(sqe->msg_flags);
270a5940 4333 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 4334 sr->len = READ_ONCE(sqe->len);
3529d8c2 4335
d8768362
JA
4336#ifdef CONFIG_COMPAT
4337 if (req->ctx->compat)
4338 sr->msg_flags |= MSG_CMSG_COMPAT;
4339#endif
93642ef8 4340 return 0;
03b1230c
JA
4341}
4342
889fca73 4343static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4344{
6b754c8b 4345 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 4346 struct socket *sock;
7a7cacba 4347 unsigned flags;
0fa03c62
JA
4348 int ret;
4349
dba4a925 4350 sock = sock_from_file(req->file);
7a7cacba 4351 if (unlikely(!sock))
dba4a925 4352 return -ENOTSOCK;
3529d8c2 4353
257e84a5
PB
4354 kmsg = req->async_data;
4355 if (!kmsg) {
7a7cacba
PB
4356 ret = io_sendmsg_copy_hdr(req, &iomsg);
4357 if (ret)
4358 return ret;
4359 kmsg = &iomsg;
0fa03c62 4360 }
0fa03c62 4361
7a7cacba
PB
4362 flags = req->sr_msg.msg_flags;
4363 if (flags & MSG_DONTWAIT)
4364 req->flags |= REQ_F_NOWAIT;
45d189c6 4365 else if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4366 flags |= MSG_DONTWAIT;
e47293fd 4367
7a7cacba 4368 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
45d189c6 4369 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4370 return io_setup_async_msg(req, kmsg);
4371 if (ret == -ERESTARTSYS)
4372 ret = -EINTR;
0fa03c62 4373
257e84a5
PB
4374 /* fast path, check for non-NULL to avoid function call */
4375 if (kmsg->free_iov)
4376 kfree(kmsg->free_iov);
99bc4c38 4377 req->flags &= ~REQ_F_NEED_CLEANUP;
4e88d6e7
JA
4378 if (ret < 0)
4379 req_set_fail_links(req);
889fca73 4380 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 4381 return 0;
03b1230c 4382}
aa1fa28f 4383
889fca73 4384static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4385{
7a7cacba
PB
4386 struct io_sr_msg *sr = &req->sr_msg;
4387 struct msghdr msg;
4388 struct iovec iov;
fddaface 4389 struct socket *sock;
7a7cacba 4390 unsigned flags;
fddaface
JA
4391 int ret;
4392
dba4a925 4393 sock = sock_from_file(req->file);
7a7cacba 4394 if (unlikely(!sock))
dba4a925 4395 return -ENOTSOCK;
fddaface 4396
7a7cacba
PB
4397 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4398 if (unlikely(ret))
14db8411 4399 return ret;
fddaface 4400
7a7cacba
PB
4401 msg.msg_name = NULL;
4402 msg.msg_control = NULL;
4403 msg.msg_controllen = 0;
4404 msg.msg_namelen = 0;
fddaface 4405
7a7cacba
PB
4406 flags = req->sr_msg.msg_flags;
4407 if (flags & MSG_DONTWAIT)
4408 req->flags |= REQ_F_NOWAIT;
45d189c6 4409 else if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4410 flags |= MSG_DONTWAIT;
fddaface 4411
7a7cacba
PB
4412 msg.msg_flags = flags;
4413 ret = sock_sendmsg(sock, &msg);
45d189c6 4414 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4415 return -EAGAIN;
4416 if (ret == -ERESTARTSYS)
4417 ret = -EINTR;
fddaface 4418
fddaface
JA
4419 if (ret < 0)
4420 req_set_fail_links(req);
889fca73 4421 __io_req_complete(req, issue_flags, ret, 0);
fddaface 4422 return 0;
fddaface
JA
4423}
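
io_send() above is the msghdr-free fast path; a matching submission might look like the sketch below, assuming liburing's io_uring_prep_send(). The flags argument is passed through to sock_sendmsg(), with the handler adding MSG_DONTWAIT on its own non-blocking attempts; ring_send() is an illustrative wrapper.

#include <liburing.h>
#include <errno.h>
#include <string.h>

/* Send a short greeting on a connected socket through the ring. */
static int ring_send(struct io_uring *ring, int sockfd)
{
	static const char msg[] = "hello from io_uring\n";
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_send(sqe, sockfd, msg, strlen(msg), 0);
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* bytes sent, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}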
4424
1400e697
PB
4425static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4426 struct io_async_msghdr *iomsg)
52de1fe1
JA
4427{
4428 struct io_sr_msg *sr = &req->sr_msg;
4429 struct iovec __user *uiov;
4430 size_t iov_len;
4431 int ret;
4432
1400e697
PB
4433 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4434 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
4435 if (ret)
4436 return ret;
4437
4438 if (req->flags & REQ_F_BUFFER_SELECT) {
4439 if (iov_len > 1)
4440 return -EINVAL;
5476dfed 4441 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 4442 return -EFAULT;
5476dfed 4443 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 4444 iomsg->free_iov = NULL;
52de1fe1 4445 } else {
257e84a5 4446 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4447 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 4448 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 4449 false);
52de1fe1
JA
4450 if (ret > 0)
4451 ret = 0;
4452 }
4453
4454 return ret;
4455}
4456
4457#ifdef CONFIG_COMPAT
4458static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 4459 struct io_async_msghdr *iomsg)
52de1fe1
JA
4460{
4461 struct compat_msghdr __user *msg_compat;
4462 struct io_sr_msg *sr = &req->sr_msg;
4463 struct compat_iovec __user *uiov;
4464 compat_uptr_t ptr;
4465 compat_size_t len;
4466 int ret;
4467
270a5940 4468 msg_compat = (struct compat_msghdr __user *) sr->umsg;
1400e697 4469 ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
52de1fe1
JA
4470 &ptr, &len);
4471 if (ret)
4472 return ret;
4473
4474 uiov = compat_ptr(ptr);
4475 if (req->flags & REQ_F_BUFFER_SELECT) {
4476 compat_ssize_t clen;
4477
4478 if (len > 1)
4479 return -EINVAL;
4480 if (!access_ok(uiov, sizeof(*uiov)))
4481 return -EFAULT;
4482 if (__get_user(clen, &uiov->iov_len))
4483 return -EFAULT;
4484 if (clen < 0)
4485 return -EINVAL;
2d280bc8 4486 sr->len = clen;
257e84a5 4487 iomsg->free_iov = NULL;
52de1fe1 4488 } else {
257e84a5 4489 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4490 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 4491 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 4492 &iomsg->msg.msg_iter, true);
52de1fe1
JA
4493 if (ret < 0)
4494 return ret;
4495 }
4496
4497 return 0;
4498}
4499#endif
4500
1400e697
PB
4501static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4502 struct io_async_msghdr *iomsg)
52de1fe1 4503{
1400e697 4504 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
4505
4506#ifdef CONFIG_COMPAT
4507 if (req->ctx->compat)
1400e697 4508 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 4509#endif
52de1fe1 4510
1400e697 4511 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
4512}
4513
bcda7baa 4514static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
7fbb1b54 4515 bool needs_lock)
bcda7baa
JA
4516{
4517 struct io_sr_msg *sr = &req->sr_msg;
4518 struct io_buffer *kbuf;
4519
bcda7baa
JA
4520 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4521 if (IS_ERR(kbuf))
4522 return kbuf;
4523
4524 sr->kbuf = kbuf;
4525 req->flags |= REQ_F_BUFFER_SELECTED;
bcda7baa 4526 return kbuf;
fddaface
JA
4527}
4528
7fbb1b54
PB
4529static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4530{
4531 return io_put_kbuf(req, req->sr_msg.kbuf);
4532}
4533
93642ef8 4534static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 4535{
99bc4c38 4536 int ret;
3529d8c2 4537
93642ef8
PB
4538 if (!io_op_defs[req->opcode].needs_async_data)
4539 return 0;
4540 ret = io_recvmsg_copy_hdr(req, req->async_data);
4541 if (!ret)
4542 req->flags |= REQ_F_NEED_CLEANUP;
4543 return ret;
4544}
4545
4546static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4547{
4548 struct io_sr_msg *sr = &req->sr_msg;
4549
d2b6f48b
PB
4550 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4551 return -EINVAL;
4552
3529d8c2 4553 sr->msg_flags = READ_ONCE(sqe->msg_flags);
270a5940 4554 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 4555 sr->len = READ_ONCE(sqe->len);
bcda7baa 4556 sr->bgid = READ_ONCE(sqe->buf_group);
06b76d44 4557
d8768362
JA
4558#ifdef CONFIG_COMPAT
4559 if (req->ctx->compat)
4560 sr->msg_flags |= MSG_CMSG_COMPAT;
4561#endif
93642ef8 4562 return 0;
aa1fa28f
JA
4563}
4564
889fca73 4565static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4566{
6b754c8b 4567 struct io_async_msghdr iomsg, *kmsg;
03b1230c 4568 struct socket *sock;
7fbb1b54 4569 struct io_buffer *kbuf;
7a7cacba 4570 unsigned flags;
52de1fe1 4571 int ret, cflags = 0;
45d189c6 4572 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 4573
dba4a925 4574 sock = sock_from_file(req->file);
7a7cacba 4575 if (unlikely(!sock))
dba4a925 4576 return -ENOTSOCK;
3529d8c2 4577
257e84a5
PB
4578 kmsg = req->async_data;
4579 if (!kmsg) {
7a7cacba
PB
4580 ret = io_recvmsg_copy_hdr(req, &iomsg);
4581 if (ret)
681fda8d 4582 return ret;
7a7cacba
PB
4583 kmsg = &iomsg;
4584 }
03b1230c 4585
bc02ef33 4586 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4587 kbuf = io_recv_buffer_select(req, !force_nonblock);
bc02ef33 4588 if (IS_ERR(kbuf))
52de1fe1 4589 return PTR_ERR(kbuf);
7a7cacba 4590 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
4591 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4592 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
4593 1, req->sr_msg.len);
4594 }
52de1fe1 4595
7a7cacba
PB
4596 flags = req->sr_msg.msg_flags;
4597 if (flags & MSG_DONTWAIT)
4598 req->flags |= REQ_F_NOWAIT;
4599 else if (force_nonblock)
4600 flags |= MSG_DONTWAIT;
e47293fd 4601
7a7cacba
PB
4602 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4603 kmsg->uaddr, flags);
0e1b6fe3
PB
4604 if (force_nonblock && ret == -EAGAIN)
4605 return io_setup_async_msg(req, kmsg);
7a7cacba
PB
4606 if (ret == -ERESTARTSYS)
4607 ret = -EINTR;
03b1230c 4608
7fbb1b54
PB
4609 if (req->flags & REQ_F_BUFFER_SELECTED)
4610 cflags = io_put_recv_kbuf(req);
257e84a5
PB
4611 /* fast path, check for non-NULL to avoid function call */
4612 if (kmsg->free_iov)
4613 kfree(kmsg->free_iov);
99bc4c38 4614 req->flags &= ~REQ_F_NEED_CLEANUP;
4e88d6e7
JA
4615 if (ret < 0)
4616 req_set_fail_links(req);
889fca73 4617 __io_req_complete(req, issue_flags, ret, cflags);
03b1230c 4618 return 0;
0fa03c62 4619}
5d17b4a4 4620
889fca73 4621static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4622{
6b754c8b 4623 struct io_buffer *kbuf;
7a7cacba
PB
4624 struct io_sr_msg *sr = &req->sr_msg;
4625 struct msghdr msg;
4626 void __user *buf = sr->buf;
fddaface 4627 struct socket *sock;
7a7cacba
PB
4628 struct iovec iov;
4629 unsigned flags;
bcda7baa 4630 int ret, cflags = 0;
45d189c6 4631 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 4632
dba4a925 4633 sock = sock_from_file(req->file);
7a7cacba 4634 if (unlikely(!sock))
dba4a925 4635 return -ENOTSOCK;
fddaface 4636
bc02ef33 4637 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4638 kbuf = io_recv_buffer_select(req, !force_nonblock);
bcda7baa
JA
4639 if (IS_ERR(kbuf))
4640 return PTR_ERR(kbuf);
7a7cacba 4641 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 4642 }
bcda7baa 4643
7a7cacba 4644 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
4645 if (unlikely(ret))
4646 goto out_free;
fddaface 4647
7a7cacba
PB
4648 msg.msg_name = NULL;
4649 msg.msg_control = NULL;
4650 msg.msg_controllen = 0;
4651 msg.msg_namelen = 0;
4652 msg.msg_iocb = NULL;
4653 msg.msg_flags = 0;
fddaface 4654
7a7cacba
PB
4655 flags = req->sr_msg.msg_flags;
4656 if (flags & MSG_DONTWAIT)
4657 req->flags |= REQ_F_NOWAIT;
4658 else if (force_nonblock)
4659 flags |= MSG_DONTWAIT;
4660
4661 ret = sock_recvmsg(sock, &msg, flags);
4662 if (force_nonblock && ret == -EAGAIN)
4663 return -EAGAIN;
4664 if (ret == -ERESTARTSYS)
4665 ret = -EINTR;
14c32eee 4666out_free:
7fbb1b54
PB
4667 if (req->flags & REQ_F_BUFFER_SELECTED)
4668 cflags = io_put_recv_kbuf(req);
fddaface
JA
4669 if (ret < 0)
4670 req_set_fail_links(req);
889fca73 4671 __io_req_complete(req, issue_flags, ret, cflags);
fddaface 4672 return 0;
fddaface
JA
4673}
4674
3529d8c2 4675static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 4676{
8ed8d3c3
JA
4677 struct io_accept *accept = &req->accept;
4678
14587a46 4679 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 4680 return -EINVAL;
8042d6ce 4681 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
4682 return -EINVAL;
4683
d55e5f5b
JA
4684 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4685 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 4686 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 4687 accept->nofile = rlimit(RLIMIT_NOFILE);
8ed8d3c3 4688 return 0;
8ed8d3c3 4689}
17f2fe35 4690
889fca73 4691static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
4692{
4693 struct io_accept *accept = &req->accept;
45d189c6 4694 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 4695 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
8ed8d3c3
JA
4696 int ret;
4697
e697deed
JX
4698 if (req->file->f_flags & O_NONBLOCK)
4699 req->flags |= REQ_F_NOWAIT;
4700
8ed8d3c3 4701 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
09952e3e
JA
4702 accept->addr_len, accept->flags,
4703 accept->nofile);
8ed8d3c3 4704 if (ret == -EAGAIN && force_nonblock)
17f2fe35 4705 return -EAGAIN;
ac45abc0
PB
4706 if (ret < 0) {
4707 if (ret == -ERESTARTSYS)
4708 ret = -EINTR;
4e88d6e7 4709 req_set_fail_links(req);
ac45abc0 4710 }
889fca73 4711 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 4712 return 0;
8ed8d3c3
JA
4713}
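
A sketch of driving io_accept() above from userspace, assuming liburing's io_uring_prep_accept(); as in the handler, the peer-address arguments are optional and could be NULL. The ring_accept() wrapper is illustrative.

#include <liburing.h>
#include <errno.h>
#include <sys/socket.h>

/* Accept one connection on a listening socket; returns the new fd or -errno. */
static int ring_accept(struct io_uring *ring, int listen_fd)
{
	struct sockaddr_storage addr;
	socklen_t addrlen = sizeof(addr);
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_accept(sqe, listen_fd, (struct sockaddr *)&addr,
			     &addrlen, 0);
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* accepted fd, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}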
4714
93642ef8
PB
4715static int io_connect_prep_async(struct io_kiocb *req)
4716{
4717 struct io_async_connect *io = req->async_data;
4718 struct io_connect *conn = &req->connect;
4719
4720 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4721}
4722
3529d8c2 4723static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 4724{
3529d8c2 4725 struct io_connect *conn = &req->connect;
f499a021 4726
14587a46 4727 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1
JA
4728 return -EINVAL;
4729 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4730 return -EINVAL;
4731
3529d8c2
JA
4732 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4733 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 4734 return 0;
f499a021
JA
4735}
4736
889fca73 4737static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 4738{
e8c2bc1f 4739 struct io_async_connect __io, *io;
f8e85cf2 4740 unsigned file_flags;
3fbb51c1 4741 int ret;
45d189c6 4742 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 4743
e8c2bc1f
JA
4744 if (req->async_data) {
4745 io = req->async_data;
f499a021 4746 } else {
3529d8c2
JA
4747 ret = move_addr_to_kernel(req->connect.addr,
4748 req->connect.addr_len,
e8c2bc1f 4749 &__io.address);
f499a021
JA
4750 if (ret)
4751 goto out;
4752 io = &__io;
4753 }
4754
3fbb51c1
JA
4755 file_flags = force_nonblock ? O_NONBLOCK : 0;
4756
e8c2bc1f 4757 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 4758 req->connect.addr_len, file_flags);
87f80d62 4759 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
e8c2bc1f 4760 if (req->async_data)
b7bb4f7d 4761 return -EAGAIN;
e8c2bc1f 4762 if (io_alloc_async_data(req)) {
f499a021
JA
4763 ret = -ENOMEM;
4764 goto out;
4765 }
e8c2bc1f
JA
4766 io = req->async_data;
4767 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 4768 return -EAGAIN;
f499a021 4769 }
f8e85cf2
JA
4770 if (ret == -ERESTARTSYS)
4771 ret = -EINTR;
f499a021 4772out:
4e88d6e7
JA
4773 if (ret < 0)
4774 req_set_fail_links(req);
889fca73 4775 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 4776 return 0;
469956e8
Y
4777}
4778#else /* !CONFIG_NET */
99a10081
JA
4779#define IO_NETOP_FN(op) \
4780static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
4781{ \
4782 return -EOPNOTSUPP; \
4783}
4784
4785#define IO_NETOP_PREP(op) \
4786IO_NETOP_FN(op) \
4787static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4788{ \
4789 return -EOPNOTSUPP; \
4790} \
4791
4792#define IO_NETOP_PREP_ASYNC(op) \
4793IO_NETOP_PREP(op) \
4794static int io_##op##_prep_async(struct io_kiocb *req) \
4795{ \
4796 return -EOPNOTSUPP; \
4797}
4798
4799IO_NETOP_PREP_ASYNC(sendmsg);
4800IO_NETOP_PREP_ASYNC(recvmsg);
4801IO_NETOP_PREP_ASYNC(connect);
4802IO_NETOP_PREP(accept);
4803IO_NETOP_FN(send);
4804IO_NETOP_FN(recv);
469956e8 4805#endif /* CONFIG_NET */
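
To round off the network handlers, a sketch of an asynchronous connect, assuming liburing's io_uring_prep_connect(). The loopback address and the ring_connect() wrapper are made up for the example; the copy into req->async_data in io_connect() above is what lets the kernel retry after -EAGAIN/-EINPROGRESS without re-reading userspace memory.

#include <liburing.h>
#include <errno.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Connect 'sockfd' to 127.0.0.1:<port> via the ring; returns 0 or -errno. */
static int ring_connect(struct io_uring *ring, int sockfd, unsigned short port)
{
	struct sockaddr_in sa;
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	memset(&sa, 0, sizeof(sa));
	sa.sin_family = AF_INET;
	sa.sin_port = htons(port);
	sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	io_uring_prep_connect(sqe, sockfd, (struct sockaddr *)&sa, sizeof(sa));
	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* 0, or -errno (e.g. -ECONNREFUSED) */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}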
f8e85cf2 4806
d7718a9d
JA
4807struct io_poll_table {
4808 struct poll_table_struct pt;
4809 struct io_kiocb *req;
4810 int error;
4811};
ce593a6c 4812
d7718a9d
JA
4813static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4814 __poll_t mask, task_work_func_t func)
4815{
aa96bf8a 4816 int ret;
d7718a9d
JA
4817
4818 /* for instances that support it check for an event match first: */
4819 if (mask && !(mask & poll->events))
4820 return 0;
4821
4822 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4823
4824 list_del_init(&poll->wait.entry);
4825
d7718a9d 4826 req->result = mask;
7cbf1722 4827 req->task_work.func = func;
6d816e08
JA
4828 percpu_ref_get(&req->ctx->refs);
4829
d7718a9d 4830 /*
e3aabf95
JA
4831 * If this fails, then the task is exiting. When a task exits, the
4832 * work gets canceled, so just cancel this request as well instead
4833 * of executing it. We can't safely execute it anyway, as we may not
 4834	 * have the state needed for it.
d7718a9d 4835 */
355fb9e2 4836 ret = io_req_task_work_add(req);
aa96bf8a 4837 if (unlikely(ret)) {
e3aabf95 4838 WRITE_ONCE(poll->canceled, true);
eab30c4d 4839 io_req_task_work_add_fallback(req, func);
aa96bf8a 4840 }
d7718a9d
JA
4841 return 1;
4842}
4843
74ce6ce4
JA
4844static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4845 __acquires(&req->ctx->completion_lock)
4846{
4847 struct io_ring_ctx *ctx = req->ctx;
4848
4849 if (!req->result && !READ_ONCE(poll->canceled)) {
4850 struct poll_table_struct pt = { ._key = poll->events };
4851
4852 req->result = vfs_poll(req->file, &pt) & poll->events;
4853 }
4854
4855 spin_lock_irq(&ctx->completion_lock);
4856 if (!req->result && !READ_ONCE(poll->canceled)) {
4857 add_wait_queue(poll->head, &poll->wait);
4858 return true;
4859 }
4860
4861 return false;
4862}
4863
d4e7cd36 4864static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 4865{
e8c2bc1f 4866 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 4867 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 4868 return req->async_data;
d4e7cd36
JA
4869 return req->apoll->double_poll;
4870}
4871
4872static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4873{
4874 if (req->opcode == IORING_OP_POLL_ADD)
4875 return &req->poll;
4876 return &req->apoll->poll;
4877}
4878
4879static void io_poll_remove_double(struct io_kiocb *req)
4880{
4881 struct io_poll_iocb *poll = io_poll_get_double(req);
18bceab1
JA
4882
4883 lockdep_assert_held(&req->ctx->completion_lock);
4884
4885 if (poll && poll->head) {
4886 struct wait_queue_head *head = poll->head;
4887
4888 spin_lock(&head->lock);
4889 list_del_init(&poll->wait.entry);
4890 if (poll->wait.private)
4891 refcount_dec(&req->refs);
4892 poll->head = NULL;
4893 spin_unlock(&head->lock);
4894 }
4895}
4896
4897static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4898{
4899 struct io_ring_ctx *ctx = req->ctx;
4900
d4e7cd36 4901 io_poll_remove_double(req);
18bceab1
JA
4902 req->poll.done = true;
4903 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4904 io_commit_cqring(ctx);
4905}
4906
dd221f46 4907static void io_poll_task_func(struct callback_head *cb)
18bceab1 4908{
dd221f46 4909 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
18bceab1 4910 struct io_ring_ctx *ctx = req->ctx;
dd221f46 4911 struct io_kiocb *nxt;
18bceab1
JA
4912
4913 if (io_poll_rewait(req, &req->poll)) {
4914 spin_unlock_irq(&ctx->completion_lock);
dd221f46
PB
4915 } else {
4916 hash_del(&req->hash_node);
4917 io_poll_complete(req, req->result, 0);
4918 spin_unlock_irq(&ctx->completion_lock);
18bceab1 4919
dd221f46
PB
4920 nxt = io_put_req_find_next(req);
4921 io_cqring_ev_posted(ctx);
4922 if (nxt)
4923 __io_req_task_submit(nxt);
4924 }
18bceab1 4925
6d816e08 4926 percpu_ref_put(&ctx->refs);
18bceab1
JA
4927}
4928
4929static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4930 int sync, void *key)
4931{
4932 struct io_kiocb *req = wait->private;
d4e7cd36 4933 struct io_poll_iocb *poll = io_poll_get_single(req);
18bceab1
JA
4934 __poll_t mask = key_to_poll(key);
4935
4936 /* for instances that support it check for an event match first: */
4937 if (mask && !(mask & poll->events))
4938 return 0;
4939
8706e04e
JA
4940 list_del_init(&wait->entry);
4941
807abcb0 4942 if (poll && poll->head) {
18bceab1
JA
4943 bool done;
4944
807abcb0
JA
4945 spin_lock(&poll->head->lock);
4946 done = list_empty(&poll->wait.entry);
18bceab1 4947 if (!done)
807abcb0 4948 list_del_init(&poll->wait.entry);
d4e7cd36
JA
4949 /* make sure double remove sees this as being gone */
4950 wait->private = NULL;
807abcb0 4951 spin_unlock(&poll->head->lock);
c8b5e260
JA
4952 if (!done) {
4953 /* use wait func handler, so it matches the rq type */
4954 poll->wait.func(&poll->wait, mode, sync, key);
4955 }
18bceab1
JA
4956 }
4957 refcount_dec(&req->refs);
4958 return 1;
4959}
4960
4961static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4962 wait_queue_func_t wake_func)
4963{
4964 poll->head = NULL;
4965 poll->done = false;
4966 poll->canceled = false;
4967 poll->events = events;
4968 INIT_LIST_HEAD(&poll->wait.entry);
4969 init_waitqueue_func_entry(&poll->wait, wake_func);
4970}
4971
4972static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
4973 struct wait_queue_head *head,
4974 struct io_poll_iocb **poll_ptr)
18bceab1
JA
4975{
4976 struct io_kiocb *req = pt->req;
4977
4978 /*
4979 * If poll->head is already set, it's because the file being polled
 4980	 * uses multiple waitqueues for poll handling (e.g. one for read, one
 4981	 * for write). Set up a separate io_poll_iocb if this happens.
4982 */
4983 if (unlikely(poll->head)) {
58852d4d
PB
4984 struct io_poll_iocb *poll_one = poll;
4985
18bceab1 4986 /* already have a 2nd entry, fail a third attempt */
807abcb0 4987 if (*poll_ptr) {
18bceab1
JA
4988 pt->error = -EINVAL;
4989 return;
4990 }
1c3b3e65
JA
4991 /* double add on the same waitqueue head, ignore */
4992 if (poll->head == head)
4993 return;
18bceab1
JA
4994 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4995 if (!poll) {
4996 pt->error = -ENOMEM;
4997 return;
4998 }
58852d4d 4999 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
18bceab1
JA
5000 refcount_inc(&req->refs);
5001 poll->wait.private = req;
807abcb0 5002 *poll_ptr = poll;
18bceab1
JA
5003 }
5004
5005 pt->error = 0;
5006 poll->head = head;
a31eb4a2
JX
5007
5008 if (poll->events & EPOLLEXCLUSIVE)
5009 add_wait_queue_exclusive(head, &poll->wait);
5010 else
5011 add_wait_queue(head, &poll->wait);
18bceab1
JA
5012}
5013
5014static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5015 struct poll_table_struct *p)
5016{
5017 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
807abcb0 5018 struct async_poll *apoll = pt->req->apoll;
18bceab1 5019
807abcb0 5020 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
18bceab1
JA
5021}
5022
d7718a9d
JA
5023static void io_async_task_func(struct callback_head *cb)
5024{
5025 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5026 struct async_poll *apoll = req->apoll;
5027 struct io_ring_ctx *ctx = req->ctx;
5028
5029 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5030
74ce6ce4 5031 if (io_poll_rewait(req, &apoll->poll)) {
d7718a9d 5032 spin_unlock_irq(&ctx->completion_lock);
6d816e08 5033 percpu_ref_put(&ctx->refs);
74ce6ce4 5034 return;
d7718a9d
JA
5035 }
5036
31067255 5037 /* If req is still hashed, it cannot have been canceled. Don't check. */
0be0b0e3 5038 if (hash_hashed(&req->hash_node))
74ce6ce4 5039 hash_del(&req->hash_node);
2bae047e 5040
d4e7cd36 5041 io_poll_remove_double(req);
74ce6ce4
JA
5042 spin_unlock_irq(&ctx->completion_lock);
5043
0be0b0e3
PB
5044 if (!READ_ONCE(apoll->poll.canceled))
5045 __io_req_task_submit(req);
5046 else
5047 __io_req_task_cancel(req, -ECANCELED);
aa340845 5048
6d816e08 5049 percpu_ref_put(&ctx->refs);
807abcb0 5050 kfree(apoll->double_poll);
31067255 5051 kfree(apoll);
d7718a9d
JA
5052}
5053
5054static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5055 void *key)
5056{
5057 struct io_kiocb *req = wait->private;
5058 struct io_poll_iocb *poll = &req->apoll->poll;
5059
5060 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5061 key_to_poll(key));
5062
5063 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5064}
5065
5066static void io_poll_req_insert(struct io_kiocb *req)
5067{
5068 struct io_ring_ctx *ctx = req->ctx;
5069 struct hlist_head *list;
5070
5071 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5072 hlist_add_head(&req->hash_node, list);
5073}
5074
5075static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5076 struct io_poll_iocb *poll,
5077 struct io_poll_table *ipt, __poll_t mask,
5078 wait_queue_func_t wake_func)
5079 __acquires(&ctx->completion_lock)
5080{
5081 struct io_ring_ctx *ctx = req->ctx;
5082 bool cancel = false;
5083
4d52f338 5084 INIT_HLIST_NODE(&req->hash_node);
18bceab1 5085 io_init_poll_iocb(poll, mask, wake_func);
b90cd197 5086 poll->file = req->file;
18bceab1 5087 poll->wait.private = req;
d7718a9d
JA
5088
5089 ipt->pt._key = mask;
5090 ipt->req = req;
5091 ipt->error = -EINVAL;
5092
d7718a9d
JA
5093 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5094
5095 spin_lock_irq(&ctx->completion_lock);
5096 if (likely(poll->head)) {
5097 spin_lock(&poll->head->lock);
5098 if (unlikely(list_empty(&poll->wait.entry))) {
5099 if (ipt->error)
5100 cancel = true;
5101 ipt->error = 0;
5102 mask = 0;
5103 }
5104 if (mask || ipt->error)
5105 list_del_init(&poll->wait.entry);
5106 else if (cancel)
5107 WRITE_ONCE(poll->canceled, true);
5108 else if (!poll->done) /* actually waiting for an event */
5109 io_poll_req_insert(req);
5110 spin_unlock(&poll->head->lock);
5111 }
5112
5113 return mask;
5114}
5115
5116static bool io_arm_poll_handler(struct io_kiocb *req)
5117{
5118 const struct io_op_def *def = &io_op_defs[req->opcode];
5119 struct io_ring_ctx *ctx = req->ctx;
5120 struct async_poll *apoll;
5121 struct io_poll_table ipt;
5122 __poll_t mask, ret;
9dab14b8 5123 int rw;
d7718a9d
JA
5124
5125 if (!req->file || !file_can_poll(req->file))
5126 return false;
24c74678 5127 if (req->flags & REQ_F_POLLED)
d7718a9d 5128 return false;
9dab14b8
JA
5129 if (def->pollin)
5130 rw = READ;
5131 else if (def->pollout)
5132 rw = WRITE;
5133 else
5134 return false;
 5135	/* if a nonblock try isn't possible, no point in arming a poll handler */
5136 if (!io_file_supports_async(req->file, rw))
d7718a9d
JA
5137 return false;
5138
5139 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5140 if (unlikely(!apoll))
5141 return false;
807abcb0 5142 apoll->double_poll = NULL;
d7718a9d
JA
5143
5144 req->flags |= REQ_F_POLLED;
d7718a9d 5145 req->apoll = apoll;
d7718a9d 5146
8755d97a 5147 mask = 0;
d7718a9d 5148 if (def->pollin)
8755d97a 5149 mask |= POLLIN | POLLRDNORM;
d7718a9d
JA
5150 if (def->pollout)
5151 mask |= POLLOUT | POLLWRNORM;
901341bb
LH
5152
5153 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5154 if ((req->opcode == IORING_OP_RECVMSG) &&
5155 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5156 mask &= ~POLLIN;
5157
d7718a9d
JA
5158 mask |= POLLERR | POLLPRI;
5159
5160 ipt.pt._qproc = io_async_queue_proc;
5161
5162 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5163 io_async_wake);
a36da65c 5164 if (ret || ipt.error) {
d4e7cd36 5165 io_poll_remove_double(req);
d7718a9d 5166 spin_unlock_irq(&ctx->completion_lock);
807abcb0 5167 kfree(apoll->double_poll);
d7718a9d
JA
5168 kfree(apoll);
5169 return false;
5170 }
5171 spin_unlock_irq(&ctx->completion_lock);
5172 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5173 apoll->poll.events);
5174 return true;
5175}
5176
5177static bool __io_poll_remove_one(struct io_kiocb *req,
5178 struct io_poll_iocb *poll)
221c5eb2 5179{
b41e9852 5180 bool do_complete = false;
221c5eb2
JA
5181
5182 spin_lock(&poll->head->lock);
5183 WRITE_ONCE(poll->canceled, true);
392edb45
JA
5184 if (!list_empty(&poll->wait.entry)) {
5185 list_del_init(&poll->wait.entry);
b41e9852 5186 do_complete = true;
221c5eb2
JA
5187 }
5188 spin_unlock(&poll->head->lock);
3bfa5bcb 5189 hash_del(&req->hash_node);
d7718a9d
JA
5190 return do_complete;
5191}
5192
5193static bool io_poll_remove_one(struct io_kiocb *req)
5194{
5195 bool do_complete;
5196
d4e7cd36
JA
5197 io_poll_remove_double(req);
5198
d7718a9d
JA
5199 if (req->opcode == IORING_OP_POLL_ADD) {
5200 do_complete = __io_poll_remove_one(req, &req->poll);
5201 } else {
3bfa5bcb
JA
5202 struct async_poll *apoll = req->apoll;
5203
d7718a9d 5204		/* non-poll requests still have the submit ref */
3bfa5bcb
JA
5205 do_complete = __io_poll_remove_one(req, &apoll->poll);
5206 if (do_complete) {
d7718a9d 5207 io_put_req(req);
807abcb0 5208 kfree(apoll->double_poll);
3bfa5bcb
JA
5209 kfree(apoll);
5210 }
b1f573bd
XW
5211 }
5212
b41e9852
JA
5213 if (do_complete) {
5214 io_cqring_fill_event(req, -ECANCELED);
5215 io_commit_cqring(req->ctx);
f254ac04 5216 req_set_fail_links(req);
216578e5 5217 io_put_req_deferred(req, 1);
b41e9852
JA
5218 }
5219
5220 return do_complete;
221c5eb2
JA
5221}
5222
76e1b642
JA
5223/*
5224 * Returns true if we found and killed one or more poll requests
5225 */
6b81928d
PB
5226static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5227 struct files_struct *files)
221c5eb2 5228{
78076bb6 5229 struct hlist_node *tmp;
221c5eb2 5230 struct io_kiocb *req;
8e2e1faf 5231 int posted = 0, i;
221c5eb2
JA
5232
5233 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
5234 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5235 struct hlist_head *list;
5236
5237 list = &ctx->cancel_hash[i];
f3606e3a 5238 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
6b81928d 5239 if (io_match_task(req, tsk, files))
f3606e3a
JA
5240 posted += io_poll_remove_one(req);
5241 }
221c5eb2
JA
5242 }
5243 spin_unlock_irq(&ctx->completion_lock);
b41e9852 5244
8e2e1faf
JA
5245 if (posted)
5246 io_cqring_ev_posted(ctx);
76e1b642
JA
5247
5248 return posted != 0;
221c5eb2
JA
5249}
5250
47f46768
JA
5251static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5252{
78076bb6 5253 struct hlist_head *list;
47f46768
JA
5254 struct io_kiocb *req;
5255
78076bb6
JA
5256 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5257 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
5258 if (sqe_addr != req->user_data)
5259 continue;
5260 if (io_poll_remove_one(req))
eac406c6 5261 return 0;
b41e9852 5262 return -EALREADY;
47f46768
JA
5263 }
5264
5265 return -ENOENT;
5266}
5267
3529d8c2
JA
5268static int io_poll_remove_prep(struct io_kiocb *req,
5269 const struct io_uring_sqe *sqe)
0969e783 5270{
0969e783
JA
5271 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5272 return -EINVAL;
5273 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5274 sqe->poll_events)
5275 return -EINVAL;
5276
018043be 5277 req->poll_remove.addr = READ_ONCE(sqe->addr);
0969e783
JA
5278 return 0;
5279}
5280
221c5eb2
JA
5281/*
5282 * Find a running poll command that matches one specified in sqe->addr,
5283 * and remove it if found.
5284 */
61e98203 5285static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
221c5eb2
JA
5286{
5287 struct io_ring_ctx *ctx = req->ctx;
47f46768 5288 int ret;
221c5eb2 5289
221c5eb2 5290 spin_lock_irq(&ctx->completion_lock);
018043be 5291 ret = io_poll_cancel(ctx, req->poll_remove.addr);
221c5eb2
JA
5292 spin_unlock_irq(&ctx->completion_lock);
5293
4e88d6e7
JA
5294 if (ret < 0)
5295 req_set_fail_links(req);
e1e16097 5296 io_req_complete(req, ret);
221c5eb2
JA
5297 return 0;
5298}
5299
221c5eb2
JA
5300static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5301 void *key)
5302{
c2f2eb7d
JA
5303 struct io_kiocb *req = wait->private;
5304 struct io_poll_iocb *poll = &req->poll;
221c5eb2 5305
d7718a9d 5306 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
5307}
5308
221c5eb2
JA
5309static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5310 struct poll_table_struct *p)
5311{
5312 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5313
e8c2bc1f 5314 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
eac406c6
JA
5315}
5316
3529d8c2 5317static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
5318{
5319 struct io_poll_iocb *poll = &req->poll;
5769a351 5320 u32 events;
221c5eb2
JA
5321
5322 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5323 return -EINVAL;
5324 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5325 return -EINVAL;
5326
5769a351
JX
5327 events = READ_ONCE(sqe->poll32_events);
5328#ifdef __BIG_ENDIAN
5329 events = swahw32(events);
5330#endif
a31eb4a2
JX
5331 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5332 (events & EPOLLEXCLUSIVE);
0969e783
JA
5333 return 0;
5334}
5335
61e98203 5336static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
5337{
5338 struct io_poll_iocb *poll = &req->poll;
5339 struct io_ring_ctx *ctx = req->ctx;
5340 struct io_poll_table ipt;
0969e783 5341 __poll_t mask;
0969e783 5342
d7718a9d 5343 ipt.pt._qproc = io_poll_queue_proc;
36703247 5344
d7718a9d
JA
5345 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5346 io_poll_wake);
221c5eb2 5347
8c838788 5348 if (mask) { /* no async, we'd stolen it */
221c5eb2 5349 ipt.error = 0;
b0dd8a41 5350 io_poll_complete(req, mask, 0);
221c5eb2 5351 }
221c5eb2
JA
5352 spin_unlock_irq(&ctx->completion_lock);
5353
8c838788
JA
5354 if (mask) {
5355 io_cqring_ev_posted(ctx);
014db007 5356 io_put_req(req);
221c5eb2 5357 }
8c838788 5358 return ipt.error;
221c5eb2
JA
5359}
5360
5262f567
JA
5361static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5362{
ad8a48ac
JA
5363 struct io_timeout_data *data = container_of(timer,
5364 struct io_timeout_data, timer);
5365 struct io_kiocb *req = data->req;
5366 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
5367 unsigned long flags;
5368
5262f567 5369 spin_lock_irqsave(&ctx->completion_lock, flags);
a71976f3 5370 list_del_init(&req->timeout.list);
01cec8c1
PB
5371 atomic_set(&req->ctx->cq_timeouts,
5372 atomic_read(&req->ctx->cq_timeouts) + 1);
5373
78e19bbe 5374 io_cqring_fill_event(req, -ETIME);
5262f567
JA
5375 io_commit_cqring(ctx);
5376 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5377
5378 io_cqring_ev_posted(ctx);
4e88d6e7 5379 req_set_fail_links(req);
5262f567
JA
5380 io_put_req(req);
5381 return HRTIMER_NORESTART;
5382}
5383
fbd15848
PB
5384static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5385 __u64 user_data)
f254ac04 5386{
fbd15848 5387 struct io_timeout_data *io;
47f46768
JA
5388 struct io_kiocb *req;
5389 int ret = -ENOENT;
f254ac04 5390
135fcde8 5391 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
47f46768 5392 if (user_data == req->user_data) {
47f46768
JA
5393 ret = 0;
5394 break;
5395 }
5396 }
5397
5398 if (ret == -ENOENT)
fbd15848
PB
5399 return ERR_PTR(ret);
5400
5401 io = req->async_data;
e8c2bc1f 5402 ret = hrtimer_try_to_cancel(&io->timer);
f254ac04 5403 if (ret == -1)
fbd15848 5404 return ERR_PTR(-EALREADY);
a71976f3 5405 list_del_init(&req->timeout.list);
fbd15848
PB
5406 return req;
5407}
47f46768 5408
fbd15848
PB
5409static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5410{
5411 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5412
5413 if (IS_ERR(req))
5414 return PTR_ERR(req);
f254ac04
JA
5415
5416 req_set_fail_links(req);
f254ac04 5417 io_cqring_fill_event(req, -ECANCELED);
216578e5 5418 io_put_req_deferred(req, 1);
f254ac04
JA
5419 return 0;
5420}
5421
9c8e11b3
PB
5422static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5423 struct timespec64 *ts, enum hrtimer_mode mode)
47f46768 5424{
9c8e11b3
PB
5425 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5426 struct io_timeout_data *data;
47f46768 5427
9c8e11b3
PB
5428 if (IS_ERR(req))
5429 return PTR_ERR(req);
47f46768 5430
9c8e11b3
PB
5431 req->timeout.off = 0; /* noseq */
5432 data = req->async_data;
5433 list_add_tail(&req->timeout.list, &ctx->timeout_list);
5434 hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5435 data->timer.function = io_timeout_fn;
5436 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5437 return 0;
47f46768
JA
5438}
5439
3529d8c2
JA
5440static int io_timeout_remove_prep(struct io_kiocb *req,
5441 const struct io_uring_sqe *sqe)
b29472ee 5442{
9c8e11b3
PB
5443 struct io_timeout_rem *tr = &req->timeout_rem;
5444
b29472ee
JA
5445 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5446 return -EINVAL;
61710e43
DA
5447 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5448 return -EINVAL;
9c8e11b3 5449 if (sqe->ioprio || sqe->buf_index || sqe->len)
b29472ee
JA
5450 return -EINVAL;
5451
9c8e11b3
PB
5452 tr->addr = READ_ONCE(sqe->addr);
5453 tr->flags = READ_ONCE(sqe->timeout_flags);
5454 if (tr->flags & IORING_TIMEOUT_UPDATE) {
5455 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5456 return -EINVAL;
5457 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5458 return -EFAULT;
5459 } else if (tr->flags) {
5460 /* timeout removal doesn't support flags */
b29472ee 5461 return -EINVAL;
9c8e11b3 5462 }
b29472ee 5463
b29472ee
JA
5464 return 0;
5465}
5466
8662daec
PB
5467static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5468{
5469 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5470 : HRTIMER_MODE_REL;
5471}
5472
11365043
JA
5473/*
5474 * Remove or update an existing timeout command
5475 */
61e98203 5476static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 5477{
9c8e11b3 5478 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 5479 struct io_ring_ctx *ctx = req->ctx;
47f46768 5480 int ret;
11365043 5481
11365043 5482 spin_lock_irq(&ctx->completion_lock);
8662daec 5483 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
9c8e11b3 5484 ret = io_timeout_cancel(ctx, tr->addr);
8662daec
PB
5485 else
5486 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5487 io_translate_timeout_mode(tr->flags));
11365043 5488
47f46768 5489 io_cqring_fill_event(req, ret);
11365043
JA
5490 io_commit_cqring(ctx);
5491 spin_unlock_irq(&ctx->completion_lock);
5262f567 5492 io_cqring_ev_posted(ctx);
4e88d6e7
JA
5493 if (ret < 0)
5494 req_set_fail_links(req);
ec9c02ad 5495 io_put_req(req);
11365043 5496 return 0;
5262f567
JA
5497}
5498
3529d8c2 5499static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 5500 bool is_timeout_link)
5262f567 5501{
ad8a48ac 5502 struct io_timeout_data *data;
a41525ab 5503 unsigned flags;
56080b02 5504 u32 off = READ_ONCE(sqe->off);
5262f567 5505
ad8a48ac 5506 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 5507 return -EINVAL;
ad8a48ac 5508 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 5509 return -EINVAL;
56080b02 5510 if (off && is_timeout_link)
2d28390a 5511 return -EINVAL;
a41525ab
JA
5512 flags = READ_ONCE(sqe->timeout_flags);
5513 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 5514 return -EINVAL;
bdf20073 5515
bfe68a22 5516 req->timeout.off = off;
26a61679 5517
e8c2bc1f 5518 if (!req->async_data && io_alloc_async_data(req))
26a61679
JA
5519 return -ENOMEM;
5520
e8c2bc1f 5521 data = req->async_data;
ad8a48ac 5522 data->req = req;
ad8a48ac
JA
5523
5524 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
5525 return -EFAULT;
5526
8662daec 5527 data->mode = io_translate_timeout_mode(flags);
ad8a48ac 5528 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
dd59a3d5 5529 io_req_track_inflight(req);
ad8a48ac
JA
5530 return 0;
5531}
5532
61e98203 5533static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 5534{
ad8a48ac 5535 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 5536 struct io_timeout_data *data = req->async_data;
ad8a48ac 5537 struct list_head *entry;
bfe68a22 5538 u32 tail, off = req->timeout.off;
ad8a48ac 5539
733f5c95 5540 spin_lock_irq(&ctx->completion_lock);
93bd25bb 5541
5262f567
JA
5542 /*
5543	 * sqe->off holds how many events need to occur for this
93bd25bb
JA
5544 * timeout event to be satisfied. If it isn't set, then this is
5545	 * a pure timeout request; the sequence isn't used.
5262f567 5546 */
8eb7e2d0 5547 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
5548 entry = ctx->timeout_list.prev;
5549 goto add;
5550 }
5262f567 5551
bfe68a22
PB
5552 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5553 req->timeout.target_seq = tail + off;
5262f567 5554
f010505b
MDG
5555 /* Update the last seq here in case io_flush_timeouts() hasn't.
5556 * This is safe because ->completion_lock is held, and submissions
5557 * and completions are never mixed in the same ->completion_lock section.
5558 */
5559 ctx->cq_last_tm_flush = tail;
5560
5262f567
JA
5561 /*
5562 * Insertion sort, ensuring the first entry in the list is always
5563 * the one we need first.
5564 */
5262f567 5565 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
5566 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5567 timeout.list);
5262f567 5568
8eb7e2d0 5569 if (io_is_timeout_noseq(nxt))
93bd25bb 5570 continue;
bfe68a22
PB
5571 /* nxt.seq is behind @tail, otherwise would've been completed */
5572 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
5573 break;
5574 }
93bd25bb 5575add:
135fcde8 5576 list_add(&req->timeout.list, entry);
ad8a48ac
JA
5577 data->timer.function = io_timeout_fn;
5578 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 5579 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
5580 return 0;
5581}
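/*
 * Worked example of the sequence math above: if 100 non-timeout
 * completions have been posted so far (cached_cq_tail - cq_timeouts
 * == 100) and the SQE carried off == 5, then target_seq == 105 and
 * the timeout is satisfied after five more non-timeout completions,
 * unless the hrtimer expires first and io_timeout_fn() completes it
 * with -ETIME.
 */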
5262f567 5582
f458dd84
PB
5583struct io_cancel_data {
5584 struct io_ring_ctx *ctx;
5585 u64 user_data;
5586};
5587
62755e35
JA
5588static bool io_cancel_cb(struct io_wq_work *work, void *data)
5589{
5590 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 5591 struct io_cancel_data *cd = data;
62755e35 5592
f458dd84 5593 return req->ctx == cd->ctx && req->user_data == cd->user_data;
62755e35
JA
5594}
5595
f458dd84
PB
5596static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
5597 struct io_ring_ctx *ctx)
62755e35 5598{
f458dd84 5599 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 5600 enum io_wq_cancel cancel_ret;
62755e35
JA
5601 int ret = 0;
5602
f458dd84 5603 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
5604 return -ENOENT;
5605
f458dd84 5606 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
5607 switch (cancel_ret) {
5608 case IO_WQ_CANCEL_OK:
5609 ret = 0;
5610 break;
5611 case IO_WQ_CANCEL_RUNNING:
5612 ret = -EALREADY;
5613 break;
5614 case IO_WQ_CANCEL_NOTFOUND:
5615 ret = -ENOENT;
5616 break;
5617 }
5618
e977d6d3
JA
5619 return ret;
5620}
5621
47f46768
JA
5622static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5623 struct io_kiocb *req, __u64 sqe_addr,
014db007 5624 int success_ret)
47f46768
JA
5625{
5626 unsigned long flags;
5627 int ret;
5628
f458dd84 5629 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
47f46768
JA
5630 if (ret != -ENOENT) {
5631 spin_lock_irqsave(&ctx->completion_lock, flags);
5632 goto done;
5633 }
5634
5635 spin_lock_irqsave(&ctx->completion_lock, flags);
5636 ret = io_timeout_cancel(ctx, sqe_addr);
5637 if (ret != -ENOENT)
5638 goto done;
5639 ret = io_poll_cancel(ctx, sqe_addr);
5640done:
b0dd8a41
JA
5641 if (!ret)
5642 ret = success_ret;
47f46768
JA
5643 io_cqring_fill_event(req, ret);
5644 io_commit_cqring(ctx);
5645 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5646 io_cqring_ev_posted(ctx);
5647
4e88d6e7
JA
5648 if (ret < 0)
5649 req_set_fail_links(req);
014db007 5650 io_put_req(req);
47f46768
JA
5651}
5652
3529d8c2
JA
5653static int io_async_cancel_prep(struct io_kiocb *req,
5654 const struct io_uring_sqe *sqe)
e977d6d3 5655{
fbf23849 5656 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 5657 return -EINVAL;
61710e43
DA
5658 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5659 return -EINVAL;
5660 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
e977d6d3
JA
5661 return -EINVAL;
5662
fbf23849
JA
5663 req->cancel.addr = READ_ONCE(sqe->addr);
5664 return 0;
5665}
5666
61e98203 5667static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
5668{
5669 struct io_ring_ctx *ctx = req->ctx;
fbf23849 5670
014db007 5671 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5262f567
JA
5672 return 0;
5673}
5674
269bbe5f 5675static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
5676 const struct io_uring_sqe *sqe)
5677{
6ca56f84
JA
5678 if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5679 return -EINVAL;
61710e43
DA
5680 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5681 return -EINVAL;
5682 if (sqe->ioprio || sqe->rw_flags)
05f3fb3c
JA
5683 return -EINVAL;
5684
269bbe5f
BM
5685 req->rsrc_update.offset = READ_ONCE(sqe->off);
5686 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5687 if (!req->rsrc_update.nr_args)
05f3fb3c 5688 return -EINVAL;
269bbe5f 5689 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
5690 return 0;
5691}
5692
889fca73 5693static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
5694{
5695 struct io_ring_ctx *ctx = req->ctx;
269bbe5f 5696 struct io_uring_rsrc_update up;
05f3fb3c 5697 int ret;
fbf23849 5698
45d189c6 5699 if (issue_flags & IO_URING_F_NONBLOCK)
05f3fb3c 5700 return -EAGAIN;
05f3fb3c 5701
269bbe5f
BM
5702 up.offset = req->rsrc_update.offset;
5703 up.data = req->rsrc_update.arg;
05f3fb3c
JA
5704
5705 mutex_lock(&ctx->uring_lock);
269bbe5f 5706 ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
05f3fb3c
JA
5707 mutex_unlock(&ctx->uring_lock);
5708
5709 if (ret < 0)
5710 req_set_fail_links(req);
889fca73 5711 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
5712 return 0;
5713}
5714
bfe76559 5715static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 5716{
d625c6ee 5717 switch (req->opcode) {
e781573e 5718 case IORING_OP_NOP:
bfe76559 5719 return 0;
f67676d1
JA
5720 case IORING_OP_READV:
5721 case IORING_OP_READ_FIXED:
3a6820f2 5722 case IORING_OP_READ:
bfe76559 5723 return io_read_prep(req, sqe);
f67676d1
JA
5724 case IORING_OP_WRITEV:
5725 case IORING_OP_WRITE_FIXED:
3a6820f2 5726 case IORING_OP_WRITE:
bfe76559 5727 return io_write_prep(req, sqe);
0969e783 5728 case IORING_OP_POLL_ADD:
bfe76559 5729 return io_poll_add_prep(req, sqe);
0969e783 5730 case IORING_OP_POLL_REMOVE:
bfe76559 5731 return io_poll_remove_prep(req, sqe);
8ed8d3c3 5732 case IORING_OP_FSYNC:
1155c76a 5733 return io_fsync_prep(req, sqe);
8ed8d3c3 5734 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 5735 return io_sfr_prep(req, sqe);
03b1230c 5736 case IORING_OP_SENDMSG:
fddaface 5737 case IORING_OP_SEND:
bfe76559 5738 return io_sendmsg_prep(req, sqe);
03b1230c 5739 case IORING_OP_RECVMSG:
fddaface 5740 case IORING_OP_RECV:
bfe76559 5741 return io_recvmsg_prep(req, sqe);
f499a021 5742 case IORING_OP_CONNECT:
bfe76559 5743 return io_connect_prep(req, sqe);
2d28390a 5744 case IORING_OP_TIMEOUT:
bfe76559 5745 return io_timeout_prep(req, sqe, false);
b29472ee 5746 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 5747 return io_timeout_remove_prep(req, sqe);
fbf23849 5748 case IORING_OP_ASYNC_CANCEL:
bfe76559 5749 return io_async_cancel_prep(req, sqe);
2d28390a 5750 case IORING_OP_LINK_TIMEOUT:
bfe76559 5751 return io_timeout_prep(req, sqe, true);
8ed8d3c3 5752 case IORING_OP_ACCEPT:
bfe76559 5753 return io_accept_prep(req, sqe);
d63d1b5e 5754 case IORING_OP_FALLOCATE:
bfe76559 5755 return io_fallocate_prep(req, sqe);
15b71abe 5756 case IORING_OP_OPENAT:
bfe76559 5757 return io_openat_prep(req, sqe);
b5dba59e 5758 case IORING_OP_CLOSE:
bfe76559 5759 return io_close_prep(req, sqe);
05f3fb3c 5760 case IORING_OP_FILES_UPDATE:
269bbe5f 5761 return io_rsrc_update_prep(req, sqe);
eddc7ef5 5762 case IORING_OP_STATX:
bfe76559 5763 return io_statx_prep(req, sqe);
4840e418 5764 case IORING_OP_FADVISE:
bfe76559 5765 return io_fadvise_prep(req, sqe);
c1ca757b 5766 case IORING_OP_MADVISE:
bfe76559 5767 return io_madvise_prep(req, sqe);
cebdb986 5768 case IORING_OP_OPENAT2:
bfe76559 5769 return io_openat2_prep(req, sqe);
3e4827b0 5770 case IORING_OP_EPOLL_CTL:
bfe76559 5771 return io_epoll_ctl_prep(req, sqe);
7d67af2c 5772 case IORING_OP_SPLICE:
bfe76559 5773 return io_splice_prep(req, sqe);
ddf0322d 5774 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 5775 return io_provide_buffers_prep(req, sqe);
067524e9 5776 case IORING_OP_REMOVE_BUFFERS:
bfe76559 5777 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 5778 case IORING_OP_TEE:
bfe76559 5779 return io_tee_prep(req, sqe);
36f4fa68
JA
5780 case IORING_OP_SHUTDOWN:
5781 return io_shutdown_prep(req, sqe);
80a261fd
JA
5782 case IORING_OP_RENAMEAT:
5783 return io_renameat_prep(req, sqe);
14a1143b
JA
5784 case IORING_OP_UNLINKAT:
5785 return io_unlinkat_prep(req, sqe);
f67676d1
JA
5786 }
5787
bfe76559
PB
5788 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5789 req->opcode);
5790	return -EINVAL;
5791}
5792
93642ef8 5793static int io_req_prep_async(struct io_kiocb *req)
bfe76559 5794{
93642ef8
PB
5795 switch (req->opcode) {
5796 case IORING_OP_READV:
5797 case IORING_OP_READ_FIXED:
5798 case IORING_OP_READ:
5799 return io_rw_prep_async(req, READ);
5800 case IORING_OP_WRITEV:
5801 case IORING_OP_WRITE_FIXED:
5802 case IORING_OP_WRITE:
5803 return io_rw_prep_async(req, WRITE);
5804 case IORING_OP_SENDMSG:
5805 case IORING_OP_SEND:
5806 return io_sendmsg_prep_async(req);
5807 case IORING_OP_RECVMSG:
5808 case IORING_OP_RECV:
5809 return io_recvmsg_prep_async(req);
5810 case IORING_OP_CONNECT:
5811 return io_connect_prep_async(req);
5812 }
5813 return 0;
5814}
5815
be7053b7 5816static int io_req_defer_prep(struct io_kiocb *req)
bfe76559 5817{
be7053b7 5818 if (!io_op_defs[req->opcode].needs_async_data)
bfe76559 5819 return 0;
be7053b7 5820	/* some opcodes init it during the initial prep */
93642ef8 5821 if (req->async_data)
be7053b7
PB
5822 return 0;
5823 if (__io_alloc_async_data(req))
bfe76559 5824 return -EAGAIN;
be7053b7 5825 return io_req_prep_async(req);
f67676d1
JA
5826}
5827
9cf7c104
PB
5828static u32 io_get_sequence(struct io_kiocb *req)
5829{
5830 struct io_kiocb *pos;
5831 struct io_ring_ctx *ctx = req->ctx;
f2f87370 5832 u32 total_submitted, nr_reqs = 0;
9cf7c104 5833
f2f87370
PB
5834 io_for_each_link(pos, req)
5835 nr_reqs++;
9cf7c104
PB
5836
5837 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5838 return total_submitted - nr_reqs;
5839}
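/*
 * Example of the arithmetic above: with cached_sq_head == 10,
 * cached_sq_dropped == 1 and a three-request link starting at @req,
 * io_get_sequence() returns (10 - 1) - 3 == 6.
 */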
5840
be7053b7 5841static int io_req_defer(struct io_kiocb *req)
de0617e4 5842{
a197f664 5843 struct io_ring_ctx *ctx = req->ctx;
27dc8338 5844 struct io_defer_entry *de;
f67676d1 5845 int ret;
9cf7c104 5846 u32 seq;
de0617e4 5847
9d858b21 5848	/* Still need defer if there is a pending req in the defer list. */
9cf7c104
PB
5849 if (likely(list_empty_careful(&ctx->defer_list) &&
5850 !(req->flags & REQ_F_IO_DRAIN)))
5851 return 0;
5852
5853 seq = io_get_sequence(req);
5854 /* Still a chance to pass the sequence check */
5855 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
de0617e4
JA
5856 return 0;
5857
be7053b7
PB
5858 ret = io_req_defer_prep(req);
5859 if (ret)
5860 return ret;
cbdcb435 5861 io_prep_async_link(req);
27dc8338
PB
5862 de = kmalloc(sizeof(*de), GFP_KERNEL);
5863 if (!de)
5864 return -ENOMEM;
2d28390a 5865
de0617e4 5866 spin_lock_irq(&ctx->completion_lock);
9cf7c104 5867 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
de0617e4 5868 spin_unlock_irq(&ctx->completion_lock);
27dc8338 5869 kfree(de);
ae34817b
PB
5870 io_queue_async_work(req);
5871 return -EIOCBQUEUED;
de0617e4
JA
5872 }
5873
915967f6 5874 trace_io_uring_defer(ctx, req, req->user_data);
27dc8338 5875 de->req = req;
9cf7c104 5876 de->seq = seq;
27dc8338 5877 list_add_tail(&de->list, &ctx->defer_list);
de0617e4
JA
5878 spin_unlock_irq(&ctx->completion_lock);
5879 return -EIOCBQUEUED;
5880}
5881
3ca405eb 5882static void __io_clean_op(struct io_kiocb *req)
99bc4c38 5883{
0e1b6fe3
PB
5884 if (req->flags & REQ_F_BUFFER_SELECTED) {
5885 switch (req->opcode) {
5886 case IORING_OP_READV:
5887 case IORING_OP_READ_FIXED:
5888 case IORING_OP_READ:
bcda7baa 5889 kfree((void *)(unsigned long)req->rw.addr);
0e1b6fe3
PB
5890 break;
5891 case IORING_OP_RECVMSG:
5892 case IORING_OP_RECV:
bcda7baa 5893 kfree(req->sr_msg.kbuf);
0e1b6fe3
PB
5894 break;
5895 }
5896 req->flags &= ~REQ_F_BUFFER_SELECTED;
99bc4c38
PB
5897 }
5898
0e1b6fe3
PB
5899 if (req->flags & REQ_F_NEED_CLEANUP) {
5900 switch (req->opcode) {
5901 case IORING_OP_READV:
5902 case IORING_OP_READ_FIXED:
5903 case IORING_OP_READ:
5904 case IORING_OP_WRITEV:
5905 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
5906 case IORING_OP_WRITE: {
5907 struct io_async_rw *io = req->async_data;
5908 if (io->free_iovec)
5909 kfree(io->free_iovec);
0e1b6fe3 5910 break;
e8c2bc1f 5911 }
0e1b6fe3 5912 case IORING_OP_RECVMSG:
e8c2bc1f
JA
5913 case IORING_OP_SENDMSG: {
5914 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
5915
5916 kfree(io->free_iov);
0e1b6fe3 5917 break;
e8c2bc1f 5918 }
0e1b6fe3
PB
5919 case IORING_OP_SPLICE:
5920 case IORING_OP_TEE:
5921 io_put_file(req, req->splice.file_in,
5922 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5923 break;
f3cd4850
JA
5924 case IORING_OP_OPENAT:
5925 case IORING_OP_OPENAT2:
5926 if (req->open.filename)
5927 putname(req->open.filename);
5928 break;
80a261fd
JA
5929 case IORING_OP_RENAMEAT:
5930 putname(req->rename.oldpath);
5931 putname(req->rename.newpath);
5932 break;
14a1143b
JA
5933 case IORING_OP_UNLINKAT:
5934 putname(req->unlink.filename);
5935 break;
0e1b6fe3
PB
5936 }
5937 req->flags &= ~REQ_F_NEED_CLEANUP;
99bc4c38 5938 }
99bc4c38
PB
5939}
5940
889fca73 5941static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 5942{
a197f664 5943 struct io_ring_ctx *ctx = req->ctx;
5730b27e 5944 const struct cred *creds = NULL;
d625c6ee 5945 int ret;
2b188cc1 5946
003e8dcc
JA
5947 if (req->work.creds && req->work.creds != current_cred())
5948 creds = override_creds(req->work.creds);
5730b27e 5949
d625c6ee 5950 switch (req->opcode) {
2b188cc1 5951 case IORING_OP_NOP:
889fca73 5952 ret = io_nop(req, issue_flags);
2b188cc1
JA
5953 break;
5954 case IORING_OP_READV:
edafccee 5955 case IORING_OP_READ_FIXED:
3a6820f2 5956 case IORING_OP_READ:
889fca73 5957 ret = io_read(req, issue_flags);
edafccee 5958 break;
3529d8c2 5959 case IORING_OP_WRITEV:
edafccee 5960 case IORING_OP_WRITE_FIXED:
3a6820f2 5961 case IORING_OP_WRITE:
889fca73 5962 ret = io_write(req, issue_flags);
2b188cc1 5963 break;
c992fe29 5964 case IORING_OP_FSYNC:
45d189c6 5965 ret = io_fsync(req, issue_flags);
c992fe29 5966 break;
221c5eb2 5967 case IORING_OP_POLL_ADD:
61e98203 5968 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
5969 break;
5970 case IORING_OP_POLL_REMOVE:
61e98203 5971 ret = io_poll_remove(req, issue_flags);
221c5eb2 5972 break;
5d17b4a4 5973 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 5974 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 5975 break;
0fa03c62 5976 case IORING_OP_SENDMSG:
889fca73 5977 ret = io_sendmsg(req, issue_flags);
062d04d7 5978 break;
fddaface 5979 case IORING_OP_SEND:
889fca73 5980 ret = io_send(req, issue_flags);
0fa03c62 5981 break;
aa1fa28f 5982 case IORING_OP_RECVMSG:
889fca73 5983 ret = io_recvmsg(req, issue_flags);
062d04d7 5984 break;
fddaface 5985 case IORING_OP_RECV:
889fca73 5986 ret = io_recv(req, issue_flags);
aa1fa28f 5987 break;
5262f567 5988 case IORING_OP_TIMEOUT:
61e98203 5989 ret = io_timeout(req, issue_flags);
5262f567 5990 break;
11365043 5991 case IORING_OP_TIMEOUT_REMOVE:
61e98203 5992 ret = io_timeout_remove(req, issue_flags);
11365043 5993 break;
17f2fe35 5994 case IORING_OP_ACCEPT:
889fca73 5995 ret = io_accept(req, issue_flags);
17f2fe35 5996 break;
f8e85cf2 5997 case IORING_OP_CONNECT:
889fca73 5998 ret = io_connect(req, issue_flags);
f8e85cf2 5999 break;
62755e35 6000 case IORING_OP_ASYNC_CANCEL:
61e98203 6001 ret = io_async_cancel(req, issue_flags);
62755e35 6002 break;
d63d1b5e 6003 case IORING_OP_FALLOCATE:
45d189c6 6004 ret = io_fallocate(req, issue_flags);
d63d1b5e 6005 break;
15b71abe 6006 case IORING_OP_OPENAT:
45d189c6 6007 ret = io_openat(req, issue_flags);
15b71abe 6008 break;
b5dba59e 6009 case IORING_OP_CLOSE:
889fca73 6010 ret = io_close(req, issue_flags);
b5dba59e 6011 break;
05f3fb3c 6012 case IORING_OP_FILES_UPDATE:
889fca73 6013 ret = io_files_update(req, issue_flags);
05f3fb3c 6014 break;
eddc7ef5 6015 case IORING_OP_STATX:
45d189c6 6016 ret = io_statx(req, issue_flags);
eddc7ef5 6017 break;
4840e418 6018 case IORING_OP_FADVISE:
45d189c6 6019 ret = io_fadvise(req, issue_flags);
4840e418 6020 break;
c1ca757b 6021 case IORING_OP_MADVISE:
45d189c6 6022 ret = io_madvise(req, issue_flags);
c1ca757b 6023 break;
cebdb986 6024 case IORING_OP_OPENAT2:
45d189c6 6025 ret = io_openat2(req, issue_flags);
cebdb986 6026 break;
3e4827b0 6027 case IORING_OP_EPOLL_CTL:
889fca73 6028 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 6029 break;
7d67af2c 6030 case IORING_OP_SPLICE:
45d189c6 6031 ret = io_splice(req, issue_flags);
7d67af2c 6032 break;
ddf0322d 6033 case IORING_OP_PROVIDE_BUFFERS:
889fca73 6034 ret = io_provide_buffers(req, issue_flags);
ddf0322d 6035 break;
067524e9 6036 case IORING_OP_REMOVE_BUFFERS:
889fca73 6037 ret = io_remove_buffers(req, issue_flags);
3e4827b0 6038 break;
f2a8d5c7 6039 case IORING_OP_TEE:
45d189c6 6040 ret = io_tee(req, issue_flags);
f2a8d5c7 6041 break;
36f4fa68 6042 case IORING_OP_SHUTDOWN:
45d189c6 6043 ret = io_shutdown(req, issue_flags);
36f4fa68 6044 break;
80a261fd 6045 case IORING_OP_RENAMEAT:
45d189c6 6046 ret = io_renameat(req, issue_flags);
80a261fd 6047 break;
14a1143b 6048 case IORING_OP_UNLINKAT:
45d189c6 6049 ret = io_unlinkat(req, issue_flags);
14a1143b 6050 break;
2b188cc1
JA
6051 default:
6052 ret = -EINVAL;
6053 break;
6054 }
6055
5730b27e
JA
6056 if (creds)
6057 revert_creds(creds);
6058
def596e9
JA
6059 if (ret)
6060 return ret;
6061
b532576e
JA
6062 /* If the op doesn't have a file, we're not polling for it */
6063 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
11ba820b
JA
6064 const bool in_async = io_wq_current_is_worker();
6065
11ba820b
JA
6066 /* workqueue context doesn't hold uring_lock, grab it now */
6067 if (in_async)
6068 mutex_lock(&ctx->uring_lock);
6069
2e9dbe90 6070 io_iopoll_req_issued(req, in_async);
11ba820b
JA
6071
6072 if (in_async)
6073 mutex_unlock(&ctx->uring_lock);
def596e9
JA
6074 }
6075
6076 return 0;
2b188cc1
JA
6077}
6078
5280f7e5 6079static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
6080{
6081 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6df1db6b 6082 struct io_kiocb *timeout;
561fb04a 6083 int ret = 0;
2b188cc1 6084
6df1db6b
PB
6085 timeout = io_prep_linked_timeout(req);
6086 if (timeout)
6087 io_queue_linked_timeout(timeout);
d4c81f38 6088
4014d943 6089 if (work->flags & IO_WQ_WORK_CANCEL)
561fb04a 6090 ret = -ECANCELED;
31b51510 6091
561fb04a 6092 if (!ret) {
561fb04a 6093 do {
889fca73 6094 ret = io_issue_sqe(req, 0);
561fb04a
JA
6095 /*
6096 * We can get EAGAIN for polled IO even though we're
6097 * forcing a sync submission from here, since we can't
6098 * wait for request slots on the block side.
6099 */
6100 if (ret != -EAGAIN)
6101 break;
6102 cond_resched();
6103 } while (1);
6104 }
31b51510 6105
a3df7698 6106 /* avoid locking problems by failing it from a clean context */
561fb04a 6107 if (ret) {
a3df7698
PB
6108 /* io-wq is going to take one down */
6109 refcount_inc(&req->refs);
6110 io_req_task_queue_fail(req, ret);
edafccee 6111 }
2b188cc1
JA
6112}
6113
65e19f54
JA
6114static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6115 int index)
6116{
269bbe5f 6117 struct fixed_rsrc_table *table;
65e19f54 6118
05f3fb3c 6119 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
84695089 6120 return table->files[index & IORING_FILE_TABLE_MASK];
65e19f54
JA
6121}
6122
8371adf5
PB
6123static struct file *io_file_get(struct io_submit_state *state,
6124 struct io_kiocb *req, int fd, bool fixed)
09bb8394 6125{
a197f664 6126 struct io_ring_ctx *ctx = req->ctx;
8da11c19 6127 struct file *file;
09bb8394 6128
8da11c19 6129 if (fixed) {
479f517b 6130 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
8371adf5 6131 return NULL;
b7620121 6132 fd = array_index_nospec(fd, ctx->nr_user_files);
8da11c19 6133 file = io_file_from_index(ctx, fd);
36f72fe2 6134 io_set_resource_node(req);
09bb8394 6135 } else {
c826bd7a 6136 trace_io_uring_file_get(ctx, fd);
8da11c19 6137 file = __io_file_get(state, fd);
09bb8394
JA
6138 }
6139
ce3d5aae
PB
6140 if (file && unlikely(file->f_op == &io_uring_fops))
6141 io_req_track_inflight(req);
8371adf5 6142 return file;
09bb8394
JA
6143}
6144
2665abfd 6145static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 6146{
ad8a48ac
JA
6147 struct io_timeout_data *data = container_of(timer,
6148 struct io_timeout_data, timer);
90cd7e42 6149 struct io_kiocb *prev, *req = data->req;
2665abfd 6150 struct io_ring_ctx *ctx = req->ctx;
2665abfd 6151 unsigned long flags;
2665abfd
JA
6152
6153 spin_lock_irqsave(&ctx->completion_lock, flags);
90cd7e42
PB
6154 prev = req->timeout.head;
6155 req->timeout.head = NULL;
2665abfd
JA
6156
6157 /*
6158	 * We don't expect the list to be empty; that will only happen if we
6159 * race with the completion of the linked work.
6160 */
90cd7e42 6161 if (prev && refcount_inc_not_zero(&prev->refs))
f2f87370 6162 io_remove_next_linked(prev);
90cd7e42
PB
6163 else
6164 prev = NULL;
2665abfd
JA
6165 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6166
6167 if (prev) {
4e88d6e7 6168 req_set_fail_links(prev);
014db007 6169 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
9ae1f8dd 6170 io_put_req_deferred(prev, 1);
47f46768 6171 } else {
9ae1f8dd
PB
6172 io_req_complete_post(req, -ETIME, 0);
6173 io_put_req_deferred(req, 1);
2665abfd 6174 }
2665abfd
JA
6175 return HRTIMER_NORESTART;
6176}
6177
7271ef3a 6178static void __io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 6179{
76a46e06 6180 /*
f2f87370
PB
6181 * If the back reference is NULL, then our linked request finished
6182	 * before we got a chance to set up the timer
76a46e06 6183 */
90cd7e42 6184 if (req->timeout.head) {
e8c2bc1f 6185 struct io_timeout_data *data = req->async_data;
94ae5e77 6186
ad8a48ac
JA
6187 data->timer.function = io_link_timeout_fn;
6188 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6189 data->mode);
2665abfd 6190 }
7271ef3a
JA
6191}
6192
6193static void io_queue_linked_timeout(struct io_kiocb *req)
6194{
6195 struct io_ring_ctx *ctx = req->ctx;
6196
6197 spin_lock_irq(&ctx->completion_lock);
6198 __io_queue_linked_timeout(req);
76a46e06 6199 spin_unlock_irq(&ctx->completion_lock);
2665abfd 6200
2665abfd 6201 /* drop submission reference */
76a46e06
JA
6202 io_put_req(req);
6203}
2665abfd 6204
ad8a48ac 6205static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd 6206{
f2f87370 6207 struct io_kiocb *nxt = req->link;
2665abfd 6208
f2f87370
PB
6209 if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6210 nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 6211 return NULL;
2665abfd 6212
90cd7e42 6213 nxt->timeout.head = req;
900fad45 6214 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
76a46e06 6215 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 6216 return nxt;
2665abfd
JA
6217}
6218
c5eef2b9 6219static void __io_queue_sqe(struct io_kiocb *req)
2b188cc1 6220{
d3d7298d 6221 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
e0c5c576 6222 int ret;
2b188cc1 6223
c5eef2b9 6224 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 6225
491381ce
JA
6226 /*
6227 * We async punt it if the file wasn't marked NOWAIT, or if the file
6228 * doesn't support non-blocking read/write attempts
6229 */
24c74678 6230 if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
f063c547 6231 if (!io_arm_poll_handler(req)) {
f063c547
PB
6232 /*
6233 * Queued up for async execution, worker will release
6234 * submit reference when the iocb is actually submitted.
6235 */
6236 io_queue_async_work(req);
2b188cc1 6237 }
0d63c148
PB
6238 } else if (likely(!ret)) {
6239 /* drop submission reference */
e342c807 6240 if (req->flags & REQ_F_COMPLETE_INLINE) {
c5eef2b9
PB
6241 struct io_ring_ctx *ctx = req->ctx;
6242 struct io_comp_state *cs = &ctx->submit_state.comp;
e65ef56d 6243
6dd0be1e 6244 cs->reqs[cs->nr++] = req;
d3d7298d 6245 if (cs->nr == ARRAY_SIZE(cs->reqs))
c5eef2b9 6246 io_submit_flush_completions(cs, ctx);
9affd664 6247 } else {
d3d7298d 6248 io_put_req(req);
0d63c148
PB
6249 }
6250 } else {
4e88d6e7 6251 req_set_fail_links(req);
e65ef56d 6252 io_put_req(req);
e1e16097 6253 io_req_complete(req, ret);
9e645e11 6254 }
d3d7298d
PB
6255 if (linked_timeout)
6256 io_queue_linked_timeout(linked_timeout);
2b188cc1
JA
6257}
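/*
 * Example of the fallback ordering above: a buffered read that gets
 * -EAGAIN on a request without REQ_F_NOWAIT first tries to arm a poll
 * handler; if the file can't be polled (io_arm_poll_handler() returns
 * false), the request is punted to io-wq via io_queue_async_work().
 */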
6258
be7053b7 6259static void io_queue_sqe(struct io_kiocb *req)
4fe2c963
JL
6260{
6261 int ret;
6262
be7053b7 6263 ret = io_req_defer(req);
4fe2c963
JL
6264 if (ret) {
6265 if (ret != -EIOCBQUEUED) {
1118591a 6266fail_req:
4e88d6e7 6267 req_set_fail_links(req);
e1e16097
JA
6268 io_put_req(req);
6269 io_req_complete(req, ret);
4fe2c963 6270 }
2550878f 6271 } else if (req->flags & REQ_F_FORCE_ASYNC) {
be7053b7
PB
6272 ret = io_req_defer_prep(req);
6273 if (unlikely(ret))
6274 goto fail_req;
ce35a47a
JA
6275 io_queue_async_work(req);
6276 } else {
c5eef2b9 6277 __io_queue_sqe(req);
ce35a47a 6278 }
4fe2c963
JL
6279}
6280
b16fed66
PB
6281/*
6282 * Check SQE restrictions (opcode and flags).
6283 *
6284 * Returns 'true' if SQE is allowed, 'false' otherwise.
6285 */
6286static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6287 struct io_kiocb *req,
6288 unsigned int sqe_flags)
4fe2c963 6289{
b16fed66
PB
6290 if (!ctx->restricted)
6291 return true;
6292
6293 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6294 return false;
6295
6296 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6297 ctx->restrictions.sqe_flags_required)
6298 return false;
6299
6300 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6301 ctx->restrictions.sqe_flags_required))
6302 return false;
6303
6304 return true;
4fe2c963
JL
6305}
6306
b16fed66
PB
6307static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6308 const struct io_uring_sqe *sqe)
6309{
6310 struct io_submit_state *state;
6311 unsigned int sqe_flags;
003e8dcc 6312 int personality, ret = 0;
b16fed66
PB
6313
6314 req->opcode = READ_ONCE(sqe->opcode);
6315	/* same numerical values as the corresponding REQ_F_*, safe to copy */
6316 req->flags = sqe_flags = READ_ONCE(sqe->flags);
6317 req->user_data = READ_ONCE(sqe->user_data);
6318 req->async_data = NULL;
6319 req->file = NULL;
6320 req->ctx = ctx;
6321 req->link = NULL;
6322 req->fixed_rsrc_refs = NULL;
6323 /* one is dropped after submission, the other at completion */
6324 refcount_set(&req->refs, 2);
6325 req->task = current;
6326 req->result = 0;
93e68e03
JA
6327 req->work.list.next = NULL;
6328 req->work.creds = NULL;
6329 req->work.flags = 0;
b16fed66
PB
6330
6331 /* enforce forwards compatibility on users */
ebf4a5db
PB
6332 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
6333 req->flags = 0;
b16fed66 6334 return -EINVAL;
ebf4a5db 6335 }
b16fed66
PB
6336
6337 if (unlikely(req->opcode >= IORING_OP_LAST))
6338 return -EINVAL;
6339
b16fed66
PB
6340 if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6341 return -EACCES;
6342
6343 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6344 !io_op_defs[req->opcode].buffer_select)
6345 return -EOPNOTSUPP;
863e0560 6346
003e8dcc
JA
6347 personality = READ_ONCE(sqe->personality);
6348 if (personality) {
61cf9370 6349 req->work.creds = xa_load(&ctx->personalities, personality);
003e8dcc
JA
6350 if (!req->work.creds)
6351 return -EINVAL;
6352 get_cred(req->work.creds);
003e8dcc 6353 }
b16fed66
PB
6354 state = &ctx->submit_state;
6355
6356 /*
6357 * Plug now if we have more than 1 IO left after this, and the target
6358 * is potentially a read/write to block based storage.
6359 */
6360 if (!state->plug_started && state->ios_left > 1 &&
6361 io_op_defs[req->opcode].plug) {
6362 blk_start_plug(&state->plug);
6363 state->plug_started = true;
6364 }
6365
6366 if (io_op_defs[req->opcode].needs_file) {
6367 bool fixed = req->flags & REQ_F_FIXED_FILE;
6368
6369 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6370 if (unlikely(!req->file))
6371 ret = -EBADF;
6372 }
6373
6374 state->ios_left--;
6375 return ret;
6376}
6377
a6b8cadc 6378static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 6379 const struct io_uring_sqe *sqe)
9e645e11 6380{
a1ab7b35 6381 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 6382 int ret;
9e645e11 6383
a6b8cadc
PB
6384 ret = io_init_req(ctx, req, sqe);
6385 if (unlikely(ret)) {
6386fail_req:
6387 io_put_req(req);
6388 io_req_complete(req, ret);
de59bc10
PB
6389 if (link->head) {
6390 /* fail even hard links since we don't submit */
cf109604 6391 link->head->flags |= REQ_F_FAIL_LINK;
de59bc10
PB
6392 io_put_req(link->head);
6393 io_req_complete(link->head, -ECANCELED);
6394 link->head = NULL;
6395 }
a6b8cadc
PB
6396 return ret;
6397 }
be7053b7
PB
6398 ret = io_req_prep(req, sqe);
6399 if (unlikely(ret))
6400 goto fail_req;
a6b8cadc 6401
be7053b7 6402 /* don't need @sqe from now on */
a6b8cadc
PB
6403 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6404 true, ctx->flags & IORING_SETUP_SQPOLL);
6405
9e645e11
JA
6406 /*
6407 * If we already have a head request, queue this one for async
6408 * submittal once the head completes. If we don't have a head but
6409 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6410 * submitted sync once the chain is complete. If none of those
6411 * conditions are true (normal request), then just queue it.
6412 */
863e0560
PB
6413 if (link->head) {
6414 struct io_kiocb *head = link->head;
4e88d6e7 6415
8cdf2193
PB
6416 /*
6417 * Taking sequential execution of a link, draining both sides
6418	 * of the link also fulfills IOSQE_IO_DRAIN semantics for all
6419 * requests in the link. So, it drains the head and the
6420 * next after the link request. The last one is done via
6421 * drain_next flag to persist the effect across calls.
6422 */
ef4ff581 6423 if (req->flags & REQ_F_IO_DRAIN) {
711be031
PB
6424 head->flags |= REQ_F_IO_DRAIN;
6425 ctx->drain_next = 1;
6426 }
be7053b7 6427 ret = io_req_defer_prep(req);
cf109604 6428 if (unlikely(ret))
a6b8cadc 6429 goto fail_req;
9d76377f 6430 trace_io_uring_link(ctx, req, head);
f2f87370 6431 link->last->link = req;
863e0560 6432 link->last = req;
32fe525b
PB
6433
6434 /* last request of a link, enqueue the link */
ef4ff581 6435 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
de59bc10 6436 io_queue_sqe(head);
863e0560 6437 link->head = NULL;
32fe525b 6438 }
9e645e11 6439 } else {
711be031
PB
6440 if (unlikely(ctx->drain_next)) {
6441 req->flags |= REQ_F_IO_DRAIN;
ef4ff581 6442 ctx->drain_next = 0;
711be031 6443 }
ef4ff581 6444 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
863e0560
PB
6445 link->head = req;
6446 link->last = req;
711be031 6447 } else {
be7053b7 6448 io_queue_sqe(req);
711be031 6449 }
9e645e11 6450 }
2e6e1fde 6451
1d4240cc 6452 return 0;
9e645e11
JA
6453}
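/*
 * Example of the link handling above: given three SQEs where the first
 * two set IOSQE_IO_LINK and the third does not, the first becomes
 * link->head, the next two are appended through link->last, and the
 * whole chain is queued with io_queue_sqe(head) once the flag-less
 * third request is seen.
 */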
6454
9a56a232
JA
6455/*
6456 * Batched submission is done, ensure local IO is flushed out.
6457 */
ba88ff11
PB
6458static void io_submit_state_end(struct io_submit_state *state,
6459 struct io_ring_ctx *ctx)
9a56a232 6460{
a1ab7b35 6461 if (state->link.head)
de59bc10 6462 io_queue_sqe(state->link.head);
6dd0be1e 6463 if (state->comp.nr)
ba88ff11 6464 io_submit_flush_completions(&state->comp, ctx);
27926b68
JA
6465 if (state->plug_started)
6466 blk_finish_plug(&state->plug);
9f13c35b 6467 io_state_file_put(state);
9a56a232
JA
6468}
6469
6470/*
6471 * Start submission side cache.
6472 */
6473static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 6474 unsigned int max_ios)
9a56a232 6475{
27926b68 6476 state->plug_started = false;
9a56a232 6477 state->ios_left = max_ios;
a1ab7b35
PB
6478 /* set only head, no need to init link_last in advance */
6479 state->link.head = NULL;
9a56a232
JA
6480}
6481
2b188cc1
JA
6482static void io_commit_sqring(struct io_ring_ctx *ctx)
6483{
75b28aff 6484 struct io_rings *rings = ctx->rings;
2b188cc1 6485
caf582c6
PB
6486 /*
6487 * Ensure any loads from the SQEs are done at this point,
6488 * since once we write the new head, the application could
6489 * write new data to them.
6490 */
6491 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
6492}
6493
2b188cc1 6494/*
3529d8c2 6495 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
6496 * that is mapped by userspace. This means that care needs to be taken to
6497 * ensure that reads are stable, as we cannot rely on userspace always
6498 * being a good citizen. If members of the sqe are validated and then later
6499 * used, it's important that those reads are done through READ_ONCE() to
6500 * prevent a re-load down the line.
6501 */
709b302f 6502static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 6503{
75b28aff 6504 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
6505 unsigned head;
6506
6507 /*
6508 * The cached sq head (or cq tail) serves two purposes:
6509 *
6510 * 1) allows us to batch the cost of updating the user visible
6511 * head updates.
6512 * 2) allows the kernel side to track the head on its own, even
6513 * though the application is the one updating it.
6514 */
4fccfcbb 6515 head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
709b302f
PB
6516 if (likely(head < ctx->sq_entries))
6517 return &ctx->sq_sqes[head];
2b188cc1
JA
6518
6519 /* drop invalid entries */
498ccd9e 6520 ctx->cached_sq_dropped++;
ee7d46d9 6521 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
709b302f
PB
6522 return NULL;
6523}
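/*
 * Example of the indexing above: ring sizes are powers of two, so with
 * sq_entries == 8 the mask is 7 and cached_sq_head == 9 reads
 * sq_array[1]. The value found there is the index into sq_sqes[]; any
 * index >= sq_entries is dropped and accounted in sq_dropped.
 */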
6524
0f212204 6525static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6c271ce2 6526{
46c4e16a 6527 int submitted = 0;
6c271ce2 6528
c4a2ed72 6529 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8 6530 if (test_bit(0, &ctx->sq_check_overflow)) {
6c503150 6531 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
ad3eb2c8
JA
6532 return -EBUSY;
6533 }
6c271ce2 6534
ee7d46d9
PB
6535 /* make sure SQ entry isn't read before tail */
6536 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 6537
2b85edfc
PB
6538 if (!percpu_ref_tryget_many(&ctx->refs, nr))
6539 return -EAGAIN;
6c271ce2 6540
d8a6df10 6541 percpu_counter_add(&current->io_uring->inflight, nr);
faf7b51c 6542 refcount_add(nr, &current->usage);
ba88ff11 6543 io_submit_state_start(&ctx->submit_state, nr);
b14cca0c 6544
46c4e16a 6545 while (submitted < nr) {
3529d8c2 6546 const struct io_uring_sqe *sqe;
196be95c 6547 struct io_kiocb *req;
fb5ccc98 6548
258b29a9 6549 req = io_alloc_req(ctx);
196be95c
PB
6550 if (unlikely(!req)) {
6551 if (!submitted)
6552 submitted = -EAGAIN;
fb5ccc98 6553 break;
196be95c 6554 }
4fccfcbb
PB
6555 sqe = io_get_sqe(ctx);
6556 if (unlikely(!sqe)) {
6557 kmem_cache_free(req_cachep, req);
6558 break;
6559 }
d3656344
JA
6560 /* will complete beyond this point, count as submitted */
6561 submitted++;
a1ab7b35 6562 if (io_submit_sqe(ctx, req, sqe))
196be95c 6563 break;
6c271ce2
JA
6564 }
6565
9466f437
PB
6566 if (unlikely(submitted != nr)) {
6567 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
d8a6df10
JA
6568 struct io_uring_task *tctx = current->io_uring;
6569 int unused = nr - ref_used;
9466f437 6570
d8a6df10
JA
6571 percpu_ref_put_many(&ctx->refs, unused);
6572 percpu_counter_sub(&tctx->inflight, unused);
6573 put_task_struct_many(current, unused);
9466f437 6574 }
6c271ce2 6575
a1ab7b35 6576 io_submit_state_end(&ctx->submit_state, ctx);
ae9428ca
PB
6577 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6578 io_commit_sqring(ctx);
6579
6c271ce2
JA
6580 return submitted;
6581}
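/*
 * Example of the reference accounting above: for nr == 8, eight ctx
 * refs, inflight counts and task refs are taken up front; if only five
 * requests end up being initialised, the three unused references are
 * released again in the (submitted != nr) branch before returning.
 */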
6582
23b3628e
XW
6583static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6584{
6585 /* Tell userspace we may need a wakeup call */
6586 spin_lock_irq(&ctx->completion_lock);
6587 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6588 spin_unlock_irq(&ctx->completion_lock);
6589}
6590
6591static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6592{
6593 spin_lock_irq(&ctx->completion_lock);
6594 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6595 spin_unlock_irq(&ctx->completion_lock);
6596}
6597
08369246 6598static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 6599{
c8d1ba58 6600 unsigned int to_submit;
bdcd3eab 6601 int ret = 0;
6c271ce2 6602
c8d1ba58 6603 to_submit = io_sqring_entries(ctx);
e95eee2d
JA
6604 /* if we're handling multiple rings, cap submit size for fairness */
6605 if (cap_entries && to_submit > 8)
6606 to_submit = 8;
6607
906a3c6f 6608 if (!list_empty(&ctx->iopoll_list) || to_submit) {
c8d1ba58 6609 unsigned nr_events = 0;
a4c0b3de 6610
c8d1ba58 6611 mutex_lock(&ctx->uring_lock);
906a3c6f 6612 if (!list_empty(&ctx->iopoll_list))
c8d1ba58 6613 io_do_iopoll(ctx, &nr_events, 0);
906a3c6f 6614
0298ef96
PB
6615 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
6616 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 6617 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58
JA
6618 mutex_unlock(&ctx->uring_lock);
6619 }
6c271ce2 6620
90554200
JA
6621 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6622 wake_up(&ctx->sqo_sq_wait);
6c271ce2 6623
08369246
XW
6624 return ret;
6625}
6c271ce2 6626
08369246
XW
6627static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6628{
6629 struct io_ring_ctx *ctx;
6630 unsigned sq_thread_idle = 0;
6c271ce2 6631
08369246
XW
6632 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6633 if (sq_thread_idle < ctx->sq_thread_idle)
6634 sq_thread_idle = ctx->sq_thread_idle;
c8d1ba58 6635 }
c1edbf5f 6636
08369246 6637 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 6638}
6c271ce2 6639
c8d1ba58
JA
6640static int io_sq_thread(void *data)
6641{
69fb2131
JA
6642 struct io_sq_data *sqd = data;
6643 struct io_ring_ctx *ctx;
a0d9205f 6644 unsigned long timeout = 0;
37d1e2e3 6645 char buf[TASK_COMM_LEN];
08369246 6646 DEFINE_WAIT(wait);
6c271ce2 6647
37d1e2e3
JA
6648 sprintf(buf, "iou-sqp-%d", sqd->task_pid);
6649 set_task_comm(current, buf);
37d1e2e3
JA
6650 current->pf_io_worker = NULL;
6651
6652 if (sqd->sq_cpu != -1)
6653 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6654 else
6655 set_cpus_allowed_ptr(current, cpu_online_mask);
6656 current->flags |= PF_NO_SETAFFINITY;
6657
05962f95
JA
6658 down_read(&sqd->rw_lock);
6659
6660 while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
08369246
XW
6661 int ret;
6662 bool cap_entries, sqt_spin, needs_sched;
c1edbf5f 6663
05962f95
JA
6664 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
6665 up_read(&sqd->rw_lock);
6666 cond_resched();
6667 down_read(&sqd->rw_lock);
521d6a73 6668 io_run_task_work();
08369246 6669 timeout = jiffies + sqd->sq_thread_idle;
7d41e854 6670 continue;
08369246 6671 }
37d1e2e3
JA
6672 if (fatal_signal_pending(current))
6673 break;
08369246 6674 sqt_spin = false;
e95eee2d 6675 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 6676 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7c30f36a
SM
6677 const struct cred *creds = NULL;
6678
6679 if (ctx->sq_creds != current_cred())
6680 creds = override_creds(ctx->sq_creds);
08369246 6681 ret = __io_sq_thread(ctx, cap_entries);
7c30f36a
SM
6682 if (creds)
6683 revert_creds(creds);
08369246
XW
6684 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6685 sqt_spin = true;
69fb2131 6686 }
6c271ce2 6687
08369246 6688 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58
JA
6689 io_run_task_work();
6690 cond_resched();
08369246
XW
6691 if (sqt_spin)
6692 timeout = jiffies + sqd->sq_thread_idle;
6693 continue;
6694 }
6695
08369246
XW
6696 needs_sched = true;
6697 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6698 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6699 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6700 !list_empty_careful(&ctx->iopoll_list)) {
6701 needs_sched = false;
6702 break;
6703 }
6704 if (io_sqring_entries(ctx)) {
6705 needs_sched = false;
6706 break;
6707 }
6708 }
6709
05962f95 6710 if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
69fb2131
JA
6711 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6712 io_ring_set_wakeup_flag(ctx);
08369246 6713
05962f95 6714 up_read(&sqd->rw_lock);
69fb2131 6715 schedule();
05962f95 6716 down_read(&sqd->rw_lock);
69fb2131
JA
6717 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6718 io_ring_clear_wakeup_flag(ctx);
6c271ce2 6719 }
08369246
XW
6720
6721 finish_wait(&sqd->wait, &wait);
6722 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 6723 }
05962f95 6724 up_read(&sqd->rw_lock);
521d6a73
PB
6725 down_write(&sqd->rw_lock);
6726 /*
6727	 * someone may have parked and added a cancellation task_work; run
6728 * it first because we don't want it in io_uring_cancel_sqpoll()
6729 */
37d1e2e3 6730 io_run_task_work();
28cea78a 6731
521d6a73
PB
6732 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6733 io_uring_cancel_sqpoll(ctx);
37d1e2e3 6734 sqd->thread = NULL;
05962f95 6735 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 6736 io_ring_set_wakeup_flag(ctx);
05962f95 6737 up_write(&sqd->rw_lock);
521d6a73
PB
6738
6739 io_run_task_work();
37d1e2e3
JA
6740 complete(&sqd->exited);
6741 do_exit(0);
6c271ce2
JA
6742}
6743
bda52162
JA
6744struct io_wait_queue {
6745 struct wait_queue_entry wq;
6746 struct io_ring_ctx *ctx;
6747 unsigned to_wait;
6748 unsigned nr_timeouts;
6749};
6750
6c503150 6751static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
6752{
6753 struct io_ring_ctx *ctx = iowq->ctx;
6754
6755 /*
d195a66e 6756 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
6757 * started waiting. For timeouts, we always want to return to userspace,
6758 * regardless of event count.
6759 */
6c503150 6760 return io_cqring_events(ctx) >= iowq->to_wait ||
bda52162
JA
6761 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6762}
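/*
 * Example: a waiter with to_wait == 4 is woken once at least four CQEs
 * are visible, or as soon as a timeout completion has bumped
 * cq_timeouts since the wait began, whichever happens first.
 */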
6763
6764static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6765 int wake_flags, void *key)
6766{
6767 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6768 wq);
6769
6c503150
PB
6770 /*
6771 * Cannot safely flush overflowed CQEs from here, ensure we wake up
6772 * the task, and the next invocation will do it.
6773 */
6774 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6775 return autoremove_wake_function(curr, mode, wake_flags, key);
6776 return -1;
bda52162
JA
6777}
6778
af9c1a44
JA
6779static int io_run_task_work_sig(void)
6780{
6781 if (io_run_task_work())
6782 return 1;
6783 if (!signal_pending(current))
6784 return 0;
792ee0f6
JA
6785 if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
6786 return -ERESTARTSYS;
af9c1a44
JA
6787 return -EINTR;
6788}
6789
eeb60b9a
PB
6790/* when returns >0, the caller should retry */
6791static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6792 struct io_wait_queue *iowq,
6793 signed long *timeout)
6794{
6795 int ret;
6796
6797 /* make sure we run task_work before checking for signals */
6798 ret = io_run_task_work_sig();
6799 if (ret || io_should_wake(iowq))
6800 return ret;
6801 /* let the caller flush overflows, retry */
6802 if (test_bit(0, &ctx->cq_check_overflow))
6803 return 1;
6804
6805 *timeout = schedule_timeout(*timeout);
6806 return !*timeout ? -ETIME : 1;
6807}
6808
2b188cc1
JA
6809/*
6810 * Wait until events become available, if we don't already have some. The
6811 * application must reap them itself, as they reside on the shared cq ring.
6812 */
6813static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
6814 const sigset_t __user *sig, size_t sigsz,
6815 struct __kernel_timespec __user *uts)
2b188cc1 6816{
bda52162
JA
6817 struct io_wait_queue iowq = {
6818 .wq = {
6819 .private = current,
6820 .func = io_wake_function,
6821 .entry = LIST_HEAD_INIT(iowq.wq.entry),
6822 },
6823 .ctx = ctx,
6824 .to_wait = min_events,
6825 };
75b28aff 6826 struct io_rings *rings = ctx->rings;
c1d5a224
PB
6827 signed long timeout = MAX_SCHEDULE_TIMEOUT;
6828 int ret;
2b188cc1 6829
b41e9852 6830 do {
6c503150
PB
6831 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6832 if (io_cqring_events(ctx) >= min_events)
b41e9852 6833 return 0;
4c6e277c 6834 if (!io_run_task_work())
b41e9852 6835 break;
b41e9852 6836 } while (1);
2b188cc1
JA
6837
6838 if (sig) {
9e75ad5d
AB
6839#ifdef CONFIG_COMPAT
6840 if (in_compat_syscall())
6841 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 6842 sigsz);
9e75ad5d
AB
6843 else
6844#endif
b772434b 6845 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 6846
2b188cc1
JA
6847 if (ret)
6848 return ret;
6849 }
6850
c73ebb68 6851 if (uts) {
c1d5a224
PB
6852 struct timespec64 ts;
6853
c73ebb68
HX
6854 if (get_timespec64(&ts, uts))
6855 return -EFAULT;
6856 timeout = timespec64_to_jiffies(&ts);
6857 }
6858
bda52162 6859 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 6860 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 6861 do {
ca0a2651
JA
6862 /* if we can't even flush overflow, don't wait for more */
6863 if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
6864 ret = -EBUSY;
6865 break;
6866 }
bda52162
JA
6867 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6868 TASK_INTERRUPTIBLE);
eeb60b9a
PB
6869 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
6870 finish_wait(&ctx->wait, &iowq.wq);
ca0a2651 6871 cond_resched();
eeb60b9a 6872 } while (ret > 0);
bda52162 6873
b7db41c9 6874 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 6875
75b28aff 6876 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
6877}
6878
6b06314c
JA
6879static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6880{
6881#if defined(CONFIG_UNIX)
6882 if (ctx->ring_sock) {
6883 struct sock *sock = ctx->ring_sock->sk;
6884 struct sk_buff *skb;
6885
6886 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6887 kfree_skb(skb);
6888 }
6889#else
6890 int i;
6891
65e19f54
JA
6892 for (i = 0; i < ctx->nr_user_files; i++) {
6893 struct file *file;
6894
6895 file = io_file_from_index(ctx, i);
6896 if (file)
6897 fput(file);
6898 }
6b06314c
JA
6899#endif
6900}
6901
00835dce 6902static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
05f3fb3c 6903{
269bbe5f 6904 struct fixed_rsrc_data *data;
05f3fb3c 6905
269bbe5f 6906 data = container_of(ref, struct fixed_rsrc_data, refs);
05f3fb3c
JA
6907 complete(&data->done);
6908}
6909
2a63b2d9 6910static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
1642b445 6911{
2a63b2d9 6912 spin_lock_bh(&ctx->rsrc_ref_lock);
1642b445
PB
6913}
6914
2a63b2d9 6915static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
6b06314c 6916{
2a63b2d9
BM
6917 spin_unlock_bh(&ctx->rsrc_ref_lock);
6918}
65e19f54 6919
d67d2263
BM
6920static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
6921 struct fixed_rsrc_data *rsrc_data,
269bbe5f 6922 struct fixed_rsrc_ref_node *ref_node)
1642b445 6923{
2a63b2d9 6924 io_rsrc_ref_lock(ctx);
269bbe5f 6925 rsrc_data->node = ref_node;
d67d2263 6926 list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
2a63b2d9 6927 io_rsrc_ref_unlock(ctx);
269bbe5f 6928 percpu_ref_get(&rsrc_data->refs);
1642b445
PB
6929}
6930
8bad28d8 6931static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6b06314c 6932{
8bad28d8 6933 struct fixed_rsrc_ref_node *ref_node = NULL;
6b06314c 6934
2a63b2d9 6935 io_rsrc_ref_lock(ctx);
1e5d770b 6936 ref_node = data->node;
e6cb007c 6937 data->node = NULL;
2a63b2d9 6938 io_rsrc_ref_unlock(ctx);
05589553
XW
6939 if (ref_node)
6940 percpu_ref_kill(&ref_node->refs);
8bad28d8
HX
6941}
6942
6943static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
6944 struct io_ring_ctx *ctx,
f2303b1f
PB
6945 void (*rsrc_put)(struct io_ring_ctx *ctx,
6946 struct io_rsrc_put *prsrc))
8bad28d8 6947{
f2303b1f 6948 struct fixed_rsrc_ref_node *backup_node;
8bad28d8 6949 int ret;
05589553 6950
8bad28d8
HX
6951 if (data->quiesce)
6952 return -ENXIO;
05589553 6953
8bad28d8 6954 data->quiesce = true;
1ffc5422 6955 do {
f2303b1f
PB
6956 ret = -ENOMEM;
6957 backup_node = alloc_fixed_rsrc_ref_node(ctx);
6958 if (!backup_node)
6959 break;
6960 backup_node->rsrc_data = data;
6961 backup_node->rsrc_put = rsrc_put;
6962
8bad28d8
HX
6963 io_sqe_rsrc_kill_node(ctx, data);
6964 percpu_ref_kill(&data->refs);
6965 flush_delayed_work(&ctx->rsrc_put_work);
6966
1ffc5422
PB
6967 ret = wait_for_completion_interruptible(&data->done);
6968 if (!ret)
6969 break;
8bad28d8 6970
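 /*
  * Interrupted while waiting for the refs to drop: revive the data refs,
  * reinstall the backup node, then drop the uring lock to run task work
  * (so pending requests can release their references) and retry.
  */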
cb5e1b81 6971 percpu_ref_resurrect(&data->refs);
8bad28d8
HX
6972 io_sqe_rsrc_set_node(ctx, data, backup_node);
6973 backup_node = NULL;
cb5e1b81 6974 reinit_completion(&data->done);
8bad28d8 6975 mutex_unlock(&ctx->uring_lock);
1ffc5422 6976 ret = io_run_task_work_sig();
8bad28d8 6977 mutex_lock(&ctx->uring_lock);
f2303b1f 6978 } while (ret >= 0);
8bad28d8 6979 data->quiesce = false;
05f3fb3c 6980
8bad28d8
HX
6981 if (backup_node)
6982 destroy_fixed_rsrc_ref_node(backup_node);
6983 return ret;
d7954b2b
BM
6984}
6985
1ad555c6
BM
6986static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
6987{
6988 struct fixed_rsrc_data *data;
6989
6990 data = kzalloc(sizeof(*data), GFP_KERNEL);
6991 if (!data)
6992 return NULL;
6993
00835dce 6994 if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
1ad555c6
BM
6995 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6996 kfree(data);
6997 return NULL;
6998 }
6999 data->ctx = ctx;
7000 init_completion(&data->done);
7001 return data;
7002}
7003
7004static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
7005{
7006 percpu_ref_exit(&data->refs);
7007 kfree(data->table);
7008 kfree(data);
7009}
7010
d7954b2b
BM
7011static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7012{
7013 struct fixed_rsrc_data *data = ctx->file_data;
d7954b2b
BM
7014 unsigned nr_tables, i;
7015 int ret;
7016
8bad28d8
HX
7017 /*
7018 * percpu_ref_is_dying() is used to stop parallel files unregister,
7019 * since we may drop the uring lock later in this function to
7020 * run task work.
7021 */
7022 if (!data || percpu_ref_is_dying(&data->refs))
d7954b2b 7023 return -ENXIO;
f2303b1f 7024 ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
d7954b2b
BM
7025 if (ret)
7026 return ret;
7027
6b06314c 7028 __io_sqe_files_unregister(ctx);
65e19f54
JA
7029 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7030 for (i = 0; i < nr_tables; i++)
05f3fb3c 7031 kfree(data->table[i].files);
1ad555c6 7032 free_fixed_rsrc_data(data);
05f3fb3c 7033 ctx->file_data = NULL;
6b06314c
JA
7034 ctx->nr_user_files = 0;
7035 return 0;
7036}
7037
37d1e2e3 7038static void io_sq_thread_unpark(struct io_sq_data *sqd)
05962f95 7039 __releases(&sqd->rw_lock)
37d1e2e3 7040{
521d6a73
PB
7041 WARN_ON_ONCE(sqd->thread == current);
7042
37d1e2e3 7043 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
05962f95 7044 up_write(&sqd->rw_lock);
37d1e2e3
JA
7045}
7046
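 /*
  * Parking protocol: setting IO_SQ_THREAD_SHOULD_PARK tells the SQPOLL
  * thread to release sqd->rw_lock at its next parking point; taking the
  * lock for writing then keeps it parked until io_sq_thread_unpark().
  */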
86e0d676 7047static void io_sq_thread_park(struct io_sq_data *sqd)
05962f95 7048 __acquires(&sqd->rw_lock)
37d1e2e3 7049{
521d6a73
PB
7050 WARN_ON_ONCE(sqd->thread == current);
7051
86e0d676 7052 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
05962f95
JA
7053 down_write(&sqd->rw_lock);
7054 /* set again for consistency, in case concurrent parks are happening */
7055 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7056 if (sqd->thread)
86e0d676 7057 wake_up_process(sqd->thread);
37d1e2e3
JA
7058}
7059
7060static void io_sq_thread_stop(struct io_sq_data *sqd)
7061{
521d6a73
PB
7062 WARN_ON_ONCE(sqd->thread == current);
7063
05962f95 7064 down_write(&sqd->rw_lock);
05962f95 7065 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
e8f98f24
JA
7066 if (sqd->thread)
7067 wake_up_process(sqd->thread);
05962f95
JA
7068 up_write(&sqd->rw_lock);
7069 wait_for_completion(&sqd->exited);
37d1e2e3
JA
7070}
7071
534ca6d6 7072static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 7073{
534ca6d6 7074 if (refcount_dec_and_test(&sqd->refs)) {
37d1e2e3
JA
7075 io_sq_thread_stop(sqd);
7076 kfree(sqd);
7077 }
7078}
7079
7080static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7081{
7082 struct io_sq_data *sqd = ctx->sq_data;
7083
7084 if (sqd) {
05962f95 7085 io_sq_thread_park(sqd);
521d6a73 7086 list_del_init(&ctx->sqd_list);
37d1e2e3 7087 io_sqd_update_thread_idle(sqd);
05962f95 7088 io_sq_thread_unpark(sqd);
37d1e2e3
JA
7089
7090 io_put_sq_data(sqd);
7091 ctx->sq_data = NULL;
7c30f36a
SM
7092 if (ctx->sq_creds)
7093 put_cred(ctx->sq_creds);
534ca6d6
JA
7094 }
7095}
7096
aa06165d
JA
7097static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7098{
7099 struct io_ring_ctx *ctx_attach;
7100 struct io_sq_data *sqd;
7101 struct fd f;
7102
7103 f = fdget(p->wq_fd);
7104 if (!f.file)
7105 return ERR_PTR(-ENXIO);
7106 if (f.file->f_op != &io_uring_fops) {
7107 fdput(f);
7108 return ERR_PTR(-EINVAL);
7109 }
7110
7111 ctx_attach = f.file->private_data;
7112 sqd = ctx_attach->sq_data;
7113 if (!sqd) {
7114 fdput(f);
7115 return ERR_PTR(-EINVAL);
7116 }
5c2469e0
JA
7117 if (sqd->task_tgid != current->tgid) {
7118 fdput(f);
7119 return ERR_PTR(-EPERM);
7120 }
aa06165d
JA
7121
7122 refcount_inc(&sqd->refs);
7123 fdput(f);
7124 return sqd;
7125}
7126
26984fbf
PB
7127static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7128 bool *attached)
534ca6d6
JA
7129{
7130 struct io_sq_data *sqd;
7131
26984fbf 7132 *attached = false;
5c2469e0
JA
7133 if (p->flags & IORING_SETUP_ATTACH_WQ) {
7134 sqd = io_attach_sq_data(p);
26984fbf
PB
7135 if (!IS_ERR(sqd)) {
7136 *attached = true;
5c2469e0 7137 return sqd;
26984fbf 7138 }
5c2469e0
JA
7139 /* fall through for EPERM case, setup new sqd/task */
7140 if (PTR_ERR(sqd) != -EPERM)
7141 return sqd;
7142 }
aa06165d 7143
534ca6d6
JA
7144 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7145 if (!sqd)
7146 return ERR_PTR(-ENOMEM);
7147
7148 refcount_set(&sqd->refs, 1);
69fb2131 7149 INIT_LIST_HEAD(&sqd->ctx_list);
05962f95 7150 init_rwsem(&sqd->rw_lock);
534ca6d6 7151 init_waitqueue_head(&sqd->wait);
37d1e2e3 7152 init_completion(&sqd->exited);
534ca6d6
JA
7153 return sqd;
7154}
7155
6b06314c 7156#if defined(CONFIG_UNIX)
6b06314c
JA
7157/*
7158 * Ensure the UNIX gc is aware of our file set, so we are certain that
7159 * the io_uring can be safely unregistered on process exit, even if we have
7160 * loops in the file referencing.
7161 */
7162static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7163{
7164 struct sock *sk = ctx->ring_sock->sk;
7165 struct scm_fp_list *fpl;
7166 struct sk_buff *skb;
08a45173 7167 int i, nr_files;
6b06314c 7168
6b06314c
JA
7169 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7170 if (!fpl)
7171 return -ENOMEM;
7172
7173 skb = alloc_skb(0, GFP_KERNEL);
7174 if (!skb) {
7175 kfree(fpl);
7176 return -ENOMEM;
7177 }
7178
7179 skb->sk = sk;
6b06314c 7180
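 /*
  * Attach each registered file to an SCM_RIGHTS skb parked on the ring
  * socket's receive queue and mark it in-flight, so the UNIX garbage
  * collector can see (and break) reference cycles through the ring.
  */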
08a45173 7181 nr_files = 0;
62e398be 7182 fpl->user = get_uid(current_user());
6b06314c 7183 for (i = 0; i < nr; i++) {
65e19f54
JA
7184 struct file *file = io_file_from_index(ctx, i + offset);
7185
7186 if (!file)
08a45173 7187 continue;
65e19f54 7188 fpl->fp[nr_files] = get_file(file);
08a45173
JA
7189 unix_inflight(fpl->user, fpl->fp[nr_files]);
7190 nr_files++;
6b06314c
JA
7191 }
7192
08a45173
JA
7193 if (nr_files) {
7194 fpl->max = SCM_MAX_FD;
7195 fpl->count = nr_files;
7196 UNIXCB(skb).fp = fpl;
05f3fb3c 7197 skb->destructor = unix_destruct_scm;
08a45173
JA
7198 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7199 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 7200
08a45173
JA
7201 for (i = 0; i < nr_files; i++)
7202 fput(fpl->fp[i]);
7203 } else {
7204 kfree_skb(skb);
7205 kfree(fpl);
7206 }
6b06314c
JA
7207
7208 return 0;
7209}
7210
7211/*
7212 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7213 * causes regular reference counting to break down. We rely on the UNIX
7214 * garbage collection to take care of this problem for us.
7215 */
7216static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7217{
7218 unsigned left, total;
7219 int ret = 0;
7220
7221 total = 0;
7222 left = ctx->nr_user_files;
7223 while (left) {
7224 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
7225
7226 ret = __io_sqe_files_scm(ctx, this_files, total);
7227 if (ret)
7228 break;
7229 left -= this_files;
7230 total += this_files;
7231 }
7232
7233 if (!ret)
7234 return 0;
7235
7236 while (total < ctx->nr_user_files) {
65e19f54
JA
7237 struct file *file = io_file_from_index(ctx, total);
7238
7239 if (file)
7240 fput(file);
6b06314c
JA
7241 total++;
7242 }
7243
7244 return ret;
7245}
7246#else
7247static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7248{
7249 return 0;
7250}
7251#endif
7252
269bbe5f 7253static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
5398ae69 7254 unsigned nr_tables, unsigned nr_files)
65e19f54
JA
7255{
7256 int i;
7257
7258 for (i = 0; i < nr_tables; i++) {
269bbe5f 7259 struct fixed_rsrc_table *table = &file_data->table[i];
65e19f54
JA
7260 unsigned this_files;
7261
7262 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7263 table->files = kcalloc(this_files, sizeof(struct file *),
7264 GFP_KERNEL);
7265 if (!table->files)
7266 break;
7267 nr_files -= this_files;
7268 }
7269
7270 if (i == nr_tables)
7271 return 0;
7272
7273 for (i = 0; i < nr_tables; i++) {
269bbe5f 7274 struct fixed_rsrc_table *table = &file_data->table[i];
65e19f54
JA
7275 kfree(table->files);
7276 }
7277 return 1;
7278}
7279
50238531 7280static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 7281{
50238531 7282 struct file *file = prsrc->file;
05f3fb3c
JA
7283#if defined(CONFIG_UNIX)
7284 struct sock *sock = ctx->ring_sock->sk;
7285 struct sk_buff_head list, *head = &sock->sk_receive_queue;
7286 struct sk_buff *skb;
7287 int i;
7288
7289 __skb_queue_head_init(&list);
7290
7291 /*
7292 * Find the skb that holds this file in its SCM_RIGHTS. When found,
7293 * remove this entry and rearrange the file array.
7294 */
7295 skb = skb_dequeue(head);
7296 while (skb) {
7297 struct scm_fp_list *fp;
7298
7299 fp = UNIXCB(skb).fp;
7300 for (i = 0; i < fp->count; i++) {
7301 int left;
7302
7303 if (fp->fp[i] != file)
7304 continue;
7305
7306 unix_notinflight(fp->user, fp->fp[i]);
7307 left = fp->count - 1 - i;
7308 if (left) {
7309 memmove(&fp->fp[i], &fp->fp[i + 1],
7310 left * sizeof(struct file *));
7311 }
7312 fp->count--;
7313 if (!fp->count) {
7314 kfree_skb(skb);
7315 skb = NULL;
7316 } else {
7317 __skb_queue_tail(&list, skb);
7318 }
7319 fput(file);
7320 file = NULL;
7321 break;
7322 }
7323
7324 if (!file)
7325 break;
7326
7327 __skb_queue_tail(&list, skb);
7328
7329 skb = skb_dequeue(head);
7330 }
7331
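 /* put back the skbs we set aside while searching */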
7332 if (skb_peek(&list)) {
7333 spin_lock_irq(&head->lock);
7334 while ((skb = __skb_dequeue(&list)) != NULL)
7335 __skb_queue_tail(head, skb);
7336 spin_unlock_irq(&head->lock);
7337 }
7338#else
7339 fput(file);
7340#endif
7341}
7342
269bbe5f 7343static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
65e19f54 7344{
269bbe5f
BM
7345 struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
7346 struct io_ring_ctx *ctx = rsrc_data->ctx;
7347 struct io_rsrc_put *prsrc, *tmp;
05589553 7348
269bbe5f
BM
7349 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7350 list_del(&prsrc->list);
50238531 7351 ref_node->rsrc_put(ctx, prsrc);
269bbe5f 7352 kfree(prsrc);
65e19f54 7353 }
05589553 7354
05589553
XW
7355 percpu_ref_exit(&ref_node->refs);
7356 kfree(ref_node);
269bbe5f 7357 percpu_ref_put(&rsrc_data->refs);
2faf852d 7358}
65e19f54 7359
269bbe5f 7360static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
7361{
7362 struct io_ring_ctx *ctx;
7363 struct llist_node *node;
7364
269bbe5f
BM
7365 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7366 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
7367
7368 while (node) {
269bbe5f 7369 struct fixed_rsrc_ref_node *ref_node;
4a38aed2
JA
7370 struct llist_node *next = node->next;
7371
269bbe5f
BM
7372 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7373 __io_rsrc_put_work(ref_node);
4a38aed2
JA
7374 node = next;
7375 }
7376}
7377
ea64ec02
PB
7378static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7379 unsigned i)
2faf852d 7380{
ea64ec02
PB
7381 struct fixed_rsrc_table *table;
7382
7383 table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7384 return &table->files[i & IORING_FILE_TABLE_MASK];
7385}
7386
00835dce 7387static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
2faf852d 7388{
269bbe5f
BM
7389 struct fixed_rsrc_ref_node *ref_node;
7390 struct fixed_rsrc_data *data;
4a38aed2 7391 struct io_ring_ctx *ctx;
e297822b 7392 bool first_add = false;
4a38aed2 7393 int delay = HZ;
65e19f54 7394
269bbe5f
BM
7395 ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7396 data = ref_node->rsrc_data;
e297822b
PB
7397 ctx = data->ctx;
7398
2a63b2d9 7399 io_rsrc_ref_lock(ctx);
e297822b
PB
7400 ref_node->done = true;
7401
d67d2263
BM
7402 while (!list_empty(&ctx->rsrc_ref_list)) {
7403 ref_node = list_first_entry(&ctx->rsrc_ref_list,
269bbe5f 7404 struct fixed_rsrc_ref_node, node);
e297822b
PB
7405 /* recycle ref nodes in order */
7406 if (!ref_node->done)
7407 break;
7408 list_del(&ref_node->node);
269bbe5f 7409 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
e297822b 7410 }
2a63b2d9 7411 io_rsrc_ref_unlock(ctx);
05589553 7412
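 /*
  * If the rsrc data itself is going away, flush the put work right
  * away; otherwise batch the puts behind a short delay.
  */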
e297822b 7413 if (percpu_ref_is_dying(&data->refs))
4a38aed2 7414 delay = 0;
05589553 7415
4a38aed2 7416 if (!delay)
269bbe5f 7417 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
4a38aed2 7418 else if (first_add)
269bbe5f 7419 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
05f3fb3c 7420}
65e19f54 7421
6802535d 7422static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
05589553 7423 struct io_ring_ctx *ctx)
05f3fb3c 7424{
269bbe5f 7425 struct fixed_rsrc_ref_node *ref_node;
05f3fb3c 7426
05589553
XW
7427 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7428 if (!ref_node)
3e2224c5 7429 return NULL;
05f3fb3c 7430
00835dce 7431 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
05589553
XW
7432 0, GFP_KERNEL)) {
7433 kfree(ref_node);
3e2224c5 7434 return NULL;
05589553
XW
7435 }
7436 INIT_LIST_HEAD(&ref_node->node);
269bbe5f 7437 INIT_LIST_HEAD(&ref_node->rsrc_list);
e297822b 7438 ref_node->done = false;
05589553 7439 return ref_node;
05589553
XW
7440}
7441
bc9744cd
PB
7442static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
7443 struct fixed_rsrc_ref_node *ref_node)
6802535d 7444{
269bbe5f 7445 ref_node->rsrc_data = ctx->file_data;
50238531 7446 ref_node->rsrc_put = io_ring_file_put;
05589553
XW
7447}
7448
269bbe5f 7449static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
05589553
XW
7450{
7451 percpu_ref_exit(&ref_node->refs);
7452 kfree(ref_node);
65e19f54
JA
7453}
7454
ea64ec02 7455
6b06314c
JA
7456static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7457 unsigned nr_args)
7458{
7459 __s32 __user *fds = (__s32 __user *) arg;
600cf3f8 7460 unsigned nr_tables, i;
05f3fb3c 7461 struct file *file;
600cf3f8 7462 int fd, ret = -ENOMEM;
269bbe5f
BM
7463 struct fixed_rsrc_ref_node *ref_node;
7464 struct fixed_rsrc_data *file_data;
6b06314c 7465
05f3fb3c 7466 if (ctx->file_data)
6b06314c
JA
7467 return -EBUSY;
7468 if (!nr_args)
7469 return -EINVAL;
7470 if (nr_args > IORING_MAX_FIXED_FILES)
7471 return -EMFILE;
7472
1ad555c6 7473 file_data = alloc_fixed_rsrc_data(ctx);
5398ae69 7474 if (!file_data)
05f3fb3c 7475 return -ENOMEM;
13770a71 7476 ctx->file_data = file_data;
05f3fb3c 7477
65e19f54 7478 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
035fbafc 7479 file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
5398ae69 7480 GFP_KERNEL);
600cf3f8
PB
7481 if (!file_data->table)
7482 goto out_free;
05f3fb3c 7483
600cf3f8 7484 if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
1ad555c6 7485 goto out_free;
65e19f54 7486
08a45173 7487 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
600cf3f8
PB
7488 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7489 ret = -EFAULT;
7490 goto out_fput;
7491 }
08a45173 7492 /* allow sparse sets */
600cf3f8 7493 if (fd == -1)
08a45173 7494 continue;
6b06314c 7495
05f3fb3c 7496 file = fget(fd);
6b06314c 7497 ret = -EBADF;
05f3fb3c 7498 if (!file)
600cf3f8 7499 goto out_fput;
05f3fb3c 7500
6b06314c
JA
7501 /*
7502 * Don't allow io_uring instances to be registered. If UNIX
7503 * isn't enabled, then this causes a reference cycle and this
7504 * instance can never get freed. If UNIX is enabled we'll
7505 * handle it just fine, but there's still no point in allowing
7506 * a ring fd as it doesn't support regular read/write anyway.
7507 */
05f3fb3c
JA
7508 if (file->f_op == &io_uring_fops) {
7509 fput(file);
600cf3f8 7510 goto out_fput;
6b06314c 7511 }
ea64ec02 7512 *io_fixed_file_slot(file_data, i) = file;
6b06314c
JA
7513 }
7514
6b06314c 7515 ret = io_sqe_files_scm(ctx);
05589553 7516 if (ret) {
6b06314c 7517 io_sqe_files_unregister(ctx);
05589553
XW
7518 return ret;
7519 }
6b06314c 7520
bc9744cd 7521 ref_node = alloc_fixed_rsrc_ref_node(ctx);
3e2224c5 7522 if (!ref_node) {
05589553 7523 io_sqe_files_unregister(ctx);
3e2224c5 7524 return -ENOMEM;
05589553 7525 }
bc9744cd 7526 init_fixed_file_ref_node(ctx, ref_node);
05589553 7527
d67d2263 7528 io_sqe_rsrc_set_node(ctx, file_data, ref_node);
6b06314c 7529 return ret;
600cf3f8
PB
7530out_fput:
7531 for (i = 0; i < ctx->nr_user_files; i++) {
7532 file = io_file_from_index(ctx, i);
7533 if (file)
7534 fput(file);
7535 }
7536 for (i = 0; i < nr_tables; i++)
7537 kfree(file_data->table[i].files);
7538 ctx->nr_user_files = 0;
600cf3f8 7539out_free:
1ad555c6 7540 free_fixed_rsrc_data(ctx->file_data);
55cbc256 7541 ctx->file_data = NULL;
6b06314c
JA
7542 return ret;
7543}
7544
c3a31e60
JA
7545static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
7546 int index)
7547{
7548#if defined(CONFIG_UNIX)
7549 struct sock *sock = ctx->ring_sock->sk;
7550 struct sk_buff_head *head = &sock->sk_receive_queue;
7551 struct sk_buff *skb;
7552
7553 /*
7554 * See if we can merge this file into an existing skb SCM_RIGHTS
7555 * file set. If there's no room, fall back to allocating a new skb
7556 * and filling it in.
7557 */
7558 spin_lock_irq(&head->lock);
7559 skb = skb_peek(head);
7560 if (skb) {
7561 struct scm_fp_list *fpl = UNIXCB(skb).fp;
7562
7563 if (fpl->count < SCM_MAX_FD) {
7564 __skb_unlink(skb, head);
7565 spin_unlock_irq(&head->lock);
7566 fpl->fp[fpl->count] = get_file(file);
7567 unix_inflight(fpl->user, fpl->fp[fpl->count]);
7568 fpl->count++;
7569 spin_lock_irq(&head->lock);
7570 __skb_queue_head(head, skb);
7571 } else {
7572 skb = NULL;
7573 }
7574 }
7575 spin_unlock_irq(&head->lock);
7576
7577 if (skb) {
7578 fput(file);
7579 return 0;
7580 }
7581
7582 return __io_sqe_files_scm(ctx, 1, index);
7583#else
7584 return 0;
7585#endif
7586}
7587
50238531 7588static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
05f3fb3c 7589{
269bbe5f
BM
7590 struct io_rsrc_put *prsrc;
7591 struct fixed_rsrc_ref_node *ref_node = data->node;
05f3fb3c 7592
269bbe5f
BM
7593 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7594 if (!prsrc)
a5318d3c 7595 return -ENOMEM;
05f3fb3c 7596
50238531 7597 prsrc->rsrc = rsrc;
269bbe5f 7598 list_add(&prsrc->list, &ref_node->rsrc_list);
05589553 7599
a5318d3c 7600 return 0;
05f3fb3c
JA
7601}
7602
269bbe5f
BM
7603static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
7604 struct file *file)
7605{
50238531 7606 return io_queue_rsrc_removal(data, (void *)file);
269bbe5f
BM
7607}
7608
05f3fb3c 7609static int __io_sqe_files_update(struct io_ring_ctx *ctx,
269bbe5f 7610 struct io_uring_rsrc_update *up,
05f3fb3c
JA
7611 unsigned nr_args)
7612{
269bbe5f
BM
7613 struct fixed_rsrc_data *data = ctx->file_data;
7614 struct fixed_rsrc_ref_node *ref_node;
ea64ec02 7615 struct file *file, **file_slot;
c3a31e60
JA
7616 __s32 __user *fds;
7617 int fd, i, err;
7618 __u32 done;
05589553 7619 bool needs_switch = false;
c3a31e60 7620
05f3fb3c 7621 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
7622 return -EOVERFLOW;
7623 if (done > ctx->nr_user_files)
7624 return -EINVAL;
7625
bc9744cd 7626 ref_node = alloc_fixed_rsrc_ref_node(ctx);
3e2224c5
MWO
7627 if (!ref_node)
7628 return -ENOMEM;
bc9744cd 7629 init_fixed_file_ref_node(ctx, ref_node);
05589553 7630
269bbe5f 7631 fds = u64_to_user_ptr(up->data);
67973b93 7632 for (done = 0; done < nr_args; done++) {
c3a31e60
JA
7633 err = 0;
7634 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7635 err = -EFAULT;
7636 break;
7637 }
4e0377a1 7638 if (fd == IORING_REGISTER_FILES_SKIP)
7639 continue;
7640
67973b93 7641 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
ea64ec02
PB
7642 file_slot = io_fixed_file_slot(ctx->file_data, i);
7643
7644 if (*file_slot) {
7645 err = io_queue_file_removal(data, *file_slot);
a5318d3c
HD
7646 if (err)
7647 break;
ea64ec02 7648 *file_slot = NULL;
05589553 7649 needs_switch = true;
c3a31e60
JA
7650 }
7651 if (fd != -1) {
c3a31e60
JA
7652 file = fget(fd);
7653 if (!file) {
7654 err = -EBADF;
7655 break;
7656 }
7657 /*
7658 * Don't allow io_uring instances to be registered. If
7659 * UNIX isn't enabled, then this causes a reference
7660 * cycle and this instance can never get freed. If UNIX
7661 * is enabled we'll handle it just fine, but there's
7662 * still no point in allowing a ring fd as it doesn't
7663 * support regular read/write anyway.
7664 */
7665 if (file->f_op == &io_uring_fops) {
7666 fput(file);
7667 err = -EBADF;
7668 break;
7669 }
e68a3ff8 7670 *file_slot = file;
c3a31e60 7671 err = io_sqe_file_register(ctx, file, i);
f3bd9dae 7672 if (err) {
e68a3ff8 7673 *file_slot = NULL;
f3bd9dae 7674 fput(file);
c3a31e60 7675 break;
f3bd9dae 7676 }
c3a31e60 7677 }
05f3fb3c
JA
7678 }
7679
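 /*
  * If any slot was replaced, kill the old ref node so the files queued
  * for removal get put once outstanding requests finish, and switch to
  * the freshly allocated node; otherwise the new node goes unused.
  */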
05589553 7680 if (needs_switch) {
b2e96852 7681 percpu_ref_kill(&data->node->refs);
d67d2263 7682 io_sqe_rsrc_set_node(ctx, data, ref_node);
05589553 7683 } else
269bbe5f 7684 destroy_fixed_rsrc_ref_node(ref_node);
c3a31e60
JA
7685
7686 return done ? done : err;
7687}
05589553 7688
05f3fb3c
JA
7689static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7690 unsigned nr_args)
7691{
269bbe5f 7692 struct io_uring_rsrc_update up;
05f3fb3c
JA
7693
7694 if (!ctx->file_data)
7695 return -ENXIO;
7696 if (!nr_args)
7697 return -EINVAL;
7698 if (copy_from_user(&up, arg, sizeof(up)))
7699 return -EFAULT;
7700 if (up.resv)
7701 return -EINVAL;
7702
7703 return __io_sqe_files_update(ctx, &up, nr_args);
7704}
c3a31e60 7705
5280f7e5 7706static struct io_wq_work *io_free_work(struct io_wq_work *work)
7d723065
JA
7707{
7708 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7709
5280f7e5
PB
7710 req = io_put_req_find_next(req);
7711 return req ? &req->work : NULL;
7d723065
JA
7712}
7713
5aa75ed5 7714static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
24369c2e 7715{
e941894e 7716 struct io_wq_hash *hash;
24369c2e 7717 struct io_wq_data data;
24369c2e 7718 unsigned int concurrency;
24369c2e 7719
e941894e
JA
7720 hash = ctx->hash_map;
7721 if (!hash) {
7722 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
7723 if (!hash)
7724 return ERR_PTR(-ENOMEM);
7725 refcount_set(&hash->refs, 1);
7726 init_waitqueue_head(&hash->wait);
7727 ctx->hash_map = hash;
24369c2e
PB
7728 }
7729
e941894e 7730 data.hash = hash;
e9fd9396 7731 data.free_work = io_free_work;
f5fa38c5 7732 data.do_work = io_wq_submit_work;
24369c2e 7733
d25e3a3d
JA
7734 /* Do QD, or 4 * CPUS, whichever is smaller */
7735 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 7736
5aa75ed5 7737 return io_wq_create(concurrency, &data);
24369c2e
PB
7738}
7739
5aa75ed5
JA
7740static int io_uring_alloc_task_context(struct task_struct *task,
7741 struct io_ring_ctx *ctx)
0f212204
JA
7742{
7743 struct io_uring_task *tctx;
d8a6df10 7744 int ret;
0f212204
JA
7745
7746 tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7747 if (unlikely(!tctx))
7748 return -ENOMEM;
7749
d8a6df10
JA
7750 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7751 if (unlikely(ret)) {
7752 kfree(tctx);
7753 return ret;
7754 }
7755
5aa75ed5
JA
7756 tctx->io_wq = io_init_wq_offload(ctx);
7757 if (IS_ERR(tctx->io_wq)) {
7758 ret = PTR_ERR(tctx->io_wq);
7759 percpu_counter_destroy(&tctx->inflight);
7760 kfree(tctx);
7761 return ret;
7762 }
7763
0f212204
JA
7764 xa_init(&tctx->xa);
7765 init_waitqueue_head(&tctx->wait);
7766 tctx->last = NULL;
fdaf083c 7767 atomic_set(&tctx->in_idle, 0);
0f212204 7768 task->io_uring = tctx;
7cbf1722
JA
7769 spin_lock_init(&tctx->task_lock);
7770 INIT_WQ_LIST(&tctx->task_list);
7771 tctx->task_state = 0;
7772 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
7773 return 0;
7774}
7775
7776void __io_uring_free(struct task_struct *tsk)
7777{
7778 struct io_uring_task *tctx = tsk->io_uring;
7779
7780 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e
PB
7781 WARN_ON_ONCE(tctx->io_wq);
7782
d8a6df10 7783 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
7784 kfree(tctx);
7785 tsk->io_uring = NULL;
7786}
7787
7e84e1c7
SG
7788static int io_sq_offload_create(struct io_ring_ctx *ctx,
7789 struct io_uring_params *p)
2b188cc1
JA
7790{
7791 int ret;
7792
d25e3a3d
JA
7793 /* Retain compatibility with failing for an invalid attach attempt */
7794 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7795 IORING_SETUP_ATTACH_WQ) {
7796 struct fd f;
7797
7798 f = fdget(p->wq_fd);
7799 if (!f.file)
7800 return -ENXIO;
7801 if (f.file->f_op != &io_uring_fops) {
7802 fdput(f);
7803 return -EINVAL;
7804 }
7805 fdput(f);
7806 }
6c271ce2 7807 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 7808 struct task_struct *tsk;
534ca6d6 7809 struct io_sq_data *sqd;
26984fbf 7810 bool attached;
534ca6d6 7811
3ec482d1 7812 ret = -EPERM;
ce59fc69 7813 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
3ec482d1
JA
7814 goto err;
7815
26984fbf 7816 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
7817 if (IS_ERR(sqd)) {
7818 ret = PTR_ERR(sqd);
7819 goto err;
7820 }
69fb2131 7821
7c30f36a 7822 ctx->sq_creds = get_current_cred();
534ca6d6 7823 ctx->sq_data = sqd;
917257da
JA
7824 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7825 if (!ctx->sq_thread_idle)
7826 ctx->sq_thread_idle = HZ;
7827
26984fbf 7828 ret = 0;
78d7f6ba 7829 io_sq_thread_park(sqd);
26984fbf
PB
7830 /* don't attach to a dying SQPOLL thread, would be racy */
7831 if (attached && !sqd->thread) {
7832 ret = -ENXIO;
7833 } else {
7834 list_add(&ctx->sqd_list, &sqd->ctx_list);
7835 io_sqd_update_thread_idle(sqd);
7836 }
78d7f6ba
PB
7837 io_sq_thread_unpark(sqd);
7838
26984fbf
PB
7839 if (ret < 0) {
7840 io_put_sq_data(sqd);
7841 ctx->sq_data = NULL;
7842 return ret;
7843 } else if (attached) {
5aa75ed5 7844 return 0;
26984fbf 7845 }
aa06165d 7846
6c271ce2 7847 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 7848 int cpu = p->sq_thread_cpu;
6c271ce2 7849
917257da 7850 ret = -EINVAL;
44a9bd18 7851 if (cpu >= nr_cpu_ids)
e8f98f24 7852 goto err_sqpoll;
7889f44d 7853 if (!cpu_online(cpu))
e8f98f24 7854 goto err_sqpoll;
917257da 7855
37d1e2e3 7856 sqd->sq_cpu = cpu;
6c271ce2 7857 } else {
37d1e2e3 7858 sqd->sq_cpu = -1;
6c271ce2 7859 }
37d1e2e3
JA
7860
7861 sqd->task_pid = current->pid;
5c2469e0 7862 sqd->task_tgid = current->tgid;
46fe18b1
JA
7863 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
7864 if (IS_ERR(tsk)) {
7865 ret = PTR_ERR(tsk);
e8f98f24 7866 goto err_sqpoll;
6c271ce2 7867 }
97a73a0f 7868
46fe18b1 7869 sqd->thread = tsk;
97a73a0f 7870 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 7871 wake_up_new_task(tsk);
0f212204
JA
7872 if (ret)
7873 goto err;
6c271ce2
JA
7874 } else if (p->flags & IORING_SETUP_SQ_AFF) {
7875 /* Can't have SQ_AFF without SQPOLL */
7876 ret = -EINVAL;
7877 goto err;
7878 }
7879
2b188cc1
JA
7880 return 0;
7881err:
37d1e2e3 7882 io_sq_thread_finish(ctx);
2b188cc1 7883 return ret;
e8f98f24
JA
7884err_sqpoll:
7885 complete(&ctx->sq_data->exited);
7886 goto err;
2b188cc1
JA
7887}
7888
a087e2b5
BM
7889static inline void __io_unaccount_mem(struct user_struct *user,
7890 unsigned long nr_pages)
2b188cc1
JA
7891{
7892 atomic_long_sub(nr_pages, &user->locked_vm);
7893}
7894
a087e2b5
BM
7895static inline int __io_account_mem(struct user_struct *user,
7896 unsigned long nr_pages)
2b188cc1
JA
7897{
7898 unsigned long page_limit, cur_pages, new_pages;
7899
7900 /* Don't allow more pages than we can safely lock */
7901 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7902
7903 do {
7904 cur_pages = atomic_long_read(&user->locked_vm);
7905 new_pages = cur_pages + nr_pages;
7906 if (new_pages > page_limit)
7907 return -ENOMEM;
7908 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7909 new_pages) != cur_pages);
7910
7911 return 0;
7912}
7913
26bfa89e 7914static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 7915{
62e398be 7916 if (ctx->user)
a087e2b5 7917 __io_unaccount_mem(ctx->user, nr_pages);
30975825 7918
26bfa89e
JA
7919 if (ctx->mm_account)
7920 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
7921}
7922
26bfa89e 7923static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 7924{
30975825
BM
7925 int ret;
7926
62e398be 7927 if (ctx->user) {
30975825
BM
7928 ret = __io_account_mem(ctx->user, nr_pages);
7929 if (ret)
7930 return ret;
7931 }
7932
26bfa89e
JA
7933 if (ctx->mm_account)
7934 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
7935
7936 return 0;
7937}
7938
2b188cc1
JA
7939static void io_mem_free(void *ptr)
7940{
52e04ef4
MR
7941 struct page *page;
7942
7943 if (!ptr)
7944 return;
2b188cc1 7945
52e04ef4 7946 page = virt_to_head_page(ptr);
2b188cc1
JA
7947 if (put_page_testzero(page))
7948 free_compound_page(page);
7949}
7950
7951static void *io_mem_alloc(size_t size)
7952{
7953 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
26bfa89e 7954 __GFP_NORETRY | __GFP_ACCOUNT;
2b188cc1
JA
7955
7956 return (void *) __get_free_pages(gfp_flags, get_order(size));
7957}
7958
75b28aff
HV
7959static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7960 size_t *sq_offset)
7961{
7962 struct io_rings *rings;
7963 size_t off, sq_array_size;
7964
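 /*
  * Layout: struct io_rings with the CQE array appended, then (cache
  * line aligned on SMP) the u32 SQ index array. Returns SIZE_MAX on
  * overflow.
  */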
7965 off = struct_size(rings, cqes, cq_entries);
7966 if (off == SIZE_MAX)
7967 return SIZE_MAX;
7968
7969#ifdef CONFIG_SMP
7970 off = ALIGN(off, SMP_CACHE_BYTES);
7971 if (off == 0)
7972 return SIZE_MAX;
7973#endif
7974
b36200f5
DV
7975 if (sq_offset)
7976 *sq_offset = off;
7977
75b28aff
HV
7978 sq_array_size = array_size(sizeof(u32), sq_entries);
7979 if (sq_array_size == SIZE_MAX)
7980 return SIZE_MAX;
7981
7982 if (check_add_overflow(off, sq_array_size, &off))
7983 return SIZE_MAX;
7984
75b28aff
HV
7985 return off;
7986}
7987
0a96bbe4 7988static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee
JA
7989{
7990 int i, j;
7991
7992 if (!ctx->user_bufs)
7993 return -ENXIO;
7994
7995 for (i = 0; i < ctx->nr_user_bufs; i++) {
7996 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7997
7998 for (j = 0; j < imu->nr_bvecs; j++)
f1f6a7dd 7999 unpin_user_page(imu->bvec[j].bv_page);
edafccee 8000
de293938 8001 if (imu->acct_pages)
26bfa89e 8002 io_unaccount_mem(ctx, imu->acct_pages);
d4ef6475 8003 kvfree(imu->bvec);
edafccee
JA
8004 imu->nr_bvecs = 0;
8005 }
8006
8007 kfree(ctx->user_bufs);
8008 ctx->user_bufs = NULL;
8009 ctx->nr_user_bufs = 0;
8010 return 0;
8011}
8012
8013static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8014 void __user *arg, unsigned index)
8015{
8016 struct iovec __user *src;
8017
8018#ifdef CONFIG_COMPAT
8019 if (ctx->compat) {
8020 struct compat_iovec __user *ciovs;
8021 struct compat_iovec ciov;
8022
8023 ciovs = (struct compat_iovec __user *) arg;
8024 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8025 return -EFAULT;
8026
d55e5f5b 8027 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
8028 dst->iov_len = ciov.iov_len;
8029 return 0;
8030 }
8031#endif
8032 src = (struct iovec __user *) arg;
8033 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8034 return -EFAULT;
8035 return 0;
8036}
8037
de293938
JA
8038/*
8039 * Not super efficient, but this is only done at registration time. And we do cache
8040 * the last compound head, so generally we'll only do a full search if we don't
8041 * match that one.
8042 *
8043 * We check if the given compound head page has already been accounted, to
8044 * avoid double accounting it. This allows us to account the full size of the
8045 * page, not just the constituent pages of a huge page.
8046 */
8047static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8048 int nr_pages, struct page *hpage)
8049{
8050 int i, j;
8051
8052 /* check current page array */
8053 for (i = 0; i < nr_pages; i++) {
8054 if (!PageCompound(pages[i]))
8055 continue;
8056 if (compound_head(pages[i]) == hpage)
8057 return true;
8058 }
8059
8060 /* check previously registered pages */
8061 for (i = 0; i < ctx->nr_user_bufs; i++) {
8062 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8063
8064 for (j = 0; j < imu->nr_bvecs; j++) {
8065 if (!PageCompound(imu->bvec[j].bv_page))
8066 continue;
8067 if (compound_head(imu->bvec[j].bv_page) == hpage)
8068 return true;
8069 }
8070 }
8071
8072 return false;
8073}
8074
8075static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8076 int nr_pages, struct io_mapped_ubuf *imu,
8077 struct page **last_hpage)
8078{
8079 int i, ret;
8080
8081 for (i = 0; i < nr_pages; i++) {
8082 if (!PageCompound(pages[i])) {
8083 imu->acct_pages++;
8084 } else {
8085 struct page *hpage;
8086
8087 hpage = compound_head(pages[i]);
8088 if (hpage == *last_hpage)
8089 continue;
8090 *last_hpage = hpage;
8091 if (headpage_already_acct(ctx, pages, i, hpage))
8092 continue;
8093 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8094 }
8095 }
8096
8097 if (!imu->acct_pages)
8098 return 0;
8099
26bfa89e 8100 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
8101 if (ret)
8102 imu->acct_pages = 0;
8103 return ret;
8104}
8105
0a96bbe4
BM
8106static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8107 struct io_mapped_ubuf *imu,
8108 struct page **last_hpage)
edafccee
JA
8109{
8110 struct vm_area_struct **vmas = NULL;
8111 struct page **pages = NULL;
0a96bbe4
BM
8112 unsigned long off, start, end, ubuf;
8113 size_t size;
8114 int ret, pret, nr_pages, i;
8115
8116 ubuf = (unsigned long) iov->iov_base;
8117 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8118 start = ubuf >> PAGE_SHIFT;
8119 nr_pages = end - start;
8120
8121 ret = -ENOMEM;
8122
8123 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8124 if (!pages)
8125 goto done;
8126
8127 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8128 GFP_KERNEL);
8129 if (!vmas)
8130 goto done;
edafccee 8131
0a96bbe4
BM
8132 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8133 GFP_KERNEL);
8134 if (!imu->bvec)
8135 goto done;
8136
8137 ret = 0;
8138 mmap_read_lock(current->mm);
8139 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8140 pages, vmas);
8141 if (pret == nr_pages) {
8142 /* don't support file backed memory */
8143 for (i = 0; i < nr_pages; i++) {
8144 struct vm_area_struct *vma = vmas[i];
8145
8146 if (vma->vm_file &&
8147 !is_file_hugepages(vma->vm_file)) {
8148 ret = -EOPNOTSUPP;
8149 break;
8150 }
8151 }
8152 } else {
8153 ret = pret < 0 ? pret : -EFAULT;
8154 }
8155 mmap_read_unlock(current->mm);
8156 if (ret) {
8157 /*
8158 * if we did partial map, or found file backed vmas,
8159 * release any pages we did get
8160 */
8161 if (pret > 0)
8162 unpin_user_pages(pages, pret);
8163 kvfree(imu->bvec);
8164 goto done;
8165 }
8166
8167 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8168 if (ret) {
8169 unpin_user_pages(pages, pret);
8170 kvfree(imu->bvec);
8171 goto done;
8172 }
8173
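 /*
  * Build the bvec array covering the user buffer: the first entry
  * starts at the sub-page offset, the rest are page aligned.
  */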
8174 off = ubuf & ~PAGE_MASK;
8175 size = iov->iov_len;
8176 for (i = 0; i < nr_pages; i++) {
8177 size_t vec_len;
8178
8179 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8180 imu->bvec[i].bv_page = pages[i];
8181 imu->bvec[i].bv_len = vec_len;
8182 imu->bvec[i].bv_offset = off;
8183 off = 0;
8184 size -= vec_len;
8185 }
8186 /* store original address for later verification */
8187 imu->ubuf = ubuf;
8188 imu->len = iov->iov_len;
8189 imu->nr_bvecs = nr_pages;
8190 ret = 0;
8191done:
8192 kvfree(pages);
8193 kvfree(vmas);
8194 return ret;
8195}
8196
2b358604 8197static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 8198{
edafccee
JA
8199 if (ctx->user_bufs)
8200 return -EBUSY;
8201 if (!nr_args || nr_args > UIO_MAXIOV)
8202 return -EINVAL;
8203
8204 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8205 GFP_KERNEL);
8206 if (!ctx->user_bufs)
8207 return -ENOMEM;
8208
2b358604
BM
8209 return 0;
8210}
edafccee 8211
2b358604
BM
8212static int io_buffer_validate(struct iovec *iov)
8213{
8214 /*
8215 * Don't impose further limits on the size and buffer
8216 * constraints here, we'll -EINVAL later when IO is
8217 * submitted if they are wrong.
8218 */
8219 if (!iov->iov_base || !iov->iov_len)
8220 return -EFAULT;
edafccee 8221
2b358604
BM
8222 /* arbitrary limit, but we need something */
8223 if (iov->iov_len > SZ_1G)
8224 return -EFAULT;
edafccee 8225
2b358604
BM
8226 return 0;
8227}
edafccee 8228
2b358604
BM
8229static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8230 unsigned int nr_args)
8231{
8232 int i, ret;
8233 struct iovec iov;
8234 struct page *last_hpage = NULL;
edafccee 8235
2b358604
BM
8236 ret = io_buffers_map_alloc(ctx, nr_args);
8237 if (ret)
8238 return ret;
edafccee 8239
edafccee
JA
8240 for (i = 0; i < nr_args; i++) {
8241 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
edafccee 8242
edafccee
JA
8243 ret = io_copy_iov(ctx, &iov, arg, i);
8244 if (ret)
0a96bbe4 8245 break;
de293938 8246
2b358604
BM
8247 ret = io_buffer_validate(&iov);
8248 if (ret)
0a96bbe4 8249 break;
edafccee 8250
0a96bbe4
BM
8251 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8252 if (ret)
8253 break;
edafccee
JA
8254
8255 ctx->nr_user_bufs++;
8256 }
0a96bbe4
BM
8257
8258 if (ret)
8259 io_sqe_buffers_unregister(ctx);
8260
edafccee
JA
8261 return ret;
8262}
8263
9b402849
JA
8264static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8265{
8266 __s32 __user *fds = arg;
8267 int fd;
8268
8269 if (ctx->cq_ev_fd)
8270 return -EBUSY;
8271
8272 if (copy_from_user(&fd, fds, sizeof(*fds)))
8273 return -EFAULT;
8274
8275 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8276 if (IS_ERR(ctx->cq_ev_fd)) {
8277 int ret = PTR_ERR(ctx->cq_ev_fd);
8278 ctx->cq_ev_fd = NULL;
8279 return ret;
8280 }
8281
8282 return 0;
8283}
8284
8285static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8286{
8287 if (ctx->cq_ev_fd) {
8288 eventfd_ctx_put(ctx->cq_ev_fd);
8289 ctx->cq_ev_fd = NULL;
8290 return 0;
8291 }
8292
8293 return -ENXIO;
8294}
8295
5a2e745d
JA
8296static int __io_destroy_buffers(int id, void *p, void *data)
8297{
8298 struct io_ring_ctx *ctx = data;
8299 struct io_buffer *buf = p;
8300
067524e9 8301 __io_remove_buffers(ctx, buf, id, -1U);
5a2e745d
JA
8302 return 0;
8303}
8304
8305static void io_destroy_buffers(struct io_ring_ctx *ctx)
8306{
8307 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8308 idr_destroy(&ctx->io_buffer_idr);
8309}
8310
68e68ee6 8311static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
1b4c351f 8312{
68e68ee6 8313 struct io_kiocb *req, *nxt;
1b4c351f 8314
68e68ee6
JA
8315 list_for_each_entry_safe(req, nxt, list, compl.list) {
8316 if (tsk && req->task != tsk)
8317 continue;
1b4c351f
JA
8318 list_del(&req->compl.list);
8319 kmem_cache_free(req_cachep, req);
8320 }
8321}
8322
4010fec4 8323static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 8324{
bf019da7 8325 struct io_submit_state *submit_state = &ctx->submit_state;
e5547d2c 8326 struct io_comp_state *cs = &ctx->submit_state.comp;
bf019da7 8327
9a4fdbd8
JA
8328 mutex_lock(&ctx->uring_lock);
8329
8e5c66c4 8330 if (submit_state->free_reqs) {
9a4fdbd8
JA
8331 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8332 submit_state->reqs);
8e5c66c4
PB
8333 submit_state->free_reqs = 0;
8334 }
9a4fdbd8
JA
8335
8336 spin_lock_irq(&ctx->completion_lock);
e5547d2c
PB
8337 list_splice_init(&cs->locked_free_list, &cs->free_list);
8338 cs->locked_free_nr = 0;
9a4fdbd8
JA
8339 spin_unlock_irq(&ctx->completion_lock);
8340
e5547d2c
PB
8341 io_req_cache_free(&cs->free_list, NULL);
8342
9a4fdbd8
JA
8343 mutex_unlock(&ctx->uring_lock);
8344}
8345
2b188cc1
JA
8346static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8347{
04fc6c80
PB
8348 /*
8349 * Some tasks may still be using the context even when all refs and requests have been put,
8350 * and they are free to do so while still holding uring_lock, see
8351 * __io_req_task_submit(). Wait for them to finish.
8352 */
8353 mutex_lock(&ctx->uring_lock);
8354 mutex_unlock(&ctx->uring_lock);
8355
37d1e2e3 8356 io_sq_thread_finish(ctx);
0a96bbe4 8357 io_sqe_buffers_unregister(ctx);
2aede0e4 8358
37d1e2e3 8359 if (ctx->mm_account) {
2aede0e4
JA
8360 mmdrop(ctx->mm_account);
8361 ctx->mm_account = NULL;
30975825 8362 }
def596e9 8363
8bad28d8 8364 mutex_lock(&ctx->uring_lock);
6b06314c 8365 io_sqe_files_unregister(ctx);
8bad28d8 8366 mutex_unlock(&ctx->uring_lock);
9b402849 8367 io_eventfd_unregister(ctx);
5a2e745d 8368 io_destroy_buffers(ctx);
def596e9 8369
2b188cc1 8370#if defined(CONFIG_UNIX)
355e8d26
EB
8371 if (ctx->ring_sock) {
8372 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 8373 sock_release(ctx->ring_sock);
355e8d26 8374 }
2b188cc1
JA
8375#endif
8376
75b28aff 8377 io_mem_free(ctx->rings);
2b188cc1 8378 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
8379
8380 percpu_ref_exit(&ctx->refs);
2b188cc1 8381 free_uid(ctx->user);
4010fec4 8382 io_req_caches_free(ctx);
e941894e
JA
8383 if (ctx->hash_map)
8384 io_wq_put_hash(ctx->hash_map);
78076bb6 8385 kfree(ctx->cancel_hash);
2b188cc1
JA
8386 kfree(ctx);
8387}
8388
8389static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8390{
8391 struct io_ring_ctx *ctx = file->private_data;
8392 __poll_t mask = 0;
8393
8394 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
8395 /*
8396 * synchronizes with barrier from wq_has_sleeper call in
8397 * io_commit_cqring
8398 */
2b188cc1 8399 smp_rmb();
90554200 8400 if (!io_sqring_full(ctx))
2b188cc1 8401 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
8402
8403 /*
8404 * Don't flush cqring overflow list here, just do a simple check.
8405 * Otherwise there could possibly be an ABBA deadlock:
8406 *          CPU0                    CPU1
8407 *          ----                    ----
8408 *     lock(&ctx->uring_lock);
8409 *                                  lock(&ep->mtx);
8410 *                                  lock(&ctx->uring_lock);
8411 *     lock(&ep->mtx);
8412 *
8413 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8414 * pushes them to do the flush.
8415 */
8416 if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
2b188cc1
JA
8417 mask |= EPOLLIN | EPOLLRDNORM;
8418
8419 return mask;
8420}
8421
8422static int io_uring_fasync(int fd, struct file *file, int on)
8423{
8424 struct io_ring_ctx *ctx = file->private_data;
8425
8426 return fasync_helper(fd, file, on, &ctx->cq_fasync);
8427}
8428
0bead8cd 8429static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 8430{
4379bf8b 8431 const struct cred *creds;
071698e1 8432
61cf9370 8433 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
8434 if (creds) {
8435 put_cred(creds);
0bead8cd 8436 return 0;
1e6fa521 8437 }
0bead8cd
YD
8438
8439 return -EINVAL;
8440}
8441
ba50a036 8442static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
7c25c0d1 8443{
28c4721b 8444 struct callback_head *work, *next;
ba50a036 8445 bool executed = false;
7c25c0d1
JA
8446
8447 do {
28c4721b 8448 work = xchg(&ctx->exit_task_work, NULL);
7c25c0d1
JA
8449 if (!work)
8450 break;
8451
8452 do {
8453 next = work->next;
8454 work->func(work);
8455 work = next;
8456 cond_resched();
8457 } while (work);
ba50a036 8458 executed = true;
7c25c0d1 8459 } while (1);
ba50a036
PB
8460
8461 return executed;
7c25c0d1
JA
8462}
8463
d56d938b
PB
8464struct io_tctx_exit {
8465 struct callback_head task_work;
8466 struct completion completion;
baf186c4 8467 struct io_ring_ctx *ctx;
d56d938b
PB
8468};
8469
8470static void io_tctx_exit_cb(struct callback_head *cb)
8471{
8472 struct io_uring_task *tctx = current->io_uring;
8473 struct io_tctx_exit *work;
8474
8475 work = container_of(cb, struct io_tctx_exit, task_work);
8476 /*
8477 * When @in_idle, we're in cancellation and it's racy to remove the
8478 * node. It'll be removed by the end of cancellation, just ignore it.
8479 */
8480 if (!atomic_read(&tctx->in_idle))
baf186c4 8481 io_uring_del_task_file((unsigned long)work->ctx);
d56d938b
PB
8482 complete(&work->completion);
8483}
8484
85faa7b8
JA
8485static void io_ring_exit_work(struct work_struct *work)
8486{
d56d938b 8487 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 8488 unsigned long timeout = jiffies + HZ * 60 * 5;
d56d938b
PB
8489 struct io_tctx_exit exit;
8490 struct io_tctx_node *node;
8491 int ret;
85faa7b8 8492
56952e91
JA
8493 /*
8494 * If we're doing polled IO and end up having requests being
8495 * submitted async (out-of-line), then completions can come in while
8496 * we're waiting for refs to drop. We need to reap these manually,
8497 * as nobody else will be looking for them.
8498 */
b2edc0a7 8499 do {
9936c7c2 8500 io_uring_try_cancel_requests(ctx, NULL, NULL);
b5bb3a24
PB
8501
8502 WARN_ON_ONCE(time_after(jiffies, timeout));
b2edc0a7 8503 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
d56d938b
PB
8504
8505 mutex_lock(&ctx->uring_lock);
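 /*
  * Ask every task still attached to this ring, via task_work, to drop
  * its node for this ctx, and wait for each one to complete.
  */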
8506 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
8507 WARN_ON_ONCE(time_after(jiffies, timeout));
8508
d56d938b
PB
8509 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
8510 ctx_node);
baf186c4 8511 exit.ctx = ctx;
d56d938b
PB
8512 init_completion(&exit.completion);
8513 init_task_work(&exit.task_work, io_tctx_exit_cb);
8514 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
8515 if (WARN_ON_ONCE(ret))
8516 continue;
8517 wake_up_process(node->task);
8518
8519 mutex_unlock(&ctx->uring_lock);
8520 wait_for_completion(&exit.completion);
8521 cond_resched();
8522 mutex_lock(&ctx->uring_lock);
8523 }
8524 mutex_unlock(&ctx->uring_lock);
8525
85faa7b8
JA
8526 io_ring_ctx_free(ctx);
8527}
8528
2b188cc1
JA
8529static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8530{
61cf9370
MWO
8531 unsigned long index;
8532 struct creds *creds;
8533
2b188cc1
JA
8534 mutex_lock(&ctx->uring_lock);
8535 percpu_ref_kill(&ctx->refs);
cda286f0
PB
8536 /* if force is set, the ring is going away. always drop after that */
8537 ctx->cq_overflow_flushed = 1;
634578f8 8538 if (ctx->rings)
6c503150 8539 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
61cf9370
MWO
8540 xa_for_each(&ctx->personalities, index, creds)
8541 io_unregister_personality(ctx, index);
2b188cc1
JA
8542 mutex_unlock(&ctx->uring_lock);
8543
6b81928d
PB
8544 io_kill_timeouts(ctx, NULL, NULL);
8545 io_poll_remove_all(ctx, NULL, NULL);
561fb04a 8546
15dff286 8547 /* if we failed setting up the ctx, we might not have any rings */
b2edc0a7 8548 io_iopoll_try_reap_events(ctx);
309fc03a 8549
85faa7b8 8550 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
8551 /*
8552 * Use system_unbound_wq to avoid spawning tons of event kworkers
8553 * if we're exiting a ton of rings at the same time. It just adds
8554 * noise and overhead, there's no discernable change in runtime
8555 * over using system_wq.
8556 */
8557 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
8558}
8559
8560static int io_uring_release(struct inode *inode, struct file *file)
8561{
8562 struct io_ring_ctx *ctx = file->private_data;
8563
8564 file->private_data = NULL;
8565 io_ring_ctx_wait_and_kill(ctx);
8566 return 0;
8567}
8568
f6edbabb
PB
8569struct io_task_cancel {
8570 struct task_struct *task;
8571 struct files_struct *files;
8572};
f254ac04 8573
f6edbabb 8574static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 8575{
9a472ef7 8576 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 8577 struct io_task_cancel *cancel = data;
9a472ef7
PB
8578 bool ret;
8579
f6edbabb 8580 if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
9a472ef7
PB
8581 unsigned long flags;
8582 struct io_ring_ctx *ctx = req->ctx;
8583
8584 /* protect against races with linked timeouts */
8585 spin_lock_irqsave(&ctx->completion_lock, flags);
f6edbabb 8586 ret = io_match_task(req, cancel->task, cancel->files);
9a472ef7
PB
8587 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8588 } else {
f6edbabb 8589 ret = io_match_task(req, cancel->task, cancel->files);
9a472ef7
PB
8590 }
8591 return ret;
b711d4ea
JA
8592}
8593
e1915f76 8594static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
ef9865a4 8595 struct task_struct *task,
b7ddce3c
PB
8596 struct files_struct *files)
8597{
e1915f76 8598 struct io_defer_entry *de;
b7ddce3c
PB
8599 LIST_HEAD(list);
8600
8601 spin_lock_irq(&ctx->completion_lock);
8602 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
08d23634 8603 if (io_match_task(de->req, task, files)) {
b7ddce3c
PB
8604 list_cut_position(&list, &ctx->defer_list, &de->list);
8605 break;
8606 }
8607 }
8608 spin_unlock_irq(&ctx->completion_lock);
e1915f76
PB
8609 if (list_empty(&list))
8610 return false;
b7ddce3c
PB
8611
8612 while (!list_empty(&list)) {
8613 de = list_first_entry(&list, struct io_defer_entry, list);
8614 list_del_init(&de->list);
8615 req_set_fail_links(de->req);
8616 io_put_req(de->req);
8617 io_req_complete(de->req, -ECANCELED);
8618 kfree(de);
8619 }
e1915f76 8620 return true;
b7ddce3c
PB
8621}
8622
1b00764f
PB
8623static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8624{
8625 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8626
8627 return req->ctx == data;
8628}
8629
8630static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
8631{
8632 struct io_tctx_node *node;
8633 enum io_wq_cancel cret;
8634 bool ret = false;
8635
8636 mutex_lock(&ctx->uring_lock);
8637 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
8638 struct io_uring_task *tctx = node->task->io_uring;
8639
8640 /*
8641 * io_wq will stay alive while we hold uring_lock, because it's
8642 * killed after ctx nodes, which requires to take the lock.
8643 */
8644 if (!tctx || !tctx->io_wq)
8645 continue;
8646 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
8647 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8648 }
8649 mutex_unlock(&ctx->uring_lock);
8650
8651 return ret;
8652}
8653
9936c7c2
PB
8654static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8655 struct task_struct *task,
8656 struct files_struct *files)
8657{
8658 struct io_task_cancel cancel = { .task = task, .files = files, };
1b00764f 8659 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2
PB
8660
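 /*
  * Keep cancelling from every source (io-wq, deferred, poll, timeouts,
  * task_work) until a full pass finds nothing left to cancel.
  */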
8661 while (1) {
8662 enum io_wq_cancel cret;
8663 bool ret = false;
8664
1b00764f
PB
8665 if (!task) {
8666 ret |= io_uring_try_cancel_iowq(ctx);
8667 } else if (tctx && tctx->io_wq) {
8668 /*
8669 * Cancels requests of all rings, not only @ctx, but
8670 * it's fine as the task is in exit/exec.
8671 */
5aa75ed5 8672 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
8673 &cancel, true);
8674 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8675 }
8676
8677 /* SQPOLL thread does its own polling */
d052d1d6
JA
8678 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) ||
8679 (ctx->sq_data && ctx->sq_data->thread == current)) {
9936c7c2
PB
8680 while (!list_empty_careful(&ctx->iopoll_list)) {
8681 io_iopoll_try_reap_events(ctx);
8682 ret = true;
8683 }
8684 }
8685
e1915f76 8686 ret |= io_cancel_defer_files(ctx, task, files);
9936c7c2
PB
8687 ret |= io_poll_remove_all(ctx, task, files);
8688 ret |= io_kill_timeouts(ctx, task, files);
8689 ret |= io_run_task_work();
ba50a036 8690 ret |= io_run_ctx_fallback(ctx);
9936c7c2
PB
8691 io_cqring_overflow_flush(ctx, true, task, files);
8692 if (!ret)
8693 break;
8694 cond_resched();
8695 }
8696}
8697
ca70f00b
PB
8698static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8699 struct task_struct *task,
8700 struct files_struct *files)
8701{
8702 struct io_kiocb *req;
8703 int cnt = 0;
8704
8705 spin_lock_irq(&ctx->inflight_lock);
8706 list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8707 cnt += io_match_task(req, task, files);
8708 spin_unlock_irq(&ctx->inflight_lock);
8709 return cnt;
8710}
8711
b52fda00 8712static void io_uring_cancel_files(struct io_ring_ctx *ctx,
df9923f9 8713 struct task_struct *task,
fcb323cc
JA
8714 struct files_struct *files)
8715{
fcb323cc 8716 while (!list_empty_careful(&ctx->inflight_list)) {
d8f1b971 8717 DEFINE_WAIT(wait);
ca70f00b 8718 int inflight;
fcb323cc 8719
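 /*
  * Snapshot the inflight count, run a cancellation pass, then sleep
  * until the count changes so we don't busy-loop while requests drain.
  */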
ca70f00b
PB
8720 inflight = io_uring_count_inflight(ctx, task, files);
8721 if (!inflight)
fcb323cc 8722 break;
f6edbabb 8723
9936c7c2 8724 io_uring_try_cancel_requests(ctx, task, files);
ca70f00b
PB
8725
8726 prepare_to_wait(&task->io_uring->wait, &wait,
8727 TASK_UNINTERRUPTIBLE);
8728 if (inflight == io_uring_count_inflight(ctx, task, files))
8729 schedule();
c98de08c 8730 finish_wait(&task->io_uring->wait, &wait);
0f212204 8731 }
0f212204
JA
8732}
8733
0f212204
JA
8734/*
8735 * Note that this task has used io_uring. We use it for cancelation purposes.
8736 */
baf186c4 8737static int io_uring_add_task_file(struct io_ring_ctx *ctx)
0f212204 8738{
236434c3 8739 struct io_uring_task *tctx = current->io_uring;
13bf43f5 8740 struct io_tctx_node *node;
a528b04e 8741 int ret;
236434c3
MWO
8742
8743 if (unlikely(!tctx)) {
5aa75ed5 8744 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
8745 if (unlikely(ret))
8746 return ret;
236434c3 8747 tctx = current->io_uring;
0f212204 8748 }
baf186c4
PB
8749 if (tctx->last != ctx) {
8750 void *old = xa_load(&tctx->xa, (unsigned long)ctx);
0f212204 8751
236434c3 8752 if (!old) {
13bf43f5
PB
8753 node = kmalloc(sizeof(*node), GFP_KERNEL);
8754 if (!node)
8755 return -ENOMEM;
8756 node->ctx = ctx;
13bf43f5
PB
8757 node->task = current;
8758
baf186c4 8759 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
13bf43f5 8760 node, GFP_KERNEL));
a528b04e 8761 if (ret) {
13bf43f5 8762 kfree(node);
a528b04e
PB
8763 return ret;
8764 }
13bf43f5
PB
8765
8766 mutex_lock(&ctx->uring_lock);
8767 list_add(&node->ctx_node, &ctx->tctx_list);
8768 mutex_unlock(&ctx->uring_lock);
0f212204 8769 }
baf186c4 8770 tctx->last = ctx;
0f212204 8771 }
0f212204
JA
8772 return 0;
8773}
8774
8775/*
8776 * Remove this io_uring_file -> task mapping.
8777 */
2941267b 8778static void io_uring_del_task_file(unsigned long index)
0f212204
JA
8779{
8780 struct io_uring_task *tctx = current->io_uring;
13bf43f5 8781 struct io_tctx_node *node;
2941267b 8782
eebd2e37
PB
8783 if (!tctx)
8784 return;
13bf43f5
PB
8785 node = xa_erase(&tctx->xa, index);
8786 if (!node)
2941267b 8787 return;
0f212204 8788
13bf43f5
PB
8789 WARN_ON_ONCE(current != node->task);
8790 WARN_ON_ONCE(list_empty(&node->ctx_node));
8791
8792 mutex_lock(&node->ctx->uring_lock);
8793 list_del(&node->ctx_node);
8794 mutex_unlock(&node->ctx->uring_lock);
8795
baf186c4 8796 if (tctx->last == node->ctx)
0f212204 8797 tctx->last = NULL;
13bf43f5 8798 kfree(node);
0f212204
JA
8799}
8800
8452d4a6 8801static void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 8802{
13bf43f5 8803 struct io_tctx_node *node;
de7f1d9e
PB
8804 unsigned long index;
8805
13bf43f5 8806 xa_for_each(&tctx->xa, index, node)
2941267b 8807 io_uring_del_task_file(index);
8452d4a6
PB
8808 if (tctx->io_wq) {
8809 io_wq_put_and_exit(tctx->io_wq);
8810 tctx->io_wq = NULL;
8811 }
de7f1d9e
PB
8812}
8813
521d6a73
PB
8814static s64 tctx_inflight(struct io_uring_task *tctx)
8815{
8816 return percpu_counter_sum(&tctx->inflight);
8817}
8818
8819static void io_sqpoll_cancel_cb(struct callback_head *cb)
8820{
8821 struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
8822 struct io_ring_ctx *ctx = work->ctx;
8823 struct io_sq_data *sqd = ctx->sq_data;
8824
8825 if (sqd->thread)
8826 io_uring_cancel_sqpoll(ctx);
8827 complete(&work->completion);
8828}
8829
8830static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
8831{
8832 struct io_sq_data *sqd = ctx->sq_data;
8833 struct io_tctx_exit work = { .ctx = ctx, };
8834 struct task_struct *task;
8835
8836 io_sq_thread_park(sqd);
8837 list_del_init(&ctx->sqd_list);
8838 io_sqd_update_thread_idle(sqd);
8839 task = sqd->thread;
8840 if (task) {
8841 init_completion(&work.completion);
8842 init_task_work(&work.task_work, io_sqpoll_cancel_cb);
8843 WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
8844 wake_up_process(task);
8845 }
8846 io_sq_thread_unpark(sqd);
8847
8848 if (task)
8849 wait_for_completion(&work.completion);
8850}
8851
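The pairing above (queue a callback with task_work_add(), wake the target task, then block on a struct completion) is how the SQPOLL task is made to cancel its own requests. As a rough userspace analogy only, not kernel code, the same hand-off-and-wait shape can be sketched with POSIX threads; all names below (struct handoff, cancel_cb, target_thread) are invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct handoff {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	void (*fn)(void);	/* queued callback, NULL while none is pending */
	bool done;		/* stands in for the struct completion */
};

static void cancel_cb(void)
{
	puts("target thread: running the cancellation in its own context");
}

static void *target_thread(void *arg)
{
	struct handoff *h = arg;

	pthread_mutex_lock(&h->lock);
	while (!h->fn)				/* wait for queued work */
		pthread_cond_wait(&h->cond, &h->lock);
	h->fn();				/* run it in this thread's context */
	h->done = true;				/* complete(&work->completion) analog */
	pthread_cond_broadcast(&h->cond);
	pthread_mutex_unlock(&h->lock);
	return NULL;
}

int main(void)
{
	struct handoff h = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, target_thread, &h);

	pthread_mutex_lock(&h.lock);
	h.fn = cancel_cb;			/* task_work_add() analog */
	pthread_cond_broadcast(&h.cond);	/* wake_up_process() analog */
	while (!h.done)				/* wait_for_completion() analog */
		pthread_cond_wait(&h.cond, &h.lock);
	pthread_mutex_unlock(&h.lock);

	pthread_join(tid, NULL);
	return 0;
}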
0f212204
JA
8852void __io_uring_files_cancel(struct files_struct *files)
8853{
8854 struct io_uring_task *tctx = current->io_uring;
13bf43f5 8855 struct io_tctx_node *node;
ce765372 8856 unsigned long index;
0f212204
JA
8857
8858 /* make sure overflow events are dropped */
fdaf083c 8859 atomic_inc(&tctx->in_idle);
521d6a73
PB
8860 xa_for_each(&tctx->xa, index, node) {
8861 struct io_ring_ctx *ctx = node->ctx;
8862
8863 if (ctx->sq_data) {
8864 io_sqpoll_cancel_sync(ctx);
8865 continue;
8866 }
8867 io_uring_cancel_files(ctx, current, files);
8868 if (!files)
8869 io_uring_try_cancel_requests(ctx, current, NULL);
8870 }
fdaf083c 8871 atomic_dec(&tctx->in_idle);
de7f1d9e 8872
8452d4a6
PB
8873 if (files)
8874 io_uring_clean_tctx(tctx);
fdaf083c
JA
8875}
8876
521d6a73 8877/* should only be called by SQPOLL task */
0e9ddb39
PB
8878static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
8879{
37d1e2e3 8880 struct io_sq_data *sqd = ctx->sq_data;
521d6a73 8881 struct io_uring_task *tctx = current->io_uring;
0e9ddb39
PB
8882 s64 inflight;
8883 DEFINE_WAIT(wait);
fdaf083c 8884
521d6a73
PB
8885 WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
8886
0e9ddb39
PB
8887 atomic_inc(&tctx->in_idle);
8888 do {
8889 /* read completions before cancelations */
8890 inflight = tctx_inflight(tctx);
8891 if (!inflight)
8892 break;
521d6a73 8893 io_uring_try_cancel_requests(ctx, current, NULL);
fdaf083c 8894
0e9ddb39
PB
8895 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8896 /*
8897 * If we've seen completions, retry without waiting. This
8898 * avoids a race where a completion comes in before we did
8899 * prepare_to_wait().
8900 */
8901 if (inflight == tctx_inflight(tctx))
8902 schedule();
8903 finish_wait(&tctx->wait, &wait);
8904 } while (1);
8905 atomic_dec(&tctx->in_idle);
0f212204
JA
8906}
8907
0f212204
JA
8908/*
8909 * Find any io_uring fd that this task has registered or done IO on, and cancel
8910 * requests.
8911 */
8912void __io_uring_task_cancel(void)
8913{
8914 struct io_uring_task *tctx = current->io_uring;
8915 DEFINE_WAIT(wait);
d8a6df10 8916 s64 inflight;
0f212204
JA
8917
8918 /* make sure overflow events are dropped */
fdaf083c 8919 atomic_inc(&tctx->in_idle);
d8a6df10 8920 do {
0f212204 8921 /* read completions before cancelations */
fdaf083c 8922 inflight = tctx_inflight(tctx);
d8a6df10
JA
8923 if (!inflight)
8924 break;
0f212204
JA
8925 __io_uring_files_cancel(NULL);
8926
8927 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8928
8929 /*
a1bb3cd5
PB
8930 * If we've seen completions, retry without waiting. This
8931 * avoids a race where a completion comes in before we did
8932 * prepare_to_wait().
0f212204 8933 */
a1bb3cd5
PB
8934 if (inflight == tctx_inflight(tctx))
8935 schedule();
f57555ed 8936 finish_wait(&tctx->wait, &wait);
d8a6df10 8937 } while (1);
0f212204 8938
fdaf083c 8939 atomic_dec(&tctx->in_idle);
de7f1d9e 8940
8452d4a6
PB
8941 io_uring_clean_tctx(tctx);
8942 /* all current's requests should be gone, we can kill tctx */
8943 __io_uring_free(current);
44e728b8
PB
8944}
8945
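The cancel loops above all follow the same shape: snapshot the inflight count, try to cancel, put ourselves on the wait queue with prepare_to_wait(), and only sleep if the count has not moved since the snapshot, so a completion racing with the check cannot be lost. A userspace analogy (illustrative only; struct inflight, complete_one and cancel_until_idle are invented names) gets the same guarantee from the mutex that protects the condition variable:

#include <pthread.h>

struct inflight {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	long count;		/* tctx_inflight() stand-in */
};

/* Completion side: retire one request and wake any cancelling waiter. */
void complete_one(struct inflight *in)
{
	pthread_mutex_lock(&in->lock);
	in->count--;
	pthread_cond_broadcast(&in->cond);
	pthread_mutex_unlock(&in->lock);
}

/* Cancel side: keep cancelling until nothing is left in flight. */
void cancel_until_idle(struct inflight *in, void (*try_cancel)(void))
{
	pthread_mutex_lock(&in->lock);
	while (in->count) {
		long snapshot = in->count;

		pthread_mutex_unlock(&in->lock);
		try_cancel();		/* io_uring_try_cancel_requests() stand-in */
		pthread_mutex_lock(&in->lock);

		/* Sleep only if no completion arrived since the snapshot. */
		if (in->count == snapshot)
			pthread_cond_wait(&in->cond, &in->lock);
	}
	pthread_mutex_unlock(&in->lock);
}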
6c5c240e
RP
8946static void *io_uring_validate_mmap_request(struct file *file,
8947 loff_t pgoff, size_t sz)
2b188cc1 8948{
2b188cc1 8949 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 8950 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
8951 struct page *page;
8952 void *ptr;
8953
8954 switch (offset) {
8955 case IORING_OFF_SQ_RING:
75b28aff
HV
8956 case IORING_OFF_CQ_RING:
8957 ptr = ctx->rings;
2b188cc1
JA
8958 break;
8959 case IORING_OFF_SQES:
8960 ptr = ctx->sq_sqes;
8961 break;
2b188cc1 8962 default:
6c5c240e 8963 return ERR_PTR(-EINVAL);
2b188cc1
JA
8964 }
8965
8966 page = virt_to_head_page(ptr);
a50b854e 8967 if (sz > page_size(page))
6c5c240e
RP
8968 return ERR_PTR(-EINVAL);
8969
8970 return ptr;
8971}
8972
8973#ifdef CONFIG_MMU
8974
8975static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8976{
8977 size_t sz = vma->vm_end - vma->vm_start;
8978 unsigned long pfn;
8979 void *ptr;
8980
8981 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8982 if (IS_ERR(ptr))
8983 return PTR_ERR(ptr);
2b188cc1
JA
8984
8985 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8986 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8987}
8988
6c5c240e
RP
8989#else /* !CONFIG_MMU */
8990
8991static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8992{
8993 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8994}
8995
8996static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8997{
8998 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8999}
9000
9001static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9002 unsigned long addr, unsigned long len,
9003 unsigned long pgoff, unsigned long flags)
9004{
9005 void *ptr;
9006
9007 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9008 if (IS_ERR(ptr))
9009 return PTR_ERR(ptr);
9010
9011 return (unsigned long) ptr;
9012}
9013
9014#endif /* !CONFIG_MMU */
9015
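For reference, a minimal userspace sketch of the other side of the mmap offsets validated above, assuming the uapi definitions from <linux/io_uring.h> (IORING_OFF_SQ_RING, IORING_OFF_CQ_RING, IORING_OFF_SQES) and a params structure filled in by io_uring_setup(); the helper name map_rings() is invented for illustration:

#include <linux/io_uring.h>
#include <sys/mman.h>
#include <unistd.h>

int map_rings(int ring_fd, const struct io_uring_params *p,
	      void **sq_ring, void **cq_ring, struct io_uring_sqe **sqes)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	/* With IORING_FEAT_SINGLE_MMAP both rings share one mapping. */
	if ((p->features & IORING_FEAT_SINGLE_MMAP) && cq_sz > sq_sz)
		sq_sz = cq_sz;

	*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	if (*sq_ring == MAP_FAILED)
		return -1;

	if (p->features & IORING_FEAT_SINGLE_MMAP) {
		*cq_ring = *sq_ring;
	} else {
		*cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, ring_fd,
				IORING_OFF_CQ_RING);
		if (*cq_ring == MAP_FAILED)
			return -1;
	}

	*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		     ring_fd, IORING_OFF_SQES);
	return *sqes == MAP_FAILED ? -1 : 0;
}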
d9d05217 9016static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
9017{
9018 DEFINE_WAIT(wait);
9019
9020 do {
9021 if (!io_sqring_full(ctx))
9022 break;
90554200
JA
9023 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9024
9025 if (!io_sqring_full(ctx))
9026 break;
90554200
JA
9027 schedule();
9028 } while (!signal_pending(current));
9029
9030 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 9031 return 0;
90554200
JA
9032}
9033
c73ebb68
HX
9034static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9035 struct __kernel_timespec __user **ts,
9036 const sigset_t __user **sig)
9037{
9038 struct io_uring_getevents_arg arg;
9039
9040 /*
9041 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9042 * is just a pointer to the sigset_t.
9043 */
9044 if (!(flags & IORING_ENTER_EXT_ARG)) {
9045 *sig = (const sigset_t __user *) argp;
9046 *ts = NULL;
9047 return 0;
9048 }
9049
9050 /*
9051	 * EXT_ARG is set - ensure we agree on its size and, if so, copy in
9052	 * our timespec and sigset_t pointers.
9053 */
9054 if (*argsz != sizeof(arg))
9055 return -EINVAL;
9056 if (copy_from_user(&arg, argp, sizeof(arg)))
9057 return -EFAULT;
9058 *sig = u64_to_user_ptr(arg.sigmask);
9059 *argsz = arg.sigmask_sz;
9060 *ts = u64_to_user_ptr(arg.ts);
9061 return 0;
9062}
9063
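On the userspace side the extended argument parsed above looks roughly like the sketch below, assuming uapi headers that define struct io_uring_getevents_arg (5.11+) and using the raw syscall; the helper name wait_cqes_timeout() is invented, and the kernel expects sigmask_sz to be the kernel sigset size (_NSIG / 8), not sizeof(sigset_t):

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int wait_cqes_timeout(int ring_fd, unsigned min_complete,
		      const sigset_t *mask, const struct __kernel_timespec *ts)
{
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.sigmask = (uint64_t)(uintptr_t)mask;
	arg.sigmask_sz = mask ? _NSIG / 8 : 0;
	arg.ts = (uint64_t)(uintptr_t)ts;

	/* argp/argsz now describe the struct, not a bare sigset_t. */
	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}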
2b188cc1 9064SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
9065 u32, min_complete, u32, flags, const void __user *, argp,
9066 size_t, argsz)
2b188cc1
JA
9067{
9068 struct io_ring_ctx *ctx;
9069 long ret = -EBADF;
9070 int submitted = 0;
9071 struct fd f;
9072
4c6e277c 9073 io_run_task_work();
b41e9852 9074
90554200 9075 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
c73ebb68 9076 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
2b188cc1
JA
9077 return -EINVAL;
9078
9079 f = fdget(fd);
9080 if (!f.file)
9081 return -EBADF;
9082
9083 ret = -EOPNOTSUPP;
9084 if (f.file->f_op != &io_uring_fops)
9085 goto out_fput;
9086
9087 ret = -ENXIO;
9088 ctx = f.file->private_data;
9089 if (!percpu_ref_tryget(&ctx->refs))
9090 goto out_fput;
9091
7e84e1c7
SG
9092 ret = -EBADFD;
9093 if (ctx->flags & IORING_SETUP_R_DISABLED)
9094 goto out;
9095
6c271ce2
JA
9096 /*
9097 * For SQ polling, the thread will do all submissions and completions.
9098 * Just return the requested submit count, and wake the thread if
9099 * we were asked to.
9100 */
b2a9eada 9101 ret = 0;
6c271ce2 9102 if (ctx->flags & IORING_SETUP_SQPOLL) {
6c503150 9103 io_cqring_overflow_flush(ctx, false, NULL, NULL);
89448c47 9104
d9d05217 9105 ret = -EOWNERDEAD;
04147488
SM
9106 if (unlikely(ctx->sq_data->thread == NULL)) {
9107 goto out;
9108 }
6c271ce2 9109 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 9110 wake_up(&ctx->sq_data->wait);
d9d05217
PB
9111 if (flags & IORING_ENTER_SQ_WAIT) {
9112 ret = io_sqpoll_wait_sq(ctx);
9113 if (ret)
9114 goto out;
9115 }
6c271ce2 9116 submitted = to_submit;
b2a9eada 9117 } else if (to_submit) {
baf186c4 9118 ret = io_uring_add_task_file(ctx);
0f212204
JA
9119 if (unlikely(ret))
9120 goto out;
2b188cc1 9121 mutex_lock(&ctx->uring_lock);
0f212204 9122 submitted = io_submit_sqes(ctx, to_submit);
2b188cc1 9123 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
9124
9125 if (submitted != to_submit)
9126 goto out;
2b188cc1
JA
9127 }
9128 if (flags & IORING_ENTER_GETEVENTS) {
c73ebb68
HX
9129 const sigset_t __user *sig;
9130 struct __kernel_timespec __user *ts;
9131
9132 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9133 if (unlikely(ret))
9134 goto out;
9135
2b188cc1
JA
9136 min_complete = min(min_complete, ctx->cq_entries);
9137
32b2244a
XW
9138 /*
9139 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9140		 * space applications don't need to poll for io completion
9141		 * events themselves; they can rely on io_sq_thread to do the
9142		 * polling work, which reduces cpu usage and uring_lock contention.
9143 */
9144 if (ctx->flags & IORING_SETUP_IOPOLL &&
9145 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7668b92a 9146 ret = io_iopoll_check(ctx, min_complete);
def596e9 9147 } else {
c73ebb68 9148 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 9149 }
2b188cc1
JA
9150 }
9151
7c504e65 9152out:
6805b32e 9153 percpu_ref_put(&ctx->refs);
2b188cc1
JA
9154out_fput:
9155 fdput(f);
9156 return submitted ? submitted : ret;
9157}
9158
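As a small userspace counterpart to the SQPOLL branch above: with IORING_SETUP_SQPOLL the application only needs to enter the kernel when the poller thread has gone idle and set IORING_SQ_NEED_WAKEUP in the SQ ring flags. A rough sketch (the helper name sqpoll_submit() is invented, sq_flags points at the __u32 at p->sq_off.flags in the SQ mapping, and the memory-barrier requirements between updating the SQ tail and reading the flags are omitted for brevity):

#include <linux/io_uring.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

int sqpoll_submit(int ring_fd, const _Atomic unsigned *sq_flags,
		  unsigned to_submit)
{
	unsigned flags = atomic_load_explicit(sq_flags, memory_order_acquire);

	if (flags & IORING_SQ_NEED_WAKEUP)
		return syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			       IORING_ENTER_SQ_WAKEUP, NULL, 0);

	/* The SQ thread is awake and will pick the new entries up itself. */
	return to_submit;
}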
bebdb65e 9159#ifdef CONFIG_PROC_FS
61cf9370
MWO
9160static int io_uring_show_cred(struct seq_file *m, unsigned int id,
9161 const struct cred *cred)
87ce955b 9162{
87ce955b
JA
9163 struct user_namespace *uns = seq_user_ns(m);
9164 struct group_info *gi;
9165 kernel_cap_t cap;
9166 unsigned __capi;
9167 int g;
9168
9169 seq_printf(m, "%5d\n", id);
9170 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9171 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9172 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9173 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9174 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9175 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9176 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9177 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9178 seq_puts(m, "\n\tGroups:\t");
9179 gi = cred->group_info;
9180 for (g = 0; g < gi->ngroups; g++) {
9181 seq_put_decimal_ull(m, g ? " " : "",
9182 from_kgid_munged(uns, gi->gid[g]));
9183 }
9184 seq_puts(m, "\n\tCapEff:\t");
9185 cap = cred->cap_effective;
9186 CAP_FOR_EACH_U32(__capi)
9187 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9188 seq_putc(m, '\n');
9189 return 0;
9190}
9191
9192static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9193{
dbbe9c64 9194 struct io_sq_data *sq = NULL;
fad8e0de 9195 bool has_lock;
87ce955b
JA
9196 int i;
9197
fad8e0de
JA
9198 /*
9199 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9200	 * since the fdinfo case grabs them in the opposite order of normal use
9201 * cases. If we fail to get the lock, we just don't iterate any
9202 * structures that could be going away outside the io_uring mutex.
9203 */
9204 has_lock = mutex_trylock(&ctx->uring_lock);
9205
5f3f26f9 9206 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 9207 sq = ctx->sq_data;
5f3f26f9
JA
9208 if (!sq->thread)
9209 sq = NULL;
9210 }
dbbe9c64
JQ
9211
9212 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9213 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 9214 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 9215 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
ea64ec02 9216 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
87ce955b 9217
87ce955b
JA
9218 if (f)
9219 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9220 else
9221 seq_printf(m, "%5u: <none>\n", i);
9222 }
9223 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 9224 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
87ce955b
JA
9225 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9226
9227 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9228 (unsigned int) buf->len);
9229 }
61cf9370
MWO
9230 if (has_lock && !xa_empty(&ctx->personalities)) {
9231 unsigned long index;
9232 const struct cred *cred;
9233
87ce955b 9234 seq_printf(m, "Personalities:\n");
61cf9370
MWO
9235 xa_for_each(&ctx->personalities, index, cred)
9236 io_uring_show_cred(m, index, cred);
87ce955b 9237 }
d7718a9d
JA
9238 seq_printf(m, "PollList:\n");
9239 spin_lock_irq(&ctx->completion_lock);
9240 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9241 struct hlist_head *list = &ctx->cancel_hash[i];
9242 struct io_kiocb *req;
9243
9244 hlist_for_each_entry(req, list, hash_node)
9245 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
9246 req->task->task_works != NULL);
9247 }
9248 spin_unlock_irq(&ctx->completion_lock);
fad8e0de
JA
9249 if (has_lock)
9250 mutex_unlock(&ctx->uring_lock);
87ce955b
JA
9251}
9252
9253static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9254{
9255 struct io_ring_ctx *ctx = f->private_data;
9256
9257 if (percpu_ref_tryget(&ctx->refs)) {
9258 __io_uring_show_fdinfo(ctx, m);
9259 percpu_ref_put(&ctx->refs);
9260 }
9261}
bebdb65e 9262#endif
87ce955b 9263
2b188cc1
JA
9264static const struct file_operations io_uring_fops = {
9265 .release = io_uring_release,
9266 .mmap = io_uring_mmap,
6c5c240e
RP
9267#ifndef CONFIG_MMU
9268 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9269 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9270#endif
2b188cc1
JA
9271 .poll = io_uring_poll,
9272 .fasync = io_uring_fasync,
bebdb65e 9273#ifdef CONFIG_PROC_FS
87ce955b 9274 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 9275#endif
2b188cc1
JA
9276};
9277
9278static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9279 struct io_uring_params *p)
9280{
75b28aff
HV
9281 struct io_rings *rings;
9282 size_t size, sq_array_offset;
2b188cc1 9283
bd740481
JA
9284 /* make sure these are sane, as we already accounted them */
9285 ctx->sq_entries = p->sq_entries;
9286 ctx->cq_entries = p->cq_entries;
9287
75b28aff
HV
9288 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9289 if (size == SIZE_MAX)
9290 return -EOVERFLOW;
9291
9292 rings = io_mem_alloc(size);
9293 if (!rings)
2b188cc1
JA
9294 return -ENOMEM;
9295
75b28aff
HV
9296 ctx->rings = rings;
9297 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9298 rings->sq_ring_mask = p->sq_entries - 1;
9299 rings->cq_ring_mask = p->cq_entries - 1;
9300 rings->sq_ring_entries = p->sq_entries;
9301 rings->cq_ring_entries = p->cq_entries;
9302 ctx->sq_mask = rings->sq_ring_mask;
9303 ctx->cq_mask = rings->cq_ring_mask;
2b188cc1
JA
9304
9305 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
9306 if (size == SIZE_MAX) {
9307 io_mem_free(ctx->rings);
9308 ctx->rings = NULL;
2b188cc1 9309 return -EOVERFLOW;
eb065d30 9310 }
2b188cc1
JA
9311
9312 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
9313 if (!ctx->sq_sqes) {
9314 io_mem_free(ctx->rings);
9315 ctx->rings = NULL;
2b188cc1 9316 return -ENOMEM;
eb065d30 9317 }
2b188cc1 9318
2b188cc1
JA
9319 return 0;
9320}
9321
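The ring masks initialised above are simply entries - 1, so both rings are indexed with a free-running counter and a power-of-two mask. A minimal userspace sketch of the submission-side indexing (struct sq and sq_push() are invented names; the pointers are the fields located via p->sq_off in the SQ mapping):

#include <linux/io_uring.h>
#include <stdatomic.h>

struct sq {
	_Atomic unsigned *tail;		/* at p->sq_off.tail      */
	const unsigned *ring_mask;	/* at p->sq_off.ring_mask */
	unsigned *array;		/* at p->sq_off.array     */
};

/* Publish one prepared SQE slot; returns the array index that was used. */
unsigned sq_push(struct sq *sq, unsigned sqe_index)
{
	unsigned tail = atomic_load_explicit(sq->tail, memory_order_relaxed);
	unsigned idx = tail & *sq->ring_mask;

	sq->array[idx] = sqe_index;
	/* Make the array entry visible before the new tail is. */
	atomic_store_explicit(sq->tail, tail + 1, memory_order_release);
	return idx;
}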
9faadcc8
PB
9322static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9323{
9324 int ret, fd;
9325
9326 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9327 if (fd < 0)
9328 return fd;
9329
baf186c4 9330 ret = io_uring_add_task_file(ctx);
9faadcc8
PB
9331 if (ret) {
9332 put_unused_fd(fd);
9333 return ret;
9334 }
9335 fd_install(fd, file);
9336 return fd;
9337}
9338
2b188cc1
JA
9339/*
9340 * Allocate an anonymous fd; this is what constitutes the application-
9341 * visible backing of an io_uring instance. The application mmaps this
9342 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9343 * we have to tie this fd to a socket for file garbage collection purposes.
9344 */
9faadcc8 9345static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
9346{
9347 struct file *file;
9faadcc8 9348#if defined(CONFIG_UNIX)
2b188cc1
JA
9349 int ret;
9350
2b188cc1
JA
9351 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9352 &ctx->ring_sock);
9353 if (ret)
9faadcc8 9354 return ERR_PTR(ret);
2b188cc1
JA
9355#endif
9356
2b188cc1
JA
9357 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9358 O_RDWR | O_CLOEXEC);
2b188cc1 9359#if defined(CONFIG_UNIX)
9faadcc8
PB
9360 if (IS_ERR(file)) {
9361 sock_release(ctx->ring_sock);
9362 ctx->ring_sock = NULL;
9363 } else {
9364 ctx->ring_sock->file = file;
0f212204 9365 }
2b188cc1 9366#endif
9faadcc8 9367 return file;
2b188cc1
JA
9368}
9369
7f13657d
XW
9370static int io_uring_create(unsigned entries, struct io_uring_params *p,
9371 struct io_uring_params __user *params)
2b188cc1 9372{
2b188cc1 9373 struct io_ring_ctx *ctx;
9faadcc8 9374 struct file *file;
2b188cc1
JA
9375 int ret;
9376
8110c1a6 9377 if (!entries)
2b188cc1 9378 return -EINVAL;
8110c1a6
JA
9379 if (entries > IORING_MAX_ENTRIES) {
9380 if (!(p->flags & IORING_SETUP_CLAMP))
9381 return -EINVAL;
9382 entries = IORING_MAX_ENTRIES;
9383 }
2b188cc1
JA
9384
9385 /*
9386 * Use twice as many entries for the CQ ring. It's possible for the
9387 * application to drive a higher depth than the size of the SQ ring,
9388 * since the sqes are only used at submission time. This allows for
33a107f0
JA
9389 * some flexibility in overcommitting a bit. If the application has
9390 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9391 * of CQ ring entries manually.
2b188cc1
JA
9392 */
9393 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
9394 if (p->flags & IORING_SETUP_CQSIZE) {
9395 /*
9396 * If IORING_SETUP_CQSIZE is set, we do the same roundup
9397 * to a power-of-two, if it isn't already. We do NOT impose
9398 * any cq vs sq ring sizing.
9399 */
eb2667b3 9400 if (!p->cq_entries)
33a107f0 9401 return -EINVAL;
8110c1a6
JA
9402 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9403 if (!(p->flags & IORING_SETUP_CLAMP))
9404 return -EINVAL;
9405 p->cq_entries = IORING_MAX_CQ_ENTRIES;
9406 }
eb2667b3
JQ
9407 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9408 if (p->cq_entries < p->sq_entries)
9409 return -EINVAL;
33a107f0
JA
9410 } else {
9411 p->cq_entries = 2 * p->sq_entries;
9412 }
2b188cc1 9413
2b188cc1 9414 ctx = io_ring_ctx_alloc(p);
62e398be 9415 if (!ctx)
2b188cc1 9416 return -ENOMEM;
2b188cc1 9417 ctx->compat = in_compat_syscall();
62e398be
JA
9418 if (!capable(CAP_IPC_LOCK))
9419 ctx->user = get_uid(current_user());
2aede0e4
JA
9420
9421 /*
9422	 * This is grabbed purely for accounting purposes. When a process exits,
9423	 * the mm is exited and dropped before the files, hence we need to hang
9424	 * on to this mm so that we can unaccount memory (locked/pinned vm).
9425	 * It's not used for anything else.
9426 */
6b7898eb 9427 mmgrab(current->mm);
2aede0e4 9428 ctx->mm_account = current->mm;
6b7898eb 9429
2b188cc1
JA
9430 ret = io_allocate_scq_urings(ctx, p);
9431 if (ret)
9432 goto err;
9433
7e84e1c7 9434 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
9435 if (ret)
9436 goto err;
9437
2b188cc1 9438 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
9439 p->sq_off.head = offsetof(struct io_rings, sq.head);
9440 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9441 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9442 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9443 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9444 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9445 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
9446
9447 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
9448 p->cq_off.head = offsetof(struct io_rings, cq.head);
9449 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9450 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9451 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9452 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9453 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 9454 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 9455
7f13657d
XW
9456 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9457 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 9458 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 9459 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
1c0aa1fa 9460 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
7f13657d
XW
9461
9462 if (copy_to_user(params, p, sizeof(*p))) {
9463 ret = -EFAULT;
9464 goto err;
9465 }
d1719f70 9466
9faadcc8
PB
9467 file = io_uring_get_file(ctx);
9468 if (IS_ERR(file)) {
9469 ret = PTR_ERR(file);
9470 goto err;
9471 }
9472
044c1ab3
JA
9473 /*
9474 * Install ring fd as the very last thing, so we don't risk someone
9475 * having closed it before we finish setup
9476 */
9faadcc8
PB
9477 ret = io_uring_install_fd(ctx, file);
9478 if (ret < 0) {
9479 /* fput will clean it up */
9480 fput(file);
9481 return ret;
9482 }
044c1ab3 9483
c826bd7a 9484 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
9485 return ret;
9486err:
9487 io_ring_ctx_wait_and_kill(ctx);
9488 return ret;
9489}
9490
9491/*
9492 * Sets up an io_uring context and returns the fd. The application asks for a
9493 * ring size; we return the actual sq/cq ring sizes (among other things) in the
9494 * params structure passed in.
9495 */
9496static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9497{
9498 struct io_uring_params p;
2b188cc1
JA
9499 int i;
9500
9501 if (copy_from_user(&p, params, sizeof(p)))
9502 return -EFAULT;
9503 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9504 if (p.resv[i])
9505 return -EINVAL;
9506 }
9507
6c271ce2 9508 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 9509 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7
SG
9510 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9511 IORING_SETUP_R_DISABLED))
2b188cc1
JA
9512 return -EINVAL;
9513
7f13657d 9514 return io_uring_create(entries, &p, params);
2b188cc1
JA
9515}
9516
9517SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9518 struct io_uring_params __user *, params)
9519{
9520 return io_uring_setup(entries, params);
9521}
9522
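A minimal userspace sketch of the sizing behaviour implemented in io_uring_create() above: entry counts are rounded up to powers of two, IORING_SETUP_CQSIZE lets the caller choose the CQ depth, and IORING_SETUP_CLAMP caps oversized requests instead of failing them. The helper name setup_ring() is invented; the actual sizes come back in the params structure:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int setup_ring(unsigned sq_entries, unsigned cq_entries,
	       struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p->cq_entries = cq_entries;

	/* On success p->sq_entries / p->cq_entries hold the rounded sizes. */
	return syscall(__NR_io_uring_setup, sq_entries, p);
}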
66f4af93
JA
9523static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9524{
9525 struct io_uring_probe *p;
9526 size_t size;
9527 int i, ret;
9528
9529 size = struct_size(p, ops, nr_args);
9530 if (size == SIZE_MAX)
9531 return -EOVERFLOW;
9532 p = kzalloc(size, GFP_KERNEL);
9533 if (!p)
9534 return -ENOMEM;
9535
9536 ret = -EFAULT;
9537 if (copy_from_user(p, arg, size))
9538 goto out;
9539 ret = -EINVAL;
9540 if (memchr_inv(p, 0, size))
9541 goto out;
9542
9543 p->last_op = IORING_OP_LAST - 1;
9544 if (nr_args > IORING_OP_LAST)
9545 nr_args = IORING_OP_LAST;
9546
9547 for (i = 0; i < nr_args; i++) {
9548 p->ops[i].op = i;
9549 if (!io_op_defs[i].not_supported)
9550 p->ops[i].flags = IO_URING_OP_SUPPORTED;
9551 }
9552 p->ops_len = i;
9553
9554 ret = 0;
9555 if (copy_to_user(arg, p, size))
9556 ret = -EFAULT;
9557out:
9558 kfree(p);
9559 return ret;
9560}
9561
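From userspace, io_probe() above is driven through IORING_REGISTER_PROBE: hand in a zeroed probe with room for up to 256 ops and check IO_URING_OP_SUPPORTED per opcode. A rough sketch (the helper name op_supported() is invented):

#include <linux/io_uring.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int op_supported(int ring_fd, unsigned opcode)
{
	size_t len = sizeof(struct io_uring_probe) +
		     256 * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, len);
	int ret = -1;

	if (!probe)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    probe, 256) >= 0)
		ret = opcode < probe->ops_len &&
		      (probe->ops[opcode].flags & IO_URING_OP_SUPPORTED);
	free(probe);
	return ret;
}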
071698e1
JA
9562static int io_register_personality(struct io_ring_ctx *ctx)
9563{
4379bf8b 9564 const struct cred *creds;
61cf9370 9565 u32 id;
1e6fa521 9566 int ret;
071698e1 9567
4379bf8b 9568 creds = get_current_cred();
1e6fa521 9569
61cf9370
MWO
9570 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
9571 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
9572 if (!ret)
9573 return id;
9574 put_cred(creds);
1e6fa521 9575 return ret;
071698e1
JA
9576}
9577
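From userspace the personality registered above is obtained with IORING_REGISTER_PERSONALITY (no argument); the returned id can then be placed in sqe->personality so that request is issued with the credentials that were current at registration time. A one-call sketch (register_current_creds() is an invented name):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

int register_current_creds(int ring_fd)
{
	/* Returns the personality id (>= 0), or -1 with errno set. */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}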
21b55dbc
SG
9578static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9579 unsigned int nr_args)
9580{
9581 struct io_uring_restriction *res;
9582 size_t size;
9583 int i, ret;
9584
7e84e1c7
SG
9585 /* Restrictions allowed only if rings started disabled */
9586 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9587 return -EBADFD;
9588
21b55dbc 9589 /* We allow only a single restrictions registration */
7e84e1c7 9590 if (ctx->restrictions.registered)
21b55dbc
SG
9591 return -EBUSY;
9592
9593 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9594 return -EINVAL;
9595
9596 size = array_size(nr_args, sizeof(*res));
9597 if (size == SIZE_MAX)
9598 return -EOVERFLOW;
9599
9600 res = memdup_user(arg, size);
9601 if (IS_ERR(res))
9602 return PTR_ERR(res);
9603
9604 ret = 0;
9605
9606 for (i = 0; i < nr_args; i++) {
9607 switch (res[i].opcode) {
9608 case IORING_RESTRICTION_REGISTER_OP:
9609 if (res[i].register_op >= IORING_REGISTER_LAST) {
9610 ret = -EINVAL;
9611 goto out;
9612 }
9613
9614 __set_bit(res[i].register_op,
9615 ctx->restrictions.register_op);
9616 break;
9617 case IORING_RESTRICTION_SQE_OP:
9618 if (res[i].sqe_op >= IORING_OP_LAST) {
9619 ret = -EINVAL;
9620 goto out;
9621 }
9622
9623 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9624 break;
9625 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9626 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9627 break;
9628 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9629 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9630 break;
9631 default:
9632 ret = -EINVAL;
9633 goto out;
9634 }
9635 }
9636
9637out:
9638 /* Reset all restrictions if an error happened */
9639 if (ret != 0)
9640 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9641 else
7e84e1c7 9642 ctx->restrictions.registered = true;
21b55dbc
SG
9643
9644 kfree(res);
9645 return ret;
9646}
9647
7e84e1c7
SG
9648static int io_register_enable_rings(struct io_ring_ctx *ctx)
9649{
9650 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9651 return -EBADFD;
9652
9653 if (ctx->restrictions.registered)
9654 ctx->restricted = 1;
9655
0298ef96
PB
9656 ctx->flags &= ~IORING_SETUP_R_DISABLED;
9657 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
9658 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
9659 return 0;
9660}
9661
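Tying the two handlers above together from userspace: the ring must be created with IORING_SETUP_R_DISABLED, the allow-lists are registered while it is still disabled, and IORING_REGISTER_ENABLE_RINGS then activates both the ring and the restrictions. A rough sketch (restrict_to_readv() is an invented name):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

int restrict_to_readv(int ring_fd)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP,
		  .sqe_op = IORING_OP_READV },
		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
		  .register_op = IORING_REGISTER_ENABLE_RINGS },
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;

	/* Once enabled, only the allow-listed SQE and register ops pass. */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}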
071698e1
JA
9662static bool io_register_op_must_quiesce(int op)
9663{
9664 switch (op) {
9665 case IORING_UNREGISTER_FILES:
9666 case IORING_REGISTER_FILES_UPDATE:
9667 case IORING_REGISTER_PROBE:
9668 case IORING_REGISTER_PERSONALITY:
9669 case IORING_UNREGISTER_PERSONALITY:
9670 return false;
9671 default:
9672 return true;
9673 }
9674}
9675
edafccee
JA
9676static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9677 void __user *arg, unsigned nr_args)
b19062a5
JA
9678 __releases(ctx->uring_lock)
9679 __acquires(ctx->uring_lock)
edafccee
JA
9680{
9681 int ret;
9682
35fa71a0
JA
9683 /*
9684 * We're inside the ring mutex, if the ref is already dying, then
9685 * someone else killed the ctx or is already going through
9686 * io_uring_register().
9687 */
9688 if (percpu_ref_is_dying(&ctx->refs))
9689 return -ENXIO;
9690
071698e1 9691 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 9692 percpu_ref_kill(&ctx->refs);
b19062a5 9693
05f3fb3c
JA
9694 /*
9695 * Drop uring mutex before waiting for references to exit. If
9696 * another thread is currently inside io_uring_enter() it might
9697 * need to grab the uring_lock to make progress. If we hold it
9698 * here across the drain wait, then we can deadlock. It's safe
9699 * to drop the mutex here, since no new references will come in
9700 * after we've killed the percpu ref.
9701 */
9702 mutex_unlock(&ctx->uring_lock);
af9c1a44
JA
9703 do {
9704 ret = wait_for_completion_interruptible(&ctx->ref_comp);
9705 if (!ret)
9706 break;
ed6930c9
JA
9707 ret = io_run_task_work_sig();
9708 if (ret < 0)
9709 break;
af9c1a44
JA
9710 } while (1);
9711
05f3fb3c 9712 mutex_lock(&ctx->uring_lock);
af9c1a44 9713
c150368b
JA
9714 if (ret) {
9715 percpu_ref_resurrect(&ctx->refs);
21b55dbc
SG
9716 goto out_quiesce;
9717 }
9718 }
9719
9720 if (ctx->restricted) {
9721 if (opcode >= IORING_REGISTER_LAST) {
9722 ret = -EINVAL;
9723 goto out;
9724 }
9725
9726 if (!test_bit(opcode, ctx->restrictions.register_op)) {
9727 ret = -EACCES;
c150368b
JA
9728 goto out;
9729 }
05f3fb3c 9730 }
edafccee
JA
9731
9732 switch (opcode) {
9733 case IORING_REGISTER_BUFFERS:
0a96bbe4 9734 ret = io_sqe_buffers_register(ctx, arg, nr_args);
edafccee
JA
9735 break;
9736 case IORING_UNREGISTER_BUFFERS:
9737 ret = -EINVAL;
9738 if (arg || nr_args)
9739 break;
0a96bbe4 9740 ret = io_sqe_buffers_unregister(ctx);
edafccee 9741 break;
6b06314c
JA
9742 case IORING_REGISTER_FILES:
9743 ret = io_sqe_files_register(ctx, arg, nr_args);
9744 break;
9745 case IORING_UNREGISTER_FILES:
9746 ret = -EINVAL;
9747 if (arg || nr_args)
9748 break;
9749 ret = io_sqe_files_unregister(ctx);
9750 break;
c3a31e60
JA
9751 case IORING_REGISTER_FILES_UPDATE:
9752 ret = io_sqe_files_update(ctx, arg, nr_args);
9753 break;
9b402849 9754 case IORING_REGISTER_EVENTFD:
f2842ab5 9755 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
9756 ret = -EINVAL;
9757 if (nr_args != 1)
9758 break;
9759 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
9760 if (ret)
9761 break;
9762 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9763 ctx->eventfd_async = 1;
9764 else
9765 ctx->eventfd_async = 0;
9b402849
JA
9766 break;
9767 case IORING_UNREGISTER_EVENTFD:
9768 ret = -EINVAL;
9769 if (arg || nr_args)
9770 break;
9771 ret = io_eventfd_unregister(ctx);
9772 break;
66f4af93
JA
9773 case IORING_REGISTER_PROBE:
9774 ret = -EINVAL;
9775 if (!arg || nr_args > 256)
9776 break;
9777 ret = io_probe(ctx, arg, nr_args);
9778 break;
071698e1
JA
9779 case IORING_REGISTER_PERSONALITY:
9780 ret = -EINVAL;
9781 if (arg || nr_args)
9782 break;
9783 ret = io_register_personality(ctx);
9784 break;
9785 case IORING_UNREGISTER_PERSONALITY:
9786 ret = -EINVAL;
9787 if (arg)
9788 break;
9789 ret = io_unregister_personality(ctx, nr_args);
9790 break;
7e84e1c7
SG
9791 case IORING_REGISTER_ENABLE_RINGS:
9792 ret = -EINVAL;
9793 if (arg || nr_args)
9794 break;
9795 ret = io_register_enable_rings(ctx);
9796 break;
21b55dbc
SG
9797 case IORING_REGISTER_RESTRICTIONS:
9798 ret = io_register_restrictions(ctx, arg, nr_args);
9799 break;
edafccee
JA
9800 default:
9801 ret = -EINVAL;
9802 break;
9803 }
9804
21b55dbc 9805out:
071698e1 9806 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 9807 /* bring the ctx back to life */
05f3fb3c 9808 percpu_ref_reinit(&ctx->refs);
21b55dbc 9809out_quiesce:
0f158b4c 9810 reinit_completion(&ctx->ref_comp);
05f3fb3c 9811 }
edafccee
JA
9812 return ret;
9813}
9814
9815SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9816 void __user *, arg, unsigned int, nr_args)
9817{
9818 struct io_ring_ctx *ctx;
9819 long ret = -EBADF;
9820 struct fd f;
9821
9822 f = fdget(fd);
9823 if (!f.file)
9824 return -EBADF;
9825
9826 ret = -EOPNOTSUPP;
9827 if (f.file->f_op != &io_uring_fops)
9828 goto out_fput;
9829
9830 ctx = f.file->private_data;
9831
b6c23dd5
PB
9832 io_run_task_work();
9833
edafccee
JA
9834 mutex_lock(&ctx->uring_lock);
9835 ret = __io_uring_register(ctx, opcode, arg, nr_args);
9836 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
9837 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9838 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
9839out_fput:
9840 fdput(f);
9841 return ret;
9842}
9843
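As a final userspace example of the register path above, IORING_REGISTER_EVENTFD attaches an eventfd that is signalled when completions are posted (IORING_REGISTER_EVENTFD_ASYNC restricts that to completions generated out of line). The helper name attach_eventfd() is invented:

#include <linux/io_uring.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

int attach_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (efd < 0)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* poll/read this fd to learn about new CQEs */
}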
2b188cc1
JA
9844static int __init io_uring_init(void)
9845{
d7f62e82
SM
9846#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9847 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9848 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9849} while (0)
9850
9851#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9852 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9853 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9854 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
9855 BUILD_BUG_SQE_ELEM(1, __u8, flags);
9856 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
9857 BUILD_BUG_SQE_ELEM(4, __s32, fd);
9858 BUILD_BUG_SQE_ELEM(8, __u64, off);
9859 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
9860 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 9861 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
9862 BUILD_BUG_SQE_ELEM(24, __u32, len);
9863 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
9864 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
9865 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9866 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
9867 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
9868 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
9869 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
9870 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
9871 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
9872 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
9873 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
9874 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
9875 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
9876 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 9877 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
9878 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
9879 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
9880 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 9881 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
d7f62e82 9882
d3656344 9883 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
84557871 9884 BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
91f245d5
JA
9885 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
9886 SLAB_ACCOUNT);
2b188cc1
JA
9887 return 0;
9888};
9889__initcall(io_uring_init);