io-wq: don't create any IO workers upfront
[mirror_ubuntu-jammy-kernel.git] / fs / io_uring.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqring (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
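
/*
 * Illustrative sketch, not part of this file: one way an application could
 * implement the SQ-side ordering rules described above. Real applications
 * normally use liburing; "struct app_sq" and app_ring_enter() below are
 * hypothetical names used only to make the barrier pairing concrete.
 */
#if 0
struct app_sq {
	unsigned		*khead;		/* mmapped io_rings->sq.head */
	unsigned		*ktail;		/* mmapped io_rings->sq.tail */
	unsigned		*kring_mask;	/* mmapped io_rings->sq_ring_mask */
	unsigned		*kring_entries;	/* mmapped io_rings->sq_ring_entries */
	unsigned		*kflags;	/* mmapped io_rings->sq_flags */
	unsigned		*array;		/* mmapped sq_array indirection */
	struct io_uring_sqe	*sqes;		/* mmapped SQE array */
	int			ring_fd;
};

static int app_submit_sqe(struct app_sq *sq, const struct io_uring_sqe *sqe)
{
	unsigned head = smp_load_acquire(sq->khead);	/* order head load before SQE stores */
	unsigned tail = *sq->ktail;			/* SQ tail is application-owned */
	unsigned index;

	if (tail - head == *sq->kring_entries)
		return -EBUSY;				/* ring full */

	index = tail & *sq->kring_mask;
	sq->sqes[index] = *sqe;				/* store the SQE first */
	sq->array[index] = index;

	/* publish: SQE stores must be visible before the new tail value */
	smp_store_release(sq->ktail, tail + 1);

	/*
	 * IORING_SETUP_SQPOLL only: a full barrier is needed between the tail
	 * update above and the IORING_SQ_NEED_WAKEUP check below.
	 */
	smp_mb();
	if (READ_ONCE(*sq->kflags) & IORING_SQ_NEED_WAKEUP)
		app_ring_enter(sq->ring_fd, IORING_ENTER_SQ_WAKEUP);	/* io_uring_enter() wrapper */
	return 0;
}
#endif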
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/blk-cgroup.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
				IOSQE_BUFFER_SELECT)

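/*
 * Illustrative sketch (an assumption, not code from this file): the
 * SHIFT/MASK constants above split a fixed-file index into a table slot
 * plus an offset within that table, so 64 tables of 512 pointers cover
 * IORING_MAX_FIXED_FILES files. "example_fixed_file_lookup" is a
 * hypothetical helper; struct fixed_rsrc_table is defined further down.
 */
#if 0
static struct file *example_fixed_file_lookup(struct fixed_rsrc_table *tables,
					      unsigned int i)
{
	struct fixed_rsrc_table *table = &tables[i >> IORING_FILE_TABLE_SHIFT];

	return table->files[i & IORING_FILE_TABLE_MASK];
}
#endif
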
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};

enum io_uring_cmd_flags {
	IO_URING_F_NONBLOCK		= 1,
	IO_URING_F_COMPLETE_DEFER	= 2,
};

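/*
 * Illustrative sketch, not part of this file: how an application might reap
 * completions from the shared io_rings area above. The application owns the
 * CQ head and the kernel owns the CQ tail; the acquire/release pairing
 * follows the rules in the comment at the top of this file. "struct app_cq"
 * is a hypothetical userspace view of the mmapped ring.
 */
#if 0
struct app_cq {
	unsigned		*khead;		/* mmapped io_rings->cq.head */
	unsigned		*ktail;		/* mmapped io_rings->cq.tail */
	unsigned		*kring_mask;	/* mmapped io_rings->cq_ring_mask */
	struct io_uring_cqe	*cqes;		/* mmapped io_rings->cqes[] */
};

static int app_reap_cqe(struct app_cq *cq, struct io_uring_cqe *out)
{
	unsigned head = *cq->khead;			/* CQ head is application-owned */
	unsigned tail = smp_load_acquire(cq->ktail);	/* pairs with the kernel's release */

	if (head == tail)
		return 0;				/* nothing new */

	*out = cq->cqes[head & *cq->kring_mask];	/* copy out before releasing the slot */
	smp_store_release(cq->khead, head + 1);		/* orders the CQE load before the head store */
	return 1;
}
#endif
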
struct io_mapped_ubuf {
	u64		ubuf;
	size_t		len;
	struct bio_vec	*bvec;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
};

struct io_ring_ctx;

struct io_rsrc_put {
	struct list_head list;
	union {
		void *rsrc;
		struct file *file;
	};
};

struct fixed_rsrc_table {
	struct file		**files;
};

struct fixed_rsrc_ref_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	struct fixed_rsrc_data		*rsrc_data;
	void				(*rsrc_put)(struct io_ring_ctx *ctx,
						    struct io_rsrc_put *prsrc);
	struct llist_node		llist;
	bool				done;
};

struct fixed_rsrc_data {
	struct fixed_rsrc_table		*table;
	struct io_ring_ctx		*ctx;

	struct fixed_rsrc_ref_node	*node;
	struct percpu_ref		refs;
	struct completion		done;
	bool				quiesce;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__s32 len;
	__u16 bid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

struct io_sq_data {
	refcount_t		refs;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;
	struct list_head	ctx_new_list;
	struct mutex		ctx_lock;

	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
};

#define IO_IOPOLL_BATCH			8
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

struct io_comp_state {
	struct io_kiocb		*reqs[IO_COMPL_BATCH];
	unsigned int		nr;
	unsigned int		locked_free_nr;
	/* inline/task_work completion list, under ->uring_lock */
	struct list_head	free_list;
	/* IRQ completion list, under ->completion_lock */
	struct list_head	locked_free_list;
};

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	struct blk_plug		plug;
	struct io_submit_link	link;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_REQ_CACHE_SIZE];
	unsigned int		free_reqs;

	bool			plug_started;

	/*
	 * Batch completion logic
	 */
	struct io_comp_state	comp;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		file_refs;
	unsigned int		ios_left;
};

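/*
 * Illustrative sketch (an assumption, not code from this file): the
 * head/last bookkeeping that struct io_submit_link above implies when
 * chaining IOSQE_IO_LINK requests at submission time. The real logic
 * lives in the submission path further down and differs in detail.
 */
#if 0
static void example_link_append(struct io_submit_link *link, struct io_kiocb *req)
{
	if (!link->head)
		link->head = req;	/* first request opens the chain */
	else
		link->last->link = req;	/* otherwise append to the current tail */
	link->last = req;
}
#endif
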
struct io_ring_ctx {
	struct {
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		limit_mem: 1;
		unsigned int		cq_overflow_flushed: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;
		unsigned int		sqo_dead: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		unsigned		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct io_submit_state		submit_state;

	struct io_rings	*rings;

	/* IO offload */
	struct io_wq		*io_wq;

	/*
	 * For SQPOLL usage - we hold a reference to the parent task, so we
	 * have access to the ->files
	 */
	struct task_struct	*sqo_task;

	/* Only used for accounting purposes */
	struct mm_struct	*mm_account;

#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state	*sqo_blkcg_css;
#endif

	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_rsrc_data	*file_data;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	const struct cred	*creds;

#ifdef CONFIG_AUDIT
	kuid_t			loginuid;
	unsigned int		sessionid;
#endif

	struct completion	ref_comp;
	struct completion	sq_thread_comp;

#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif

	struct idr		io_buffer_idr;

	struct idr		personality_idr;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned		cq_last_tm_flush;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;

		spinlock_t		inflight_lock;
		struct list_head	inflight_list;
	} ____cacheline_aligned_in_smp;

	struct delayed_work		rsrc_put_work;
	struct llist_head		rsrc_put_llist;
	struct list_head		rsrc_ref_list;
	spinlock_t			rsrc_ref_lock;

	struct io_restriction		restrictions;

	/* exit task_work */
	struct callback_head		*exit_task_work;

	/* Keep this last, we don't need it for the fast path */
	struct work_struct		exit_work;
};
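
/*
 * Illustrative sketch (an assumption, not code from this file): how the
 * sq_array indirection documented inside io_ring_ctx above is consumed.
 * The cached SQ head is masked, used to index sq_array, and the result
 * indexes the real SQE array; the actual helper in this file may differ.
 */
#if 0
static const struct io_uring_sqe *example_get_sqe(struct io_ring_ctx *ctx)
{
	unsigned head = ctx->cached_sq_head++ & ctx->sq_mask;
	unsigned sqe_idx = READ_ONCE(ctx->sq_array[head]);

	if (likely(sqe_idx < ctx->sq_entries))
		return &ctx->sq_sqes[sqe_idx];

	/* bogus index from the application; accounted via sq_dropped */
	return NULL;
}
#endif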
465
09bb8394
JA
466/*
467 * First field must be the file pointer in all the
468 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
469 */
221c5eb2
JA
470struct io_poll_iocb {
471 struct file *file;
018043be 472 struct wait_queue_head *head;
221c5eb2 473 __poll_t events;
8c838788 474 bool done;
221c5eb2 475 bool canceled;
392edb45 476 struct wait_queue_entry wait;
221c5eb2
JA
477};
478
018043be
PB
479struct io_poll_remove {
480 struct file *file;
481 u64 addr;
482};
483
b5dba59e
JA
484struct io_close {
485 struct file *file;
b5dba59e
JA
486 int fd;
487};
488
ad8a48ac
JA
489struct io_timeout_data {
490 struct io_kiocb *req;
491 struct hrtimer timer;
492 struct timespec64 ts;
493 enum hrtimer_mode mode;
494};
495
8ed8d3c3
JA
496struct io_accept {
497 struct file *file;
498 struct sockaddr __user *addr;
499 int __user *addr_len;
500 int flags;
09952e3e 501 unsigned long nofile;
8ed8d3c3
JA
502};
503
504struct io_sync {
505 struct file *file;
506 loff_t len;
507 loff_t off;
508 int flags;
d63d1b5e 509 int mode;
8ed8d3c3
JA
510};
511
fbf23849
JA
512struct io_cancel {
513 struct file *file;
514 u64 addr;
515};
516
b29472ee
JA
517struct io_timeout {
518 struct file *file;
bfe68a22
PB
519 u32 off;
520 u32 target_seq;
135fcde8 521 struct list_head list;
90cd7e42
PB
522 /* head of the link, used by linked timeouts only */
523 struct io_kiocb *head;
b29472ee
JA
524};
525
0bdf7a2d
PB
526struct io_timeout_rem {
527 struct file *file;
528 u64 addr;
9c8e11b3
PB
529
530 /* timeout update */
531 struct timespec64 ts;
532 u32 flags;
0bdf7a2d
PB
533};
534
9adbd45d
JA
535struct io_rw {
536 /* NOTE: kiocb has the file as the first member, so don't do it here */
537 struct kiocb kiocb;
538 u64 addr;
539 u64 len;
540};
541
3fbb51c1
JA
542struct io_connect {
543 struct file *file;
544 struct sockaddr __user *addr;
545 int addr_len;
546};
547
e47293fd
JA
548struct io_sr_msg {
549 struct file *file;
fddaface 550 union {
270a5940 551 struct user_msghdr __user *umsg;
fddaface
JA
552 void __user *buf;
553 };
e47293fd 554 int msg_flags;
bcda7baa 555 int bgid;
fddaface 556 size_t len;
bcda7baa 557 struct io_buffer *kbuf;
e47293fd
JA
558};
559
15b71abe
JA
560struct io_open {
561 struct file *file;
562 int dfd;
15b71abe 563 struct filename *filename;
c12cedf2 564 struct open_how how;
4022e7af 565 unsigned long nofile;
15b71abe
JA
566};
567
269bbe5f 568struct io_rsrc_update {
05f3fb3c
JA
569 struct file *file;
570 u64 arg;
571 u32 nr_args;
572 u32 offset;
573};
574
4840e418
JA
575struct io_fadvise {
576 struct file *file;
577 u64 offset;
578 u32 len;
579 u32 advice;
580};
581
c1ca757b
JA
582struct io_madvise {
583 struct file *file;
584 u64 addr;
585 u32 len;
586 u32 advice;
587};
588
3e4827b0
JA
589struct io_epoll {
590 struct file *file;
591 int epfd;
592 int op;
593 int fd;
594 struct epoll_event event;
e47293fd
JA
595};
596
7d67af2c
PB
597struct io_splice {
598 struct file *file_out;
599 struct file *file_in;
600 loff_t off_out;
601 loff_t off_in;
602 u64 len;
603 unsigned int flags;
604};
605
ddf0322d
JA
606struct io_provide_buf {
607 struct file *file;
608 __u64 addr;
609 __s32 len;
610 __u32 bgid;
611 __u16 nbufs;
612 __u16 bid;
613};
614
1d9e1288
BM
615struct io_statx {
616 struct file *file;
617 int dfd;
618 unsigned int mask;
619 unsigned int flags;
e62753e4 620 const char __user *filename;
1d9e1288
BM
621 struct statx __user *buffer;
622};
623
36f4fa68
JA
624struct io_shutdown {
625 struct file *file;
626 int how;
627};
628
80a261fd
JA
629struct io_rename {
630 struct file *file;
631 int old_dfd;
632 int new_dfd;
633 struct filename *oldpath;
634 struct filename *newpath;
635 int flags;
636};
637
14a1143b
JA
638struct io_unlink {
639 struct file *file;
640 int dfd;
641 int flags;
642 struct filename *filename;
643};
644
3ca405eb
PB
645struct io_completion {
646 struct file *file;
647 struct list_head list;
0f7e466b 648 int cflags;
3ca405eb
PB
649};
650
f499a021
JA
651struct io_async_connect {
652 struct sockaddr_storage address;
653};
654
03b1230c
JA
655struct io_async_msghdr {
656 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
657 /* points to an allocated iov, if NULL we use fast_iov instead */
658 struct iovec *free_iov;
03b1230c
JA
659 struct sockaddr __user *uaddr;
660 struct msghdr msg;
b537916c 661 struct sockaddr_storage addr;
03b1230c
JA
662};
663
f67676d1
JA
664struct io_async_rw {
665 struct iovec fast_iov[UIO_FASTIOV];
ff6165b2
JA
666 const struct iovec *free_iovec;
667 struct iov_iter iter;
227c0c96 668 size_t bytes_done;
bcf5a063 669 struct wait_page_queue wpq;
f67676d1
JA
670};
671
6b47ee6e
PB
672enum {
673 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
674 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
675 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
676 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
677 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 678 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
6b47ee6e 679
6b47ee6e
PB
680 REQ_F_FAIL_LINK_BIT,
681 REQ_F_INFLIGHT_BIT,
682 REQ_F_CUR_POS_BIT,
683 REQ_F_NOWAIT_BIT,
6b47ee6e 684 REQ_F_LINK_TIMEOUT_BIT,
6b47ee6e 685 REQ_F_ISREG_BIT,
99bc4c38 686 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 687 REQ_F_POLLED_BIT,
bcda7baa 688 REQ_F_BUFFER_SELECTED_BIT,
5b0bbee4 689 REQ_F_NO_FILE_TABLE_BIT,
7cdaf587 690 REQ_F_WORK_INITIALIZED_BIT,
900fad45 691 REQ_F_LTIMEOUT_ACTIVE_BIT,
e342c807 692 REQ_F_COMPLETE_INLINE_BIT,
84557871
JA
693
694 /* not a real bit, just to check we're not overflowing the space */
695 __REQ_F_LAST_BIT,
6b47ee6e
PB
696};
697
698enum {
699 /* ctx owns file */
700 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
701 /* drain existing IO first */
702 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
703 /* linked sqes */
704 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
705 /* doesn't sever on completion < 0 */
706 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
707 /* IOSQE_ASYNC */
708 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
709 /* IOSQE_BUFFER_SELECT */
710 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
6b47ee6e 711
6b47ee6e
PB
712 /* fail rest of links */
713 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
714 /* on inflight list */
715 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
716 /* read/write uses file position */
717 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
718 /* must not punt to workers */
719 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 720 /* has or had linked timeout */
6b47ee6e 721 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
6b47ee6e
PB
722 /* regular file */
723 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
99bc4c38
PB
724 /* needs cleanup */
725 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
726 /* already went through poll handler */
727 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
728 /* buffer already selected */
729 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
5b0bbee4
JA
730 /* doesn't need file table for this request */
731 REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
7cdaf587
XW
732 /* io_wq_work is initialized */
733 REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
900fad45
PB
734 /* linked timeout is active, i.e. prepared by link's head */
735 REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
e342c807
PB
736 /* completion is deferred through io_comp_state */
737 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
738};
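
/*
 * Illustrative sketch (an assumption, not code from this file): because the
 * first REQ_F_* bits above are defined straight from the IOSQE_* bits, the
 * per-SQE flags can be validated against SQE_VALID_FLAGS and then copied
 * into req->flags without any translation table.
 */
#if 0
static int example_copy_sqe_flags(struct io_kiocb *req, unsigned int sqe_flags)
{
	if (sqe_flags & ~SQE_VALID_FLAGS)
		return -EINVAL;		/* unknown or unsupported IOSQE_* bit */
	req->flags |= sqe_flags;	/* IOSQE_* bits land on the matching REQ_F_* bits */
	return 0;
}
#endif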
739
740struct async_poll {
741 struct io_poll_iocb poll;
807abcb0 742 struct io_poll_iocb *double_poll;
6b47ee6e
PB
743};
744
7cbf1722
JA
745struct io_task_work {
746 struct io_wq_work_node node;
747 task_work_func_t func;
748};
749
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_remove	poll_remove;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		/* use only after cleaning per-op data, see io_clean_op() */
		struct io_completion	compl;
	};

	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;
	u32				result;

	struct io_ring_ctx		*ctx;
	unsigned int			flags;
	refcount_t			refs;
	struct task_struct		*task;
	u64				user_data;

	struct io_kiocb			*link;
	struct percpu_ref		*fixed_rsrc_refs;

	/*
	 * 1. used with ctx->iopoll_list with reads/writes
	 * 2. to track reqs with ->files (see io_op_def::file_table)
	 */
	struct list_head		inflight_entry;
	union {
		struct io_task_work	io_task_work;
		struct callback_head	task_work;
	};
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	struct async_poll		*apoll;
	struct io_wq_work		work;
};
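
/*
 * Illustrative sketch (an assumption, not code from this file): per the
 * union layout note above, the file pointer can be read through any of the
 * per-opcode views since each starts with a struct file pointer.
 */
#if 0
static struct file *example_req_file(struct io_kiocb *req)
{
	/* the same memory as req->rw.kiocb.ki_filp or req->poll.file */
	return req->file;
}
#endif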
05589553 817
27dc8338
PB
818struct io_defer_entry {
819 struct list_head list;
820 struct io_kiocb *req;
9cf7c104 821 u32 seq;
2b188cc1
JA
822};
823
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* must always have async data allocated */
	unsigned		needs_async_data : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
	unsigned		work_flags;
};
846
0918682b 847static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
848 [IORING_OP_NOP] = {},
849 [IORING_OP_READV] = {
d3656344
JA
850 .needs_file = 1,
851 .unbound_nonreg_file = 1,
8a72758c 852 .pollin = 1,
4d954c25 853 .buffer_select = 1,
e8c2bc1f 854 .needs_async_data = 1,
27926b68 855 .plug = 1,
e8c2bc1f 856 .async_size = sizeof(struct io_async_rw),
0f203765 857 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
d3656344 858 },
0463b6c5 859 [IORING_OP_WRITEV] = {
d3656344
JA
860 .needs_file = 1,
861 .hash_reg_file = 1,
862 .unbound_nonreg_file = 1,
8a72758c 863 .pollout = 1,
e8c2bc1f 864 .needs_async_data = 1,
27926b68 865 .plug = 1,
e8c2bc1f 866 .async_size = sizeof(struct io_async_rw),
69228338
JA
867 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
868 IO_WQ_WORK_FSIZE,
d3656344 869 },
0463b6c5 870 [IORING_OP_FSYNC] = {
d3656344 871 .needs_file = 1,
0f203765 872 .work_flags = IO_WQ_WORK_BLKCG,
d3656344 873 },
0463b6c5 874 [IORING_OP_READ_FIXED] = {
d3656344
JA
875 .needs_file = 1,
876 .unbound_nonreg_file = 1,
8a72758c 877 .pollin = 1,
27926b68 878 .plug = 1,
e8c2bc1f 879 .async_size = sizeof(struct io_async_rw),
4017eb91 880 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
d3656344 881 },
0463b6c5 882 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
883 .needs_file = 1,
884 .hash_reg_file = 1,
885 .unbound_nonreg_file = 1,
8a72758c 886 .pollout = 1,
27926b68 887 .plug = 1,
e8c2bc1f 888 .async_size = sizeof(struct io_async_rw),
4017eb91
JA
889 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
890 IO_WQ_WORK_MM,
d3656344 891 },
0463b6c5 892 [IORING_OP_POLL_ADD] = {
d3656344
JA
893 .needs_file = 1,
894 .unbound_nonreg_file = 1,
895 },
0463b6c5
PB
896 [IORING_OP_POLL_REMOVE] = {},
897 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344 898 .needs_file = 1,
0f203765 899 .work_flags = IO_WQ_WORK_BLKCG,
d3656344 900 },
0463b6c5 901 [IORING_OP_SENDMSG] = {
d3656344
JA
902 .needs_file = 1,
903 .unbound_nonreg_file = 1,
8a72758c 904 .pollout = 1,
e8c2bc1f
JA
905 .needs_async_data = 1,
906 .async_size = sizeof(struct io_async_msghdr),
92c75f75
JA
907 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
908 IO_WQ_WORK_FS,
d3656344 909 },
0463b6c5 910 [IORING_OP_RECVMSG] = {
d3656344
JA
911 .needs_file = 1,
912 .unbound_nonreg_file = 1,
8a72758c 913 .pollin = 1,
52de1fe1 914 .buffer_select = 1,
e8c2bc1f
JA
915 .needs_async_data = 1,
916 .async_size = sizeof(struct io_async_msghdr),
92c75f75
JA
917 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
918 IO_WQ_WORK_FS,
d3656344 919 },
0463b6c5 920 [IORING_OP_TIMEOUT] = {
e8c2bc1f
JA
921 .needs_async_data = 1,
922 .async_size = sizeof(struct io_timeout_data),
0f203765 923 .work_flags = IO_WQ_WORK_MM,
d3656344 924 },
9c8e11b3
PB
925 [IORING_OP_TIMEOUT_REMOVE] = {
926 /* used by timeout updates' prep() */
927 .work_flags = IO_WQ_WORK_MM,
928 },
0463b6c5 929 [IORING_OP_ACCEPT] = {
d3656344
JA
930 .needs_file = 1,
931 .unbound_nonreg_file = 1,
8a72758c 932 .pollin = 1,
0f203765 933 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
d3656344 934 },
0463b6c5
PB
935 [IORING_OP_ASYNC_CANCEL] = {},
936 [IORING_OP_LINK_TIMEOUT] = {
e8c2bc1f
JA
937 .needs_async_data = 1,
938 .async_size = sizeof(struct io_timeout_data),
0f203765 939 .work_flags = IO_WQ_WORK_MM,
d3656344 940 },
0463b6c5 941 [IORING_OP_CONNECT] = {
d3656344
JA
942 .needs_file = 1,
943 .unbound_nonreg_file = 1,
8a72758c 944 .pollout = 1,
e8c2bc1f
JA
945 .needs_async_data = 1,
946 .async_size = sizeof(struct io_async_connect),
0f203765 947 .work_flags = IO_WQ_WORK_MM,
d3656344 948 },
0463b6c5 949 [IORING_OP_FALLOCATE] = {
d3656344 950 .needs_file = 1,
69228338 951 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
d3656344 952 },
0463b6c5 953 [IORING_OP_OPENAT] = {
0f203765 954 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
14587a46 955 IO_WQ_WORK_FS | IO_WQ_WORK_MM,
d3656344 956 },
0463b6c5 957 [IORING_OP_CLOSE] = {
0f203765 958 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
d3656344 959 },
0463b6c5 960 [IORING_OP_FILES_UPDATE] = {
0f203765 961 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
d3656344 962 },
0463b6c5 963 [IORING_OP_STATX] = {
0f203765
JA
964 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
965 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
d3656344 966 },
0463b6c5 967 [IORING_OP_READ] = {
3a6820f2
JA
968 .needs_file = 1,
969 .unbound_nonreg_file = 1,
8a72758c 970 .pollin = 1,
bcda7baa 971 .buffer_select = 1,
27926b68 972 .plug = 1,
e8c2bc1f 973 .async_size = sizeof(struct io_async_rw),
0f203765 974 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
3a6820f2 975 },
0463b6c5 976 [IORING_OP_WRITE] = {
3a6820f2
JA
977 .needs_file = 1,
978 .unbound_nonreg_file = 1,
8a72758c 979 .pollout = 1,
27926b68 980 .plug = 1,
e8c2bc1f 981 .async_size = sizeof(struct io_async_rw),
69228338
JA
982 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
983 IO_WQ_WORK_FSIZE,
3a6820f2 984 },
0463b6c5 985 [IORING_OP_FADVISE] = {
4840e418 986 .needs_file = 1,
0f203765 987 .work_flags = IO_WQ_WORK_BLKCG,
4840e418 988 },
0463b6c5 989 [IORING_OP_MADVISE] = {
0f203765 990 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
c1ca757b 991 },
0463b6c5 992 [IORING_OP_SEND] = {
fddaface
JA
993 .needs_file = 1,
994 .unbound_nonreg_file = 1,
8a72758c 995 .pollout = 1,
0f203765 996 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
fddaface 997 },
0463b6c5 998 [IORING_OP_RECV] = {
fddaface
JA
999 .needs_file = 1,
1000 .unbound_nonreg_file = 1,
8a72758c 1001 .pollin = 1,
bcda7baa 1002 .buffer_select = 1,
0f203765 1003 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
fddaface 1004 },
0463b6c5 1005 [IORING_OP_OPENAT2] = {
0f203765 1006 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
14587a46 1007 IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
cebdb986 1008 },
3e4827b0
JA
1009 [IORING_OP_EPOLL_CTL] = {
1010 .unbound_nonreg_file = 1,
0f203765 1011 .work_flags = IO_WQ_WORK_FILES,
3e4827b0 1012 },
7d67af2c
PB
1013 [IORING_OP_SPLICE] = {
1014 .needs_file = 1,
1015 .hash_reg_file = 1,
1016 .unbound_nonreg_file = 1,
0f203765 1017 .work_flags = IO_WQ_WORK_BLKCG,
ddf0322d
JA
1018 },
1019 [IORING_OP_PROVIDE_BUFFERS] = {},
067524e9 1020 [IORING_OP_REMOVE_BUFFERS] = {},
f2a8d5c7
PB
1021 [IORING_OP_TEE] = {
1022 .needs_file = 1,
1023 .hash_reg_file = 1,
1024 .unbound_nonreg_file = 1,
1025 },
36f4fa68
JA
1026 [IORING_OP_SHUTDOWN] = {
1027 .needs_file = 1,
1028 },
80a261fd
JA
1029 [IORING_OP_RENAMEAT] = {
1030 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
1031 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
1032 },
14a1143b
JA
1033 [IORING_OP_UNLINKAT] = {
1034 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
1035 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
1036 },
d3656344
JA
1037};
1038
9936c7c2
PB
1039static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1040 struct task_struct *task,
1041 struct files_struct *files);
269bbe5f 1042static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
bc9744cd 1043static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
1ffc5422 1044 struct io_ring_ctx *ctx);
f2303b1f 1045static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
1ffc5422 1046
23faba36 1047static bool io_rw_reissue(struct io_kiocb *req);
78e19bbe 1048static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 1049static void io_put_req(struct io_kiocb *req);
216578e5 1050static void io_put_req_deferred(struct io_kiocb *req, int nr);
c40f6379 1051static void io_double_put_req(struct io_kiocb *req);
c7dae4ba
JA
1052static void io_dismantle_req(struct io_kiocb *req);
1053static void io_put_task(struct task_struct *task, int nr);
1054static void io_queue_next(struct io_kiocb *req);
94ae5e77 1055static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
7271ef3a 1056static void __io_queue_linked_timeout(struct io_kiocb *req);
94ae5e77 1057static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c 1058static int __io_sqe_files_update(struct io_ring_ctx *ctx,
269bbe5f 1059 struct io_uring_rsrc_update *ip,
05f3fb3c 1060 unsigned nr_args);
3ca405eb 1061static void __io_clean_op(struct io_kiocb *req);
8371adf5
PB
1062static struct file *io_file_get(struct io_submit_state *state,
1063 struct io_kiocb *req, int fd, bool fixed);
c5eef2b9 1064static void __io_queue_sqe(struct io_kiocb *req);
269bbe5f 1065static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1066
847595de
PB
1067static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
1068 struct iov_iter *iter, bool needs_lock);
ff6165b2
JA
1069static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
1070 const struct iovec *fast_iov,
227c0c96 1071 struct iov_iter *iter, bool force);
907d1df3 1072static void io_req_task_queue(struct io_kiocb *req);
65453d1e
JA
1073static void io_submit_flush_completions(struct io_comp_state *cs,
1074 struct io_ring_ctx *ctx);
de0617e4 1075
2b188cc1
JA
1076static struct kmem_cache *req_cachep;
1077
0918682b 1078static const struct file_operations io_uring_fops;
2b188cc1
JA
1079
1080struct sock *io_uring_get_socket(struct file *file)
1081{
1082#if defined(CONFIG_UNIX)
1083 if (file->f_op == &io_uring_fops) {
1084 struct io_ring_ctx *ctx = file->private_data;
1085
1086 return ctx->ring_sock->sk;
1087 }
1088#endif
1089 return NULL;
1090}
1091EXPORT_SYMBOL(io_uring_get_socket);
1092
#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

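/*
 * Illustrative sketch (an assumption, not code from this file): typical
 * use of io_for_each_link() to walk a request chain from its head.
 */
#if 0
static unsigned int example_link_length(struct io_kiocb *head)
{
	struct io_kiocb *pos;
	unsigned int nr = 0;

	io_for_each_link(pos, head)
		nr++;
	return nr;
}
#endif
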
3ca405eb
PB
1096static inline void io_clean_op(struct io_kiocb *req)
1097{
9d5c8190 1098 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
3ca405eb
PB
1099 __io_clean_op(req);
1100}
1101
36f72fe2
PB
1102static inline void io_set_resource_node(struct io_kiocb *req)
1103{
1104 struct io_ring_ctx *ctx = req->ctx;
1105
269bbe5f
BM
1106 if (!req->fixed_rsrc_refs) {
1107 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1108 percpu_ref_get(req->fixed_rsrc_refs);
36f72fe2
PB
1109 }
1110}
1111
88f171ab
PB
1112static bool io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1113{
1114 if (!percpu_ref_tryget(ref)) {
1115 /* already at zero, wait for ->release() */
1116 if (!try_wait_for_completion(compl))
1117 synchronize_rcu();
1118 return false;
1119 }
1120
1121 percpu_ref_resurrect(ref);
1122 reinit_completion(compl);
1123 percpu_ref_put(ref);
1124 return true;
1125}
1126
08d23634
PB
1127static bool io_match_task(struct io_kiocb *head,
1128 struct task_struct *task,
1129 struct files_struct *files)
1130{
1131 struct io_kiocb *req;
1132
84965ff8
JA
1133 if (task && head->task != task) {
1134 /* in terms of cancelation, always match if req task is dead */
1135 if (head->task->flags & PF_EXITING)
1136 return true;
08d23634 1137 return false;
84965ff8 1138 }
08d23634
PB
1139 if (!files)
1140 return true;
1141
1142 io_for_each_link(req, head) {
02a13674
JA
1143 if (!(req->flags & REQ_F_WORK_INITIALIZED))
1144 continue;
1145 if (req->file && req->file->f_op == &io_uring_fops)
1146 return true;
1147 if ((req->work.flags & IO_WQ_WORK_FILES) &&
08d23634
PB
1148 req->work.identity->files == files)
1149 return true;
1150 }
1151 return false;
1152}
1153
28cea78a 1154static void io_sq_thread_drop_mm_files(void)
c40f6379 1155{
28cea78a 1156 struct files_struct *files = current->files;
c40f6379
JA
1157 struct mm_struct *mm = current->mm;
1158
1159 if (mm) {
1160 kthread_unuse_mm(mm);
1161 mmput(mm);
4b70cf9d 1162 current->mm = NULL;
c40f6379 1163 }
28cea78a
JA
1164 if (files) {
1165 struct nsproxy *nsproxy = current->nsproxy;
1166
1167 task_lock(current);
1168 current->files = NULL;
1169 current->nsproxy = NULL;
1170 task_unlock(current);
1171 put_files_struct(files);
1172 put_nsproxy(nsproxy);
1173 }
1174}
1175
1a38ffc9 1176static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
28cea78a
JA
1177{
1178 if (!current->files) {
1179 struct files_struct *files;
1180 struct nsproxy *nsproxy;
1181
1182 task_lock(ctx->sqo_task);
1183 files = ctx->sqo_task->files;
1184 if (!files) {
1185 task_unlock(ctx->sqo_task);
1a38ffc9 1186 return -EOWNERDEAD;
28cea78a
JA
1187 }
1188 atomic_inc(&files->count);
1189 get_nsproxy(ctx->sqo_task->nsproxy);
1190 nsproxy = ctx->sqo_task->nsproxy;
1191 task_unlock(ctx->sqo_task);
1192
1193 task_lock(current);
1194 current->files = files;
1195 current->nsproxy = nsproxy;
1196 task_unlock(current);
1197 }
1a38ffc9 1198 return 0;
c40f6379
JA
1199}
1200
1201static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
1202{
4b70cf9d
JA
1203 struct mm_struct *mm;
1204
1205 if (current->mm)
1206 return 0;
1207
4b70cf9d
JA
1208 task_lock(ctx->sqo_task);
1209 mm = ctx->sqo_task->mm;
1210 if (unlikely(!mm || !mmget_not_zero(mm)))
1211 mm = NULL;
1212 task_unlock(ctx->sqo_task);
1213
1214 if (mm) {
1215 kthread_use_mm(mm);
1216 return 0;
c40f6379
JA
1217 }
1218
4b70cf9d 1219 return -EFAULT;
c40f6379
JA
1220}
1221
4e326358
PB
1222static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
1223 struct io_kiocb *req)
c40f6379 1224{
28cea78a 1225 const struct io_op_def *def = &io_op_defs[req->opcode];
1a38ffc9 1226 int ret;
28cea78a
JA
1227
1228 if (def->work_flags & IO_WQ_WORK_MM) {
1a38ffc9 1229 ret = __io_sq_thread_acquire_mm(ctx);
28cea78a
JA
1230 if (unlikely(ret))
1231 return ret;
1232 }
1233
1a38ffc9
PB
1234 if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
1235 ret = __io_sq_thread_acquire_files(ctx);
1236 if (unlikely(ret))
1237 return ret;
1238 }
28cea78a
JA
1239
1240 return 0;
c40f6379
JA
1241}
1242
4e326358
PB
1243static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
1244 struct io_kiocb *req)
1245{
4e326358
PB
1246 if (!(ctx->flags & IORING_SETUP_SQPOLL))
1247 return 0;
1248 return __io_sq_thread_acquire_mm_files(ctx, req);
1249}
1250
91d8f519
DZ
1251static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
1252 struct cgroup_subsys_state **cur_css)
1253
1254{
1255#ifdef CONFIG_BLK_CGROUP
1256 /* puts the old one when swapping */
1257 if (*cur_css != ctx->sqo_blkcg_css) {
1258 kthread_associate_blkcg(ctx->sqo_blkcg_css);
1259 *cur_css = ctx->sqo_blkcg_css;
1260 }
1261#endif
1262}
1263
1264static void io_sq_thread_unassociate_blkcg(void)
1265{
1266#ifdef CONFIG_BLK_CGROUP
1267 kthread_associate_blkcg(NULL);
1268#endif
1269}
1270
c40f6379
JA
1271static inline void req_set_fail_links(struct io_kiocb *req)
1272{
1273 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1274 req->flags |= REQ_F_FAIL_LINK;
1275}
4a38aed2 1276
1e6fa521
JA
1277/*
1278 * None of these are dereferenced, they are simply used to check if any of
1279 * them have changed. If we're under current and check they are still the
1280 * same, we're fine to grab references to them for actual out-of-line use.
1281 */
1282static void io_init_identity(struct io_identity *id)
1283{
1284 id->files = current->files;
1285 id->mm = current->mm;
1286#ifdef CONFIG_BLK_CGROUP
1287 rcu_read_lock();
1288 id->blkcg_css = blkcg_css();
1289 rcu_read_unlock();
1290#endif
1291 id->creds = current_cred();
1292 id->nsproxy = current->nsproxy;
1293 id->fs = current->fs;
1294 id->fsize = rlimit(RLIMIT_FSIZE);
4ea33a97
JA
1295#ifdef CONFIG_AUDIT
1296 id->loginuid = current->loginuid;
1297 id->sessionid = current->sessionid;
1298#endif
1e6fa521
JA
1299 refcount_set(&id->count, 1);
1300}
1301
ec99ca6c
PB
1302static inline void __io_req_init_async(struct io_kiocb *req)
1303{
1304 memset(&req->work, 0, sizeof(req->work));
1305 req->flags |= REQ_F_WORK_INITIALIZED;
1306}
1307
/*
 * Note: must call io_req_init_async() before the first time you
 * touch any members of io_wq_work.
 */
1312static inline void io_req_init_async(struct io_kiocb *req)
1313{
500a373d
JA
1314 struct io_uring_task *tctx = current->io_uring;
1315
7cdaf587
XW
1316 if (req->flags & REQ_F_WORK_INITIALIZED)
1317 return;
1318
ec99ca6c 1319 __io_req_init_async(req);
500a373d
JA
1320
1321 /* Grab a ref if this isn't our static identity */
1322 req->work.identity = tctx->identity;
1323 if (tctx->identity != &tctx->__identity)
1324 refcount_inc(&req->work.identity->count);
7cdaf587
XW
1325}
1326
2b188cc1
JA
1327static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1328{
1329 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1330
0f158b4c 1331 complete(&ctx->ref_comp);
2b188cc1
JA
1332}
1333
8eb7e2d0
PB
1334static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1335{
1336 return !req->timeout.off;
1337}
1338
2b188cc1
JA
1339static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1340{
1341 struct io_ring_ctx *ctx;
78076bb6 1342 int hash_bits;
2b188cc1
JA
1343
1344 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1345 if (!ctx)
1346 return NULL;
1347
	/*
	 * Use 5 bits less than the max cq entries; that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
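	/*
	 * (Worked example, added for illustration: cq_entries == 4096 gives
	 *  ilog2() == 12, so hash_bits == 7, i.e. 128 hash lists holding
	 *  roughly 32 entries each when completely full.)
	 */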
1352 hash_bits = ilog2(p->cq_entries);
1353 hash_bits -= 5;
1354 if (hash_bits <= 0)
1355 hash_bits = 1;
1356 ctx->cancel_hash_bits = hash_bits;
1357 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1358 GFP_KERNEL);
1359 if (!ctx->cancel_hash)
1360 goto err;
1361 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1362
21482896 1363 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1364 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1365 goto err;
2b188cc1
JA
1366
1367 ctx->flags = p->flags;
90554200 1368 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1369 INIT_LIST_HEAD(&ctx->sqd_list);
2b188cc1 1370 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 1371 INIT_LIST_HEAD(&ctx->cq_overflow_list);
0f158b4c
JA
1372 init_completion(&ctx->ref_comp);
1373 init_completion(&ctx->sq_thread_comp);
5a2e745d 1374 idr_init(&ctx->io_buffer_idr);
071698e1 1375 idr_init(&ctx->personality_idr);
2b188cc1
JA
1376 mutex_init(&ctx->uring_lock);
1377 init_waitqueue_head(&ctx->wait);
1378 spin_lock_init(&ctx->completion_lock);
540e32a0 1379 INIT_LIST_HEAD(&ctx->iopoll_list);
de0617e4 1380 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1381 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
1382 spin_lock_init(&ctx->inflight_lock);
1383 INIT_LIST_HEAD(&ctx->inflight_list);
d67d2263
BM
1384 spin_lock_init(&ctx->rsrc_ref_lock);
1385 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1386 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1387 init_llist_head(&ctx->rsrc_put_llist);
1b4c351f 1388 INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
c7dae4ba 1389 INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
2b188cc1 1390 return ctx;
206aefde 1391err:
78076bb6 1392 kfree(ctx->cancel_hash);
206aefde
JA
1393 kfree(ctx);
1394 return NULL;
2b188cc1
JA
1395}
1396
9cf7c104 1397static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1398{
2bc9930e
JA
1399 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1400 struct io_ring_ctx *ctx = req->ctx;
a197f664 1401
9cf7c104 1402 return seq != ctx->cached_cq_tail
2c3bac6d 1403 + READ_ONCE(ctx->cached_cq_overflow);
2bc9930e 1404 }
de0617e4 1405
9d858b21 1406 return false;
de0617e4
JA
1407}
1408
5c3462cf 1409static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
1e6fa521 1410{
500a373d 1411 if (req->work.identity == &tctx->__identity)
1e6fa521
JA
1412 return;
1413 if (refcount_dec_and_test(&req->work.identity->count))
1414 kfree(req->work.identity);
1415}
1416
4edf20f9 1417static void io_req_clean_work(struct io_kiocb *req)
18d9be1a 1418{
7cdaf587 1419 if (!(req->flags & REQ_F_WORK_INITIALIZED))
4edf20f9 1420 return;
51a4cc11 1421
e86d0047 1422 if (req->work.flags & IO_WQ_WORK_MM)
98447d65 1423 mmdrop(req->work.identity->mm);
91d8f519 1424#ifdef CONFIG_BLK_CGROUP
e86d0047 1425 if (req->work.flags & IO_WQ_WORK_BLKCG)
98447d65 1426 css_put(req->work.identity->blkcg_css);
91d8f519 1427#endif
e86d0047 1428 if (req->work.flags & IO_WQ_WORK_CREDS)
98447d65 1429 put_cred(req->work.identity->creds);
dfead8a8 1430 if (req->work.flags & IO_WQ_WORK_FS) {
98447d65 1431 struct fs_struct *fs = req->work.identity->fs;
51a4cc11 1432
98447d65 1433 spin_lock(&req->work.identity->fs->lock);
ff002b30
JA
1434 if (--fs->users)
1435 fs = NULL;
98447d65 1436 spin_unlock(&req->work.identity->fs->lock);
ff002b30
JA
1437 if (fs)
1438 free_fs_struct(fs);
1439 }
34e08fed
PB
1440 if (req->work.flags & IO_WQ_WORK_FILES) {
1441 put_files_struct(req->work.identity->files);
1442 put_nsproxy(req->work.identity->nsproxy);
34e08fed
PB
1443 }
1444 if (req->flags & REQ_F_INFLIGHT) {
1445 struct io_ring_ctx *ctx = req->ctx;
1446 struct io_uring_task *tctx = req->task->io_uring;
1447 unsigned long flags;
1448
1449 spin_lock_irqsave(&ctx->inflight_lock, flags);
1450 list_del(&req->inflight_entry);
1451 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1452 req->flags &= ~REQ_F_INFLIGHT;
1453 if (atomic_read(&tctx->in_idle))
1454 wake_up(&tctx->wait);
1455 }
51a4cc11 1456
e86d0047
PB
1457 req->flags &= ~REQ_F_WORK_INITIALIZED;
1458 req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
1459 IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
5c3462cf 1460 io_put_identity(req->task->io_uring, req);
561fb04a
JA
1461}
1462
1e6fa521
JA
1463/*
1464 * Create a private copy of io_identity, since some fields don't match
1465 * the current context.
1466 */
1467static bool io_identity_cow(struct io_kiocb *req)
1468{
5c3462cf 1469 struct io_uring_task *tctx = current->io_uring;
1e6fa521
JA
1470 const struct cred *creds = NULL;
1471 struct io_identity *id;
1472
1473 if (req->work.flags & IO_WQ_WORK_CREDS)
1474 creds = req->work.identity->creds;
1475
1476 id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
1477 if (unlikely(!id)) {
1478 req->work.flags |= IO_WQ_WORK_CANCEL;
1479 return false;
1480 }
1481
1482 /*
1483 * We can safely just re-init the creds we copied Either the field
1484 * matches the current one, or we haven't grabbed it yet. The only
1485 * exception is ->creds, through registered personalities, so handle
1486 * that one separately.
1487 */
1488 io_init_identity(id);
1489 if (creds)
e8c954df 1490 id->creds = creds;
1e6fa521
JA
1491
1492 /* add one for this request */
1493 refcount_inc(&id->count);
1494
cb8a8ae3
JA
1495 /* drop tctx and req identity references, if needed */
1496 if (tctx->identity != &tctx->__identity &&
1497 refcount_dec_and_test(&tctx->identity->count))
1498 kfree(tctx->identity);
1499 if (req->work.identity != &tctx->__identity &&
1500 refcount_dec_and_test(&req->work.identity->count))
1e6fa521
JA
1501 kfree(req->work.identity);
1502
1503 req->work.identity = id;
500a373d 1504 tctx->identity = id;
1e6fa521
JA
1505 return true;
1506}
1507
ce3d5aae
PB
1508static void io_req_track_inflight(struct io_kiocb *req)
1509{
1510 struct io_ring_ctx *ctx = req->ctx;
1511
1512 if (!(req->flags & REQ_F_INFLIGHT)) {
1513 io_req_init_async(req);
1514 req->flags |= REQ_F_INFLIGHT;
1515
1516 spin_lock_irq(&ctx->inflight_lock);
1517 list_add(&req->inflight_entry, &ctx->inflight_list);
1518 spin_unlock_irq(&ctx->inflight_lock);
1519 }
1520}
1521
1e6fa521 1522static bool io_grab_identity(struct io_kiocb *req)
18d9be1a 1523{
d3656344 1524 const struct io_op_def *def = &io_op_defs[req->opcode];
5c3462cf 1525 struct io_identity *id = req->work.identity;
54a91f3b 1526
69228338
JA
1527 if (def->work_flags & IO_WQ_WORK_FSIZE) {
1528 if (id->fsize != rlimit(RLIMIT_FSIZE))
1529 return false;
1530 req->work.flags |= IO_WQ_WORK_FSIZE;
1531 }
91d8f519 1532#ifdef CONFIG_BLK_CGROUP
dfead8a8
JA
1533 if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
1534 (def->work_flags & IO_WQ_WORK_BLKCG)) {
91d8f519 1535 rcu_read_lock();
1e6fa521
JA
1536 if (id->blkcg_css != blkcg_css()) {
1537 rcu_read_unlock();
1538 return false;
1539 }
91d8f519
DZ
1540 /*
1541 * This should be rare, either the cgroup is dying or the task
1542 * is moving cgroups. Just punt to root for the handful of ios.
1543 */
1e6fa521 1544 if (css_tryget_online(id->blkcg_css))
dfead8a8 1545 req->work.flags |= IO_WQ_WORK_BLKCG;
91d8f519
DZ
1546 rcu_read_unlock();
1547 }
1548#endif
dfead8a8 1549 if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
1e6fa521
JA
1550 if (id->creds != current_cred())
1551 return false;
1552 get_cred(id->creds);
dfead8a8
JA
1553 req->work.flags |= IO_WQ_WORK_CREDS;
1554 }
4ea33a97
JA
1555#ifdef CONFIG_AUDIT
1556 if (!uid_eq(current->loginuid, id->loginuid) ||
1557 current->sessionid != id->sessionid)
1558 return false;
1559#endif
dfead8a8
JA
1560 if (!(req->work.flags & IO_WQ_WORK_FS) &&
1561 (def->work_flags & IO_WQ_WORK_FS)) {
1e6fa521
JA
1562 if (current->fs != id->fs)
1563 return false;
1564 spin_lock(&id->fs->lock);
1565 if (!id->fs->in_exec) {
1566 id->fs->users++;
dfead8a8 1567 req->work.flags |= IO_WQ_WORK_FS;
dca9cf8b
PB
1568 } else {
1569 req->work.flags |= IO_WQ_WORK_CANCEL;
1570 }
1571 spin_unlock(&current->fs->lock);
1572 }
af604703
PB
1573 if (!(req->work.flags & IO_WQ_WORK_FILES) &&
1574 (def->work_flags & IO_WQ_WORK_FILES) &&
1575 !(req->flags & REQ_F_NO_FILE_TABLE)) {
1576 if (id->files != current->files ||
1577 id->nsproxy != current->nsproxy)
1578 return false;
1579 atomic_inc(&id->files->count);
1580 get_nsproxy(id->nsproxy);
af604703 1581 req->work.flags |= IO_WQ_WORK_FILES;
ce3d5aae 1582 io_req_track_inflight(req);
af604703 1583 }
77788775
JA
1584 if (!(req->work.flags & IO_WQ_WORK_MM) &&
1585 (def->work_flags & IO_WQ_WORK_MM)) {
1586 if (id->mm != current->mm)
1587 return false;
1588 mmgrab(id->mm);
1589 req->work.flags |= IO_WQ_WORK_MM;
1590 }
1e6fa521
JA
1591
1592 return true;
1593}
1594
1595static void io_prep_async_work(struct io_kiocb *req)
1596{
1597 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1598 struct io_ring_ctx *ctx = req->ctx;
1599
1600 io_req_init_async(req);
1601
feaadc4f
PB
1602 if (req->flags & REQ_F_FORCE_ASYNC)
1603 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1604
1e6fa521
JA
1605 if (req->flags & REQ_F_ISREG) {
1606 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1607 io_wq_hash_work(&req->work, file_inode(req->file));
1608 } else {
1609 if (def->unbound_nonreg_file)
1610 req->work.flags |= IO_WQ_WORK_UNBOUND;
1611 }
1612
1e6fa521
JA
1613 /* if we fail grabbing identity, we must COW, regrab, and retry */
1614 if (io_grab_identity(req))
1615 return;
1616
1617 if (!io_identity_cow(req))
1618 return;
1619
1620 /* can't fail at this point */
1621 if (!io_grab_identity(req))
1622 WARN_ON(1);
561fb04a 1623}
cccf0ee8 1624
cbdcb435 1625static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1626{
cbdcb435 1627 struct io_kiocb *cur;
54a91f3b 1628
f2f87370
PB
1629 io_for_each_link(cur, req)
1630 io_prep_async_work(cur);
561fb04a
JA
1631}
1632
7271ef3a 1633static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
561fb04a 1634{
a197f664 1635 struct io_ring_ctx *ctx = req->ctx;
cbdcb435 1636 struct io_kiocb *link = io_prep_linked_timeout(req);
561fb04a 1637
8766dd51
PB
1638 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1639 &req->work, req->flags);
1640 io_wq_enqueue(ctx->io_wq, &req->work);
7271ef3a 1641 return link;
18d9be1a
JA
1642}
1643
cbdcb435
PB
1644static void io_queue_async_work(struct io_kiocb *req)
1645{
7271ef3a
JA
1646 struct io_kiocb *link;
1647
cbdcb435
PB
1648 /* init ->work of the whole link before punting */
1649 io_prep_async_link(req);
7271ef3a
JA
1650 link = __io_queue_async_work(req);
1651
1652 if (link)
1653 io_queue_linked_timeout(link);
cbdcb435
PB
1654}
1655
5262f567
JA
1656static void io_kill_timeout(struct io_kiocb *req)
1657{
e8c2bc1f 1658 struct io_timeout_data *io = req->async_data;
5262f567
JA
1659 int ret;
1660
e8c2bc1f 1661 ret = hrtimer_try_to_cancel(&io->timer);
5262f567 1662 if (ret != -1) {
01cec8c1
PB
1663 atomic_set(&req->ctx->cq_timeouts,
1664 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1665 list_del_init(&req->timeout.list);
78e19bbe 1666 io_cqring_fill_event(req, 0);
216578e5 1667 io_put_req_deferred(req, 1);
5262f567
JA
1668 }
1669}
1670
76e1b642
JA
1671/*
1672 * Returns true if we found and killed one or more timeouts
1673 */
6b81928d
PB
1674static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1675 struct files_struct *files)
5262f567
JA
1676{
1677 struct io_kiocb *req, *tmp;
76e1b642 1678 int canceled = 0;
5262f567
JA
1679
1680 spin_lock_irq(&ctx->completion_lock);
f3606e3a 1681 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
6b81928d 1682 if (io_match_task(req, tsk, files)) {
f3606e3a 1683 io_kill_timeout(req);
76e1b642
JA
1684 canceled++;
1685 }
f3606e3a 1686 }
5262f567 1687 spin_unlock_irq(&ctx->completion_lock);
76e1b642 1688 return canceled != 0;
5262f567
JA
1689}
1690
04518945 1691static void __io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 1692{
04518945 1693 do {
27dc8338
PB
1694 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1695 struct io_defer_entry, list);
de0617e4 1696
9cf7c104 1697 if (req_need_defer(de->req, de->seq))
04518945 1698 break;
27dc8338 1699 list_del_init(&de->list);
907d1df3 1700 io_req_task_queue(de->req);
27dc8338 1701 kfree(de);
04518945
PB
1702 } while (!list_empty(&ctx->defer_list));
1703}
1704
360428f8 1705static void io_flush_timeouts(struct io_ring_ctx *ctx)
de0617e4 1706{
f010505b
MDG
1707 u32 seq;
1708
1709 if (list_empty(&ctx->timeout_list))
1710 return;
1711
1712 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1713
1714 do {
1715 u32 events_needed, events_got;
360428f8 1716 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
135fcde8 1717 struct io_kiocb, timeout.list);
de0617e4 1718
8eb7e2d0 1719 if (io_is_timeout_noseq(req))
360428f8 1720 break;
f010505b
MDG
1721
1722 /*
1723 * Since seq can easily wrap around over time, subtract
1724 * the last seq at which timeouts were flushed before comparing.
1725 * Assuming not more than 2^31-1 events have happened since,
1726 * these subtractions won't have wrapped, so we can check if
1727 * target is in [last_seq, current_seq] by comparing the two.
1728 */
1729 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1730 events_got = seq - ctx->cq_last_tm_flush;
1731 if (events_got < events_needed)
360428f8 1732 break;
bfe68a22 1733
135fcde8 1734 list_del_init(&req->timeout.list);
5262f567 1735 io_kill_timeout(req);
f010505b
MDG
1736 } while (!list_empty(&ctx->timeout_list));
1737
1738 ctx->cq_last_tm_flush = seq;
360428f8 1739}
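
/*
 * Illustrative sketch (an assumption, not code from this file): the
 * wrap-safe sequence comparison io_flush_timeouts() relies on above.
 * Subtracting the last flush point first keeps the check correct even
 * after the u32 counters wrap, as long as fewer than 2^31-1 events
 * happened in between.
 */
#if 0
static bool example_timeout_due(u32 target_seq, u32 current_seq, u32 last_flush)
{
	u32 events_needed = target_seq - last_flush;
	u32 events_got = current_seq - last_flush;

	return events_got >= events_needed;
}
#endif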
5262f567 1740
360428f8
PB
1741static void io_commit_cqring(struct io_ring_ctx *ctx)
1742{
1743 io_flush_timeouts(ctx);
ec30e04b
PB
1744
1745 /* order cqe stores with ring update */
1746 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
de0617e4 1747
04518945
PB
1748 if (unlikely(!list_empty(&ctx->defer_list)))
1749 __io_queue_deferred(ctx);
de0617e4
JA
1750}
1751
90554200
JA
1752static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1753{
1754 struct io_rings *r = ctx->rings;
1755
1756 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1757}
1758
888aae2e
PB
1759static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1760{
1761 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1762}
1763
2b188cc1
JA
1764static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1765{
75b28aff 1766 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1767 unsigned tail;
1768
115e12e5
SB
1769 /*
1770 * writes to the cq entry need to come after reading head; the
1771 * control dependency is enough as we're using WRITE_ONCE to
1772 * fill the cq entry
1773 */
888aae2e 1774 if (__io_cqring_events(ctx) == rings->cq_ring_entries)
2b188cc1
JA
1775 return NULL;
1776
888aae2e 1777 tail = ctx->cached_cq_tail++;
75b28aff 1778 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1779}
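
io_get_cqring() relies on two properties of a power-of-two ring: free-running unsigned head/tail counters whose difference gives the occupancy, and index-by-mask for the actual slot. Below is a small userspace sketch of that arithmetic under the same assumptions; the ring size is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define ENTRIES 8u			/* must be a power of two */
#define MASK    (ENTRIES - 1u)

int main(void)
{
	uint32_t ring[ENTRIES];
	uint32_t head = 0, tail = 0;

	for (uint32_t i = 0; i < 13; i++) {
		if (tail - head == ENTRIES)	/* full, correct even after wrap */
			head++;			/* consume the oldest entry */
		ring[tail & MASK] = i;		/* the mask picks the slot */
		tail++;
	}
	printf("entries queued: %u, newest: %u\n",
	       tail - head, ring[(tail - 1) & MASK]);
	return 0;
}
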
1780
f2842ab5
JA
1781static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1782{
f0b493e6
JA
1783 if (!ctx->cq_ev_fd)
1784 return false;
7e55a19c
SG
1785 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1786 return false;
f2842ab5
JA
1787 if (!ctx->eventfd_async)
1788 return true;
b41e9852 1789 return io_wq_current_is_worker();
f2842ab5
JA
1790}
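
From userspace, the eventfd path that io_should_trigger_evfd() gates is set up with io_uring_register_eventfd(). The sketch below is a hedged example, assuming liburing is installed (link with -luring), and it skips most error handling.

#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	uint64_t count;
	int efd = eventfd(0, 0);

	if (efd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	io_uring_register_eventfd(&ring, efd);	/* completions now signal efd */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);			/* any request will do for the demo */
	io_uring_submit(&ring);

	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("eventfd signalled %llu time(s)\n", (unsigned long long)count);
	io_uring_queue_exit(&ring);
	return 0;
}
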
1791
b41e9852 1792static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1793{
b1445e59
PB
1794 /* see waitqueue_active() comment */
1795 smp_mb();
1796
1d7bb1d5
JA
1797 if (waitqueue_active(&ctx->wait))
1798 wake_up(&ctx->wait);
534ca6d6
JA
1799 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1800 wake_up(&ctx->sq_data->wait);
b41e9852 1801 if (io_should_trigger_evfd(ctx))
1d7bb1d5 1802 eventfd_signal(ctx->cq_ev_fd, 1);
b1445e59 1803 if (waitqueue_active(&ctx->cq_wait)) {
4aa84f2f
PB
1804 wake_up_interruptible(&ctx->cq_wait);
1805 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1806 }
1d7bb1d5
JA
1807}
1808
80c18e4a
PB
1809static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1810{
b1445e59
PB
1811 /* see waitqueue_active() comment */
1812 smp_mb();
1813
80c18e4a
PB
1814 if (ctx->flags & IORING_SETUP_SQPOLL) {
1815 if (waitqueue_active(&ctx->wait))
1816 wake_up(&ctx->wait);
1817 }
1818 if (io_should_trigger_evfd(ctx))
1819 eventfd_signal(ctx->cq_ev_fd, 1);
b1445e59 1820 if (waitqueue_active(&ctx->cq_wait)) {
4aa84f2f
PB
1821 wake_up_interruptible(&ctx->cq_wait);
1822 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1823 }
80c18e4a
PB
1824}
1825
c4a2ed72 1826/* Returns true if there are no backlogged entries after the flush */
6c503150
PB
1827static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1828 struct task_struct *tsk,
1829 struct files_struct *files)
1d7bb1d5
JA
1830{
1831 struct io_rings *rings = ctx->rings;
e6c8aa9a 1832 struct io_kiocb *req, *tmp;
1d7bb1d5 1833 struct io_uring_cqe *cqe;
1d7bb1d5 1834 unsigned long flags;
b18032bb 1835 bool all_flushed, posted;
1d7bb1d5
JA
1836 LIST_HEAD(list);
1837
e23de15f
PB
1838 if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
1839 return false;
1d7bb1d5 1840
b18032bb 1841 posted = false;
1d7bb1d5 1842 spin_lock_irqsave(&ctx->completion_lock, flags);
e6c8aa9a 1843 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
08d23634 1844 if (!io_match_task(req, tsk, files))
e6c8aa9a
JA
1845 continue;
1846
1d7bb1d5
JA
1847 cqe = io_get_cqring(ctx);
1848 if (!cqe && !force)
1849 break;
1850
40d8ddd4 1851 list_move(&req->compl.list, &list);
1d7bb1d5
JA
1852 if (cqe) {
1853 WRITE_ONCE(cqe->user_data, req->user_data);
1854 WRITE_ONCE(cqe->res, req->result);
0f7e466b 1855 WRITE_ONCE(cqe->flags, req->compl.cflags);
1d7bb1d5 1856 } else {
2c3bac6d 1857 ctx->cached_cq_overflow++;
1d7bb1d5 1858 WRITE_ONCE(ctx->rings->cq_overflow,
2c3bac6d 1859 ctx->cached_cq_overflow);
1d7bb1d5 1860 }
b18032bb 1861 posted = true;
1d7bb1d5
JA
1862 }
1863
09e88404
PB
1864 all_flushed = list_empty(&ctx->cq_overflow_list);
1865 if (all_flushed) {
1866 clear_bit(0, &ctx->sq_check_overflow);
1867 clear_bit(0, &ctx->cq_check_overflow);
1868 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1869 }
46930143 1870
b18032bb
JA
1871 if (posted)
1872 io_commit_cqring(ctx);
1d7bb1d5 1873 spin_unlock_irqrestore(&ctx->completion_lock, flags);
b18032bb
JA
1874 if (posted)
1875 io_cqring_ev_posted(ctx);
1d7bb1d5
JA
1876
1877 while (!list_empty(&list)) {
40d8ddd4
PB
1878 req = list_first_entry(&list, struct io_kiocb, compl.list);
1879 list_del(&req->compl.list);
ec9c02ad 1880 io_put_req(req);
1d7bb1d5 1881 }
c4a2ed72 1882
09e88404 1883 return all_flushed;
1d7bb1d5
JA
1884}
1885
6c503150
PB
1886static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1887 struct task_struct *tsk,
1888 struct files_struct *files)
1889{
1890 if (test_bit(0, &ctx->cq_check_overflow)) {
1891 /* iopoll syncs against uring_lock, not completion_lock */
1892 if (ctx->flags & IORING_SETUP_IOPOLL)
1893 mutex_lock(&ctx->uring_lock);
1894 __io_cqring_overflow_flush(ctx, force, tsk, files);
1895 if (ctx->flags & IORING_SETUP_IOPOLL)
1896 mutex_unlock(&ctx->uring_lock);
1897 }
1898}
1899
bcda7baa 1900static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
2b188cc1 1901{
78e19bbe 1902 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1903 struct io_uring_cqe *cqe;
1904
78e19bbe 1905 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1906
2b188cc1
JA
1907 /*
1908 * If we can't get a cq entry, userspace overflowed the
1909 * submission (by quite a lot). Increment the overflow count in
1910 * the ring.
1911 */
1912 cqe = io_get_cqring(ctx);
1d7bb1d5 1913 if (likely(cqe)) {
78e19bbe 1914 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1915 WRITE_ONCE(cqe->res, res);
bcda7baa 1916 WRITE_ONCE(cqe->flags, cflags);
fdaf083c
JA
1917 } else if (ctx->cq_overflow_flushed ||
1918 atomic_read(&req->task->io_uring->in_idle)) {
0f212204
JA
1919 /*
1920 * If we're in ring overflow flush mode, or in task cancel mode,
1921 * then we cannot store the request for later flushing; we need
1922 * to drop it on the floor.
1923 */
2c3bac6d
PB
1924 ctx->cached_cq_overflow++;
1925 WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1d7bb1d5 1926 } else {
ad3eb2c8
JA
1927 if (list_empty(&ctx->cq_overflow_list)) {
1928 set_bit(0, &ctx->sq_check_overflow);
1929 set_bit(0, &ctx->cq_check_overflow);
6d5f9049 1930 ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
ad3eb2c8 1931 }
40d8ddd4 1932 io_clean_op(req);
1d7bb1d5 1933 req->result = res;
0f7e466b 1934 req->compl.cflags = cflags;
40d8ddd4
PB
1935 refcount_inc(&req->refs);
1936 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
2b188cc1
JA
1937 }
1938}
1939
bcda7baa
JA
1940static void io_cqring_fill_event(struct io_kiocb *req, long res)
1941{
1942 __io_cqring_fill_event(req, res, 0);
1943}
1944
c7dae4ba
JA
1945static inline void io_req_complete_post(struct io_kiocb *req, long res,
1946 unsigned int cflags)
2b188cc1 1947{
78e19bbe 1948 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1949 unsigned long flags;
1950
1951 spin_lock_irqsave(&ctx->completion_lock, flags);
bcda7baa 1952 __io_cqring_fill_event(req, res, cflags);
2b188cc1 1953 io_commit_cqring(ctx);
c7dae4ba
JA
1954 /*
1955 * If we're the last reference to this request, add to our locked
1956 * free_list cache.
1957 */
1958 if (refcount_dec_and_test(&req->refs)) {
1959 struct io_comp_state *cs = &ctx->submit_state.comp;
1960
1961 io_dismantle_req(req);
1962 io_put_task(req->task, 1);
1963 list_add(&req->compl.list, &cs->locked_free_list);
1964 cs->locked_free_nr++;
1965 } else
1966 req = NULL;
2b188cc1
JA
1967 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1968
8c838788 1969 io_cqring_ev_posted(ctx);
c7dae4ba
JA
1970 if (req) {
1971 io_queue_next(req);
1972 percpu_ref_put(&ctx->refs);
229a7b63 1973 }
229a7b63
JA
1974}
1975
a38d68db 1976static void io_req_complete_state(struct io_kiocb *req, long res,
889fca73 1977 unsigned int cflags)
229a7b63 1978{
a38d68db
PB
1979 io_clean_op(req);
1980 req->result = res;
1981 req->compl.cflags = cflags;
e342c807 1982 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
1983}
1984
889fca73
PB
1985static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1986 long res, unsigned cflags)
bcda7baa 1987{
889fca73
PB
1988 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1989 io_req_complete_state(req, res, cflags);
a38d68db 1990 else
c7dae4ba 1991 io_req_complete_post(req, res, cflags);
bcda7baa
JA
1992}
1993
a38d68db 1994static inline void io_req_complete(struct io_kiocb *req, long res)
0ddf92e8 1995{
889fca73 1996 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
1997}
1998
c7dae4ba 1999static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
0ddf92e8 2000{
c7dae4ba
JA
2001 struct io_submit_state *state = &ctx->submit_state;
2002 struct io_comp_state *cs = &state->comp;
e5d1bc0a 2003 struct io_kiocb *req = NULL;
0ddf92e8 2004
c7dae4ba
JA
2005 /*
2006 * If we have more than a batch's worth of requests in our IRQ side
2007 * locked cache, grab the lock and move them over to our submission
2008 * side cache.
2009 */
2010 if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
2011 spin_lock_irq(&ctx->completion_lock);
2012 list_splice_init(&cs->locked_free_list, &cs->free_list);
2013 cs->locked_free_nr = 0;
2014 spin_unlock_irq(&ctx->completion_lock);
2015 }
0ddf92e8 2016
c7dae4ba
JA
2017 while (!list_empty(&cs->free_list)) {
2018 req = list_first_entry(&cs->free_list, struct io_kiocb,
1b4c351f
JA
2019 compl.list);
2020 list_del(&req->compl.list);
e5d1bc0a
PB
2021 state->reqs[state->free_reqs++] = req;
2022 if (state->free_reqs == ARRAY_SIZE(state->reqs))
2023 break;
1b4c351f
JA
2024 }
2025
e5d1bc0a 2026 return req != NULL;
0ddf92e8
JA
2027}
2028
e5d1bc0a 2029static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2b188cc1 2030{
e5d1bc0a
PB
2031 struct io_submit_state *state = &ctx->submit_state;
2032
2033 BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
2034
f6b6c7d6 2035 if (!state->free_reqs) {
291b2821 2036 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2579f913
JA
2037 int ret;
2038
c7dae4ba 2039 if (io_flush_cached_reqs(ctx))
e5d1bc0a
PB
2040 goto got_req;
2041
bf019da7
PB
2042 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
2043 state->reqs);
fd6fab2c
JA
2044
2045 /*
2046 * Bulk alloc is all-or-nothing. If we fail to get a batch,
2047 * retry single alloc to be on the safe side.
2048 */
2049 if (unlikely(ret <= 0)) {
2050 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2051 if (!state->reqs[0])
3893f39f 2052 return NULL;
fd6fab2c
JA
2053 ret = 1;
2054 }
291b2821 2055 state->free_reqs = ret;
2b188cc1 2056 }
e5d1bc0a 2057got_req:
291b2821
PB
2058 state->free_reqs--;
2059 return state->reqs[state->free_reqs];
2b188cc1
JA
2060}
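
The bulk-allocation comment above describes a common pattern: try to fill a whole batch, and fall back to a single allocation if that fails. A userspace sketch under those assumptions follows, with malloc() standing in for kmem_cache_alloc_bulk() and an arbitrary batch size.

#include <stdlib.h>
#include <stdio.h>

#define BATCH 8

/* illustrative helper: all-or-nothing batch allocation, rolled back on the
 * first failure so the caller can retry with a single object */
static size_t alloc_batch(void **out, size_t want, size_t objsize)
{
	for (size_t i = 0; i < want; i++) {
		out[i] = malloc(objsize);
		if (!out[i]) {
			while (i--)
				free(out[i]);
			return 0;
		}
	}
	return want;
}

int main(void)
{
	void *reqs[BATCH];
	size_t got = alloc_batch(reqs, BATCH, 256);

	if (!got) {
		/* bulk attempt failed: retry a single allocation */
		reqs[0] = malloc(256);
		got = reqs[0] ? 1 : 0;
	}
	printf("allocated %zu request(s)\n", got);
	for (size_t i = 0; i < got; i++)
		free(reqs[i]);
	return 0;
}
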
2061
8da11c19
PB
2062static inline void io_put_file(struct io_kiocb *req, struct file *file,
2063 bool fixed)
2064{
36f72fe2 2065 if (!fixed)
8da11c19
PB
2066 fput(file);
2067}
2068
4edf20f9 2069static void io_dismantle_req(struct io_kiocb *req)
2b188cc1 2070{
3ca405eb 2071 io_clean_op(req);
929a3af9 2072
e8c2bc1f
JA
2073 if (req->async_data)
2074 kfree(req->async_data);
8da11c19
PB
2075 if (req->file)
2076 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
269bbe5f
BM
2077 if (req->fixed_rsrc_refs)
2078 percpu_ref_put(req->fixed_rsrc_refs);
4edf20f9 2079 io_req_clean_work(req);
e65ef56d
JA
2080}
2081
7c660731
PB
2082static inline void io_put_task(struct task_struct *task, int nr)
2083{
2084 struct io_uring_task *tctx = task->io_uring;
2085
2086 percpu_counter_sub(&tctx->inflight, nr);
2087 if (unlikely(atomic_read(&tctx->in_idle)))
2088 wake_up(&tctx->wait);
2089 put_task_struct_many(task, nr);
2090}
2091
216578e5 2092static void __io_free_req(struct io_kiocb *req)
c6ca97b3 2093{
51a4cc11 2094 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 2095
216578e5 2096 io_dismantle_req(req);
7c660731 2097 io_put_task(req->task, 1);
c6ca97b3 2098
3893f39f 2099 kmem_cache_free(req_cachep, req);
ecfc5177 2100 percpu_ref_put(&ctx->refs);
e65ef56d
JA
2101}
2102
f2f87370
PB
2103static inline void io_remove_next_linked(struct io_kiocb *req)
2104{
2105 struct io_kiocb *nxt = req->link;
2106
2107 req->link = nxt->link;
2108 nxt->link = NULL;
2109}
2110
c9abd7ad 2111static void io_kill_linked_timeout(struct io_kiocb *req)
2665abfd 2112{
a197f664 2113 struct io_ring_ctx *ctx = req->ctx;
7c86ffee 2114 struct io_kiocb *link;
c9abd7ad
PB
2115 bool cancelled = false;
2116 unsigned long flags;
7c86ffee 2117
c9abd7ad 2118 spin_lock_irqsave(&ctx->completion_lock, flags);
f2f87370
PB
2119 link = req->link;
2120
900fad45
PB
2121 /*
2122 * Can happen if a linked timeout fired and the link chain had been like
2123 * req -> link t-out -> link t-out [-> ...]
2124 */
c9abd7ad
PB
2125 if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
2126 struct io_timeout_data *io = link->async_data;
2127 int ret;
7c86ffee 2128
f2f87370 2129 io_remove_next_linked(req);
90cd7e42 2130 link->timeout.head = NULL;
c9abd7ad
PB
2131 ret = hrtimer_try_to_cancel(&io->timer);
2132 if (ret != -1) {
2133 io_cqring_fill_event(link, -ECANCELED);
2134 io_commit_cqring(ctx);
2135 cancelled = true;
2136 }
2137 }
7c86ffee 2138 req->flags &= ~REQ_F_LINK_TIMEOUT;
216578e5 2139 spin_unlock_irqrestore(&ctx->completion_lock, flags);
ab0b6451 2140
c9abd7ad 2141 if (cancelled) {
7c86ffee 2142 io_cqring_ev_posted(ctx);
c9abd7ad
PB
2143 io_put_req(link);
2144 }
7c86ffee
PB
2145}
2146
9e645e11 2147
d148ca4b 2148static void io_fail_links(struct io_kiocb *req)
9e645e11 2149{
f2f87370 2150 struct io_kiocb *link, *nxt;
2665abfd 2151 struct io_ring_ctx *ctx = req->ctx;
d148ca4b 2152 unsigned long flags;
9e645e11 2153
d148ca4b 2154 spin_lock_irqsave(&ctx->completion_lock, flags);
f2f87370
PB
2155 link = req->link;
2156 req->link = NULL;
9e645e11 2157
f2f87370
PB
2158 while (link) {
2159 nxt = link->link;
2160 link->link = NULL;
2665abfd 2161
f2f87370 2162 trace_io_uring_fail_link(req, link);
7c86ffee 2163 io_cqring_fill_event(link, -ECANCELED);
216578e5
PB
2164
2165 /*
2166 * It's ok to free under spinlock as they're not linked anymore,
2167 * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
2168 * work.fs->lock.
2169 */
2170 if (link->flags & REQ_F_WORK_INITIALIZED)
2171 io_put_req_deferred(link, 2);
2172 else
2173 io_double_put_req(link);
f2f87370 2174 link = nxt;
9e645e11 2175 }
2665abfd 2176 io_commit_cqring(ctx);
216578e5 2177 spin_unlock_irqrestore(&ctx->completion_lock, flags);
9e645e11 2178
2665abfd 2179 io_cqring_ev_posted(ctx);
9e645e11
JA
2180}
2181
3fa5e0f3 2182static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
c69f8dbe 2183{
7c86ffee
PB
2184 if (req->flags & REQ_F_LINK_TIMEOUT)
2185 io_kill_linked_timeout(req);
944e58bf 2186
9e645e11
JA
2187 /*
2188 * If LINK is set, we have dependent requests in this chain. If we
2189 * didn't fail this request, queue the first one up, moving any other
2190 * dependencies to the next request. In case of failure, fail the rest
2191 * of the chain.
2192 */
f2f87370
PB
2193 if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
2194 struct io_kiocb *nxt = req->link;
2195
2196 req->link = NULL;
2197 return nxt;
2198 }
9b5f7bd9
PB
2199 io_fail_links(req);
2200 return NULL;
4d7dd462 2201}
9e645e11 2202
f2f87370 2203static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
3fa5e0f3 2204{
cdbff982 2205 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
3fa5e0f3
PB
2206 return NULL;
2207 return __io_req_find_next(req);
2208}
2209
7cbf1722 2210static bool __tctx_task_work(struct io_uring_task *tctx)
c2c4c83c 2211{
65453d1e 2212 struct io_ring_ctx *ctx = NULL;
7cbf1722
JA
2213 struct io_wq_work_list list;
2214 struct io_wq_work_node *node;
c2c4c83c 2215
7cbf1722
JA
2216 if (wq_list_empty(&tctx->task_list))
2217 return false;
6200b0ae 2218
0b81e80c 2219 spin_lock_irq(&tctx->task_lock);
7cbf1722
JA
2220 list = tctx->task_list;
2221 INIT_WQ_LIST(&tctx->task_list);
0b81e80c 2222 spin_unlock_irq(&tctx->task_lock);
c2c4c83c 2223
7cbf1722
JA
2224 node = list.first;
2225 while (node) {
2226 struct io_wq_work_node *next = node->next;
65453d1e 2227 struct io_ring_ctx *this_ctx;
7cbf1722 2228 struct io_kiocb *req;
0ba9c9ed 2229
7cbf1722 2230 req = container_of(node, struct io_kiocb, io_task_work.node);
65453d1e 2231 this_ctx = req->ctx;
7cbf1722
JA
2232 req->task_work.func(&req->task_work);
2233 node = next;
65453d1e
JA
2234
2235 if (!ctx) {
2236 ctx = this_ctx;
2237 } else if (ctx != this_ctx) {
2238 mutex_lock(&ctx->uring_lock);
2239 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
2240 mutex_unlock(&ctx->uring_lock);
2241 ctx = this_ctx;
2242 }
2243 }
2244
2245 if (ctx && ctx->submit_state.comp.nr) {
2246 mutex_lock(&ctx->uring_lock);
2247 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
2248 mutex_unlock(&ctx->uring_lock);
7cbf1722
JA
2249 }
2250
2251 return list.first != NULL;
c2c4c83c
JA
2252}
2253
7cbf1722 2254static void tctx_task_work(struct callback_head *cb)
c40f6379 2255{
7cbf1722 2256 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
c40f6379 2257
7cbf1722
JA
2258 while (__tctx_task_work(tctx))
2259 cond_resched();
2260
2261 clear_bit(0, &tctx->task_state);
2262}
2263
2264static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
2265 enum task_work_notify_mode notify)
2266{
2267 struct io_uring_task *tctx = tsk->io_uring;
2268 struct io_wq_work_node *node, *prev;
0b81e80c 2269 unsigned long flags;
7cbf1722
JA
2270 int ret;
2271
2272 WARN_ON_ONCE(!tctx);
2273
0b81e80c 2274 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722 2275 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
0b81e80c 2276 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2277
2278 /* task_work already pending, we're done */
2279 if (test_bit(0, &tctx->task_state) ||
2280 test_and_set_bit(0, &tctx->task_state))
2281 return 0;
2282
2283 if (!task_work_add(tsk, &tctx->task_work, notify))
2284 return 0;
2285
2286 /*
2287 * Slow path - we failed; find and delete the work. If the work is not
2288 * in the list, it got run and we're fine.
2289 */
2290 ret = 0;
0b81e80c 2291 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722
JA
2292 wq_list_for_each(node, prev, &tctx->task_list) {
2293 if (&req->io_task_work.node == node) {
2294 wq_list_del(&tctx->task_list, node, prev);
2295 ret = 1;
2296 break;
2297 }
2298 }
0b81e80c 2299 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2300 clear_bit(0, &tctx->task_state);
2301 return ret;
2302}
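
The test_bit()/test_and_set_bit() dance on tctx->task_state above keeps at most one notification outstanding while work is queued. Here is a single-threaded C11 sketch of that coalescing idea; the work list itself is elided and the function names are made up.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag notification_pending = ATOMIC_FLAG_INIT;

static void queue_work(int id)
{
	/* enqueueing onto the work list would happen here */
	if (!atomic_flag_test_and_set(&notification_pending))
		printf("work %d: sent one notification\n", id);
	else
		printf("work %d: coalesced, notification already pending\n", id);
}

static void run_work(void)
{
	/* drain the queue, then allow the next notification to be sent */
	atomic_flag_clear(&notification_pending);
}

int main(void)
{
	queue_work(1);
	queue_work(2);	/* piggybacks on the first notification */
	run_work();
	queue_work(3);	/* queue went idle, so a fresh notification is needed */
	return 0;
}
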
2303
355fb9e2 2304static int io_req_task_work_add(struct io_kiocb *req)
c2c4c83c
JA
2305{
2306 struct task_struct *tsk = req->task;
2307 struct io_ring_ctx *ctx = req->ctx;
91989c70
JA
2308 enum task_work_notify_mode notify;
2309 int ret;
c2c4c83c 2310
6200b0ae
JA
2311 if (tsk->flags & PF_EXITING)
2312 return -ESRCH;
2313
c2c4c83c 2314 /*
0ba9c9ed
JA
2315 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2316 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2317 * processing task_work. There's no reliable way to tell if TWA_RESUME
2318 * will do the job.
c2c4c83c 2319 */
91989c70 2320 notify = TWA_NONE;
355fb9e2 2321 if (!(ctx->flags & IORING_SETUP_SQPOLL))
c2c4c83c
JA
2322 notify = TWA_SIGNAL;
2323
7cbf1722 2324 ret = io_task_work_add(tsk, req, notify);
c2c4c83c
JA
2325 if (!ret)
2326 wake_up_process(tsk);
0ba9c9ed 2327
c2c4c83c
JA
2328 return ret;
2329}
2330
eab30c4d 2331static void io_req_task_work_add_fallback(struct io_kiocb *req,
7cbf1722 2332 task_work_func_t cb)
eab30c4d 2333{
7c25c0d1
JA
2334 struct io_ring_ctx *ctx = req->ctx;
2335 struct callback_head *head;
eab30c4d
PB
2336
2337 init_task_work(&req->task_work, cb);
7c25c0d1
JA
2338 do {
2339 head = READ_ONCE(ctx->exit_task_work);
2340 req->task_work.next = head;
2341 } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
eab30c4d
PB
2342}
2343
c40f6379
JA
2344static void __io_req_task_cancel(struct io_kiocb *req, int error)
2345{
2346 struct io_ring_ctx *ctx = req->ctx;
2347
2348 spin_lock_irq(&ctx->completion_lock);
2349 io_cqring_fill_event(req, error);
2350 io_commit_cqring(ctx);
2351 spin_unlock_irq(&ctx->completion_lock);
2352
2353 io_cqring_ev_posted(ctx);
2354 req_set_fail_links(req);
2355 io_double_put_req(req);
2356}
2357
2358static void io_req_task_cancel(struct callback_head *cb)
2359{
2360 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
87ceb6a6 2361 struct io_ring_ctx *ctx = req->ctx;
c40f6379 2362
792bb6eb 2363 mutex_lock(&ctx->uring_lock);
a3df7698 2364 __io_req_task_cancel(req, req->result);
792bb6eb 2365 mutex_unlock(&ctx->uring_lock);
87ceb6a6 2366 percpu_ref_put(&ctx->refs);
c40f6379
JA
2367}
2368
2369static void __io_req_task_submit(struct io_kiocb *req)
2370{
2371 struct io_ring_ctx *ctx = req->ctx;
2372
04fc6c80 2373 /* ctx stays valid until unlock, even if we drop all our ctx->refs */
81b6d05c 2374 mutex_lock(&ctx->uring_lock);
dc0eced5
PB
2375 if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
2376 !io_sq_thread_acquire_mm_files(ctx, req))
c5eef2b9 2377 __io_queue_sqe(req);
81b6d05c 2378 else
c40f6379 2379 __io_req_task_cancel(req, -EFAULT);
81b6d05c 2380 mutex_unlock(&ctx->uring_lock);
aec18a57
PB
2381
2382 if (ctx->flags & IORING_SETUP_SQPOLL)
2383 io_sq_thread_drop_mm_files();
c40f6379
JA
2384}
2385
2386static void io_req_task_submit(struct callback_head *cb)
2387{
2388 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2389
2390 __io_req_task_submit(req);
2391}
2392
2393static void io_req_task_queue(struct io_kiocb *req)
2394{
c40f6379
JA
2395 int ret;
2396
7cbf1722 2397 req->task_work.func = io_req_task_submit;
355fb9e2 2398 ret = io_req_task_work_add(req);
c40f6379 2399 if (unlikely(ret)) {
a3df7698 2400 req->result = -ECANCELED;
04fc6c80 2401 percpu_ref_get(&req->ctx->refs);
eab30c4d 2402 io_req_task_work_add_fallback(req, io_req_task_cancel);
c40f6379 2403 }
c40f6379
JA
2404}
2405
a3df7698
PB
2406static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2407{
2408 percpu_ref_get(&req->ctx->refs);
2409 req->result = ret;
2410 req->task_work.func = io_req_task_cancel;
2411
2412 if (unlikely(io_req_task_work_add(req)))
2413 io_req_task_work_add_fallback(req, io_req_task_cancel);
2414}
2415
f2f87370 2416static inline void io_queue_next(struct io_kiocb *req)
c69f8dbe 2417{
9b5f7bd9 2418 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
2419
2420 if (nxt)
906a8c3f 2421 io_req_task_queue(nxt);
c69f8dbe
JL
2422}
2423
c3524383 2424static void io_free_req(struct io_kiocb *req)
7a743e22 2425{
c3524383
PB
2426 io_queue_next(req);
2427 __io_free_req(req);
2428}
8766dd51 2429
2d6500d4 2430struct req_batch {
5af1d13e
PB
2431 struct task_struct *task;
2432 int task_refs;
1b4c351f 2433 int ctx_refs;
2d6500d4
PB
2434};
2435
5af1d13e
PB
2436static inline void io_init_req_batch(struct req_batch *rb)
2437{
5af1d13e 2438 rb->task_refs = 0;
9ae72463 2439 rb->ctx_refs = 0;
5af1d13e
PB
2440 rb->task = NULL;
2441}
2442
2d6500d4
PB
2443static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2444 struct req_batch *rb)
2445{
6e833d53 2446 if (rb->task)
7c660731 2447 io_put_task(rb->task, rb->task_refs);
9ae72463
PB
2448 if (rb->ctx_refs)
2449 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2d6500d4
PB
2450}
2451
6ff119a6
PB
2452static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2453 struct io_submit_state *state)
2d6500d4 2454{
f2f87370 2455 io_queue_next(req);
2d6500d4 2456
e3bc8e9d 2457 if (req->task != rb->task) {
7c660731
PB
2458 if (rb->task)
2459 io_put_task(rb->task, rb->task_refs);
e3bc8e9d
JA
2460 rb->task = req->task;
2461 rb->task_refs = 0;
5af1d13e 2462 }
e3bc8e9d 2463 rb->task_refs++;
9ae72463 2464 rb->ctx_refs++;
5af1d13e 2465
4edf20f9 2466 io_dismantle_req(req);
bd759045 2467 if (state->free_reqs != ARRAY_SIZE(state->reqs))
6ff119a6 2468 state->reqs[state->free_reqs++] = req;
bd759045
PB
2469 else
2470 list_add(&req->compl.list, &state->comp.free_list);
7a743e22
PB
2471}
2472
905c172f
PB
2473static void io_submit_flush_completions(struct io_comp_state *cs,
2474 struct io_ring_ctx *ctx)
2475{
2476 int i, nr = cs->nr;
2477 struct io_kiocb *req;
2478 struct req_batch rb;
2479
2480 io_init_req_batch(&rb);
2481 spin_lock_irq(&ctx->completion_lock);
2482 for (i = 0; i < nr; i++) {
2483 req = cs->reqs[i];
2484 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2485 }
2486 io_commit_cqring(ctx);
2487 spin_unlock_irq(&ctx->completion_lock);
2488
2489 io_cqring_ev_posted(ctx);
2490 for (i = 0; i < nr; i++) {
2491 req = cs->reqs[i];
2492
2493 /* submission and completion refs */
2494 if (refcount_sub_and_test(2, &req->refs))
6ff119a6 2495 io_req_free_batch(&rb, req, &ctx->submit_state);
905c172f
PB
2496 }
2497
2498 io_req_free_batch_finish(ctx, &rb);
2499 cs->nr = 0;
7a743e22
PB
2500}
2501
ba816ad6
JA
2502/*
2503 * Drop reference to request, return next in chain (if there is one) if this
2504 * was the last reference to this request.
2505 */
9b5f7bd9 2506static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2507{
9b5f7bd9
PB
2508 struct io_kiocb *nxt = NULL;
2509
2a44f467 2510 if (refcount_dec_and_test(&req->refs)) {
9b5f7bd9 2511 nxt = io_req_find_next(req);
4d7dd462 2512 __io_free_req(req);
2a44f467 2513 }
9b5f7bd9 2514 return nxt;
2b188cc1
JA
2515}
2516
e65ef56d
JA
2517static void io_put_req(struct io_kiocb *req)
2518{
2519 if (refcount_dec_and_test(&req->refs))
2520 io_free_req(req);
2b188cc1
JA
2521}
2522
216578e5
PB
2523static void io_put_req_deferred_cb(struct callback_head *cb)
2524{
2525 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2526
2527 io_free_req(req);
2528}
2529
2530static void io_free_req_deferred(struct io_kiocb *req)
2531{
2532 int ret;
2533
7cbf1722 2534 req->task_work.func = io_put_req_deferred_cb;
355fb9e2 2535 ret = io_req_task_work_add(req);
eab30c4d
PB
2536 if (unlikely(ret))
2537 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
216578e5
PB
2538}
2539
2540static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2541{
2542 if (refcount_sub_and_test(refs, &req->refs))
2543 io_free_req_deferred(req);
2544}
2545
978db57e
JA
2546static void io_double_put_req(struct io_kiocb *req)
2547{
2548 /* drop both submit and complete references */
2549 if (refcount_sub_and_test(2, &req->refs))
2550 io_free_req(req);
2551}
2552
6c503150 2553static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2554{
2555 /* See comment at the top of this file */
2556 smp_rmb();
e23de15f 2557 return __io_cqring_events(ctx);
a3a0e43f
JA
2558}
2559
fb5ccc98
PB
2560static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2561{
2562 struct io_rings *rings = ctx->rings;
2563
2564 /* make sure SQ entry isn't read before tail */
2565 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2566}
2567
8ff069bf 2568static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
e94f141b 2569{
8ff069bf 2570 unsigned int cflags;
e94f141b 2571
bcda7baa
JA
2572 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2573 cflags |= IORING_CQE_F_BUFFER;
0e1b6fe3 2574 req->flags &= ~REQ_F_BUFFER_SELECTED;
bcda7baa
JA
2575 kfree(kbuf);
2576 return cflags;
e94f141b
JA
2577}
2578
8ff069bf 2579static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
bcda7baa 2580{
4d954c25 2581 struct io_buffer *kbuf;
bcda7baa 2582
4d954c25 2583 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
8ff069bf
PB
2584 return io_put_kbuf(req, kbuf);
2585}
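
On the userspace side, the cflags packed by io_put_kbuf() come back in cqe->flags: IORING_CQE_F_BUFFER marks that a provided buffer was consumed, and the buffer ID sits above IORING_CQE_BUFFER_SHIFT. A minimal decode follows, using a made-up flags value in place of a real CQE and assuming the uapi header is available.

#include <linux/io_uring.h>
#include <stdio.h>

int main(void)
{
	/* example value: buffer id 7 with the buffer-consumed flag set */
	unsigned int flags = (7u << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER;

	if (flags & IORING_CQE_F_BUFFER)
		printf("completion used buffer id %u\n",
		       flags >> IORING_CQE_BUFFER_SHIFT);
	return 0;
}
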
2586
4c6e277c
JA
2587static inline bool io_run_task_work(void)
2588{
6200b0ae
JA
2589 /*
2590 * Not safe to run on exiting task, and the task_work handling will
2591 * not add work to such a task.
2592 */
2593 if (unlikely(current->flags & PF_EXITING))
2594 return false;
4c6e277c
JA
2595 if (current->task_works) {
2596 __set_current_state(TASK_RUNNING);
2597 task_work_run();
2598 return true;
2599 }
2600
2601 return false;
bcda7baa
JA
2602}
2603
def596e9
JA
2604/*
2605 * Find and free completed poll iocbs
2606 */
2607static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2608 struct list_head *done)
2609{
8237e045 2610 struct req_batch rb;
def596e9 2611 struct io_kiocb *req;
bbde017a
XW
2612
2613 /* order with ->result store in io_complete_rw_iopoll() */
2614 smp_rmb();
def596e9 2615
5af1d13e 2616 io_init_req_batch(&rb);
def596e9 2617 while (!list_empty(done)) {
bcda7baa
JA
2618 int cflags = 0;
2619
d21ffe7e 2620 req = list_first_entry(done, struct io_kiocb, inflight_entry);
f161340d
PB
2621 list_del(&req->inflight_entry);
2622
bbde017a
XW
2623 if (READ_ONCE(req->result) == -EAGAIN) {
2624 req->iopoll_completed = 0;
23faba36 2625 if (io_rw_reissue(req))
f161340d 2626 continue;
bbde017a 2627 }
def596e9 2628
bcda7baa 2629 if (req->flags & REQ_F_BUFFER_SELECTED)
8ff069bf 2630 cflags = io_put_rw_kbuf(req);
bcda7baa
JA
2631
2632 __io_cqring_fill_event(req, req->result, cflags);
def596e9
JA
2633 (*nr_events)++;
2634
c3524383 2635 if (refcount_dec_and_test(&req->refs))
6ff119a6 2636 io_req_free_batch(&rb, req, &ctx->submit_state);
def596e9 2637 }
def596e9 2638
09bb8394 2639 io_commit_cqring(ctx);
80c18e4a 2640 io_cqring_ev_posted_iopoll(ctx);
2d6500d4 2641 io_req_free_batch_finish(ctx, &rb);
581f9810
BM
2642}
2643
def596e9
JA
2644static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2645 long min)
2646{
2647 struct io_kiocb *req, *tmp;
2648 LIST_HEAD(done);
2649 bool spin;
2650 int ret;
2651
2652 /*
2653 * Only spin for completions if we don't have multiple devices hanging
2654 * off our complete list, and we're under the requested amount.
2655 */
2656 spin = !ctx->poll_multi_file && *nr_events < min;
2657
2658 ret = 0;
d21ffe7e 2659 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
9adbd45d 2660 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
2661
2662 /*
581f9810
BM
2663 * Move completed and retryable entries to our local lists.
2664 * If we find a request that requires polling, break out
2665 * and complete those lists first, if we have entries there.
def596e9 2666 */
65a6543d 2667 if (READ_ONCE(req->iopoll_completed)) {
d21ffe7e 2668 list_move_tail(&req->inflight_entry, &done);
def596e9
JA
2669 continue;
2670 }
2671 if (!list_empty(&done))
2672 break;
2673
2674 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2675 if (ret < 0)
2676 break;
2677
3aadc23e
PB
2678 /* iopoll may have completed current req */
2679 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2680 list_move_tail(&req->inflight_entry, &done);
3aadc23e 2681
def596e9
JA
2682 if (ret && spin)
2683 spin = false;
2684 ret = 0;
2685 }
2686
2687 if (!list_empty(&done))
2688 io_iopoll_complete(ctx, nr_events, &done);
2689
2690 return ret;
2691}
2692
2693/*
d195a66e 2694 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
2695 * non-spinning poll check - we'll still enter the driver poll loop, but only
2696 * as a non-spinning completion check.
2697 */
2698static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2699 long min)
2700{
540e32a0 2701 while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
def596e9
JA
2702 int ret;
2703
2704 ret = io_do_iopoll(ctx, nr_events, min);
2705 if (ret < 0)
2706 return ret;
eba0a4dd 2707 if (*nr_events >= min)
def596e9
JA
2708 return 0;
2709 }
2710
2711 return 1;
2712}
2713
2714/*
2715 * We can't just wait for polled events to come to us, we have to actively
2716 * find and complete them.
2717 */
b2edc0a7 2718static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2719{
2720 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2721 return;
2722
2723 mutex_lock(&ctx->uring_lock);
540e32a0 2724 while (!list_empty(&ctx->iopoll_list)) {
def596e9
JA
2725 unsigned int nr_events = 0;
2726
b2edc0a7 2727 io_do_iopoll(ctx, &nr_events, 0);
08f5439f 2728
b2edc0a7
PB
2729 /* let it sleep and repeat later if can't complete a request */
2730 if (nr_events == 0)
2731 break;
08f5439f
JA
2732 /*
2733 * Ensure we allow local-to-the-cpu processing to take place;
2734 * in this case we need to ensure that we reap all events.
3fcee5a6 2735 * Also let task_work, etc. progress by releasing the mutex
08f5439f 2736 */
3fcee5a6
PB
2737 if (need_resched()) {
2738 mutex_unlock(&ctx->uring_lock);
2739 cond_resched();
2740 mutex_lock(&ctx->uring_lock);
2741 }
def596e9
JA
2742 }
2743 mutex_unlock(&ctx->uring_lock);
2744}
2745
7668b92a 2746static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2747{
7668b92a 2748 unsigned int nr_events = 0;
2b2ed975 2749 int iters = 0, ret = 0;
500f9fba 2750
c7849be9
XW
2751 /*
2752 * We disallow the app entering submit/complete with polling, but we
2753 * still need to lock the ring to prevent racing with polled issue
2754 * that got punted to a workqueue.
2755 */
2756 mutex_lock(&ctx->uring_lock);
def596e9 2757 do {
a3a0e43f
JA
2758 /*
2759 * Don't enter poll loop if we already have events pending.
2760 * If we do, we can potentially be spinning for commands that
2761 * already triggered a CQE (eg in error).
2762 */
6c503150
PB
2763 if (test_bit(0, &ctx->cq_check_overflow))
2764 __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2765 if (io_cqring_events(ctx))
a3a0e43f
JA
2766 break;
2767
500f9fba
JA
2768 /*
2769 * If a submit got punted to a workqueue, we can have the
2770 * application entering polling for a command before it gets
2771 * issued. That app will hold the uring_lock for the duration
2772 * of the poll right here, so we need to take a breather every
2773 * now and then to ensure that the issue has a chance to add
2774 * the poll to the issued list. Otherwise we can spin here
2775 * forever, while the workqueue is stuck trying to acquire the
2776 * very same mutex.
2777 */
2778 if (!(++iters & 7)) {
2779 mutex_unlock(&ctx->uring_lock);
4c6e277c 2780 io_run_task_work();
500f9fba
JA
2781 mutex_lock(&ctx->uring_lock);
2782 }
2783
7668b92a 2784 ret = io_iopoll_getevents(ctx, &nr_events, min);
def596e9
JA
2785 if (ret <= 0)
2786 break;
2787 ret = 0;
7668b92a 2788 } while (min && !nr_events && !need_resched());
def596e9 2789
500f9fba 2790 mutex_unlock(&ctx->uring_lock);
def596e9
JA
2791 return ret;
2792}
2793
491381ce 2794static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2795{
491381ce
JA
2796 /*
2797 * Tell lockdep we inherited freeze protection from submission
2798 * thread.
2799 */
2800 if (req->flags & REQ_F_ISREG) {
2801 struct inode *inode = file_inode(req->file);
2b188cc1 2802
491381ce 2803 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 2804 }
491381ce 2805 file_end_write(req->file);
2b188cc1
JA
2806}
2807
b63534c4 2808#ifdef CONFIG_BLOCK
dc2a6e9a 2809static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4
JA
2810{
2811 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
4a245479 2812 int rw, ret;
b63534c4 2813 struct iov_iter iter;
b63534c4 2814
dc2a6e9a
PB
2815 /* already prepared */
2816 if (req->async_data)
2817 return true;
b63534c4
JA
2818
2819 switch (req->opcode) {
2820 case IORING_OP_READV:
2821 case IORING_OP_READ_FIXED:
2822 case IORING_OP_READ:
2823 rw = READ;
2824 break;
2825 case IORING_OP_WRITEV:
2826 case IORING_OP_WRITE_FIXED:
2827 case IORING_OP_WRITE:
2828 rw = WRITE;
2829 break;
2830 default:
2831 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2832 req->opcode);
dc2a6e9a 2833 return false;
b63534c4
JA
2834 }
2835
dc2a6e9a
PB
2836 ret = io_import_iovec(rw, req, &iovec, &iter, false);
2837 if (ret < 0)
2838 return false;
6bf985dc 2839 return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
b63534c4 2840}
b63534c4
JA
2841#endif
2842
23faba36 2843static bool io_rw_reissue(struct io_kiocb *req)
b63534c4
JA
2844{
2845#ifdef CONFIG_BLOCK
355afaeb 2846 umode_t mode = file_inode(req->file)->i_mode;
b63534c4
JA
2847 int ret;
2848
355afaeb
JA
2849 if (!S_ISBLK(mode) && !S_ISREG(mode))
2850 return false;
75c668cd 2851 if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
b63534c4
JA
2852 return false;
2853
55e6ac1e
PB
2854 lockdep_assert_held(&req->ctx->uring_lock);
2855
28cea78a 2856 ret = io_sq_thread_acquire_mm_files(req->ctx, req);
6d816e08 2857
dc2a6e9a 2858 if (!ret && io_resubmit_prep(req)) {
fdee946d
JA
2859 refcount_inc(&req->refs);
2860 io_queue_async_work(req);
b63534c4 2861 return true;
fdee946d 2862 }
dc2a6e9a 2863 req_set_fail_links(req);
b63534c4
JA
2864#endif
2865 return false;
2866}
2867
a1d7c393 2868static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
889fca73 2869 unsigned int issue_flags)
a1d7c393 2870{
2f8e45f1
PB
2871 int cflags = 0;
2872
23faba36
PB
2873 if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2874 return;
2f8e45f1
PB
2875 if (res != req->result)
2876 req_set_fail_links(req);
23faba36 2877
2f8e45f1
PB
2878 if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2879 kiocb_end_write(req);
2880 if (req->flags & REQ_F_BUFFER_SELECTED)
2881 cflags = io_put_rw_kbuf(req);
2882 __io_req_complete(req, issue_flags, res, cflags);
ba816ad6
JA
2883}
2884
2885static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2886{
9adbd45d 2887 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 2888
889fca73 2889 __io_complete_rw(req, res, res2, 0);
2b188cc1
JA
2890}
2891
def596e9
JA
2892static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2893{
9adbd45d 2894 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 2895
491381ce
JA
2896 if (kiocb->ki_flags & IOCB_WRITE)
2897 kiocb_end_write(req);
def596e9 2898
2d7d6792 2899 if (res != -EAGAIN && res != req->result)
4e88d6e7 2900 req_set_fail_links(req);
bbde017a
XW
2901
2902 WRITE_ONCE(req->result, res);
2903 /* order with io_poll_complete() checking ->result */
cd664b0e
PB
2904 smp_wmb();
2905 WRITE_ONCE(req->iopoll_completed, 1);
def596e9
JA
2906}
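
The smp_wmb()/smp_rmb() pairing around ->result and ->iopoll_completed is the classic publish pattern: store the payload, then set the flag with release semantics, and load the flag with acquire before touching the payload. Below is a C11 sketch of the same idea (single-threaded here, just to show which ordering calls pair up); the names publish()/consume() are illustrative.

#include <stdatomic.h>
#include <stdio.h>

static int result;
static atomic_int completed;

static void publish(int res)
{
	result = res;				/* plain store of the payload */
	atomic_store_explicit(&completed, 1, memory_order_release);
}

static void consume(void)
{
	if (atomic_load_explicit(&completed, memory_order_acquire))
		printf("result = %d\n", result);	/* ordered after the flag load */
}

int main(void)
{
	publish(42);
	consume();
	return 0;
}
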
2907
2908/*
2909 * After the iocb has been issued, it's safe to be found on the poll list.
2910 * Adding the kiocb to the list AFTER submission ensures that we don't
2911 * find it from an io_iopoll_getevents() thread before the issuer is done
2912 * accessing the kiocb cookie.
2913 */
2e9dbe90 2914static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
def596e9
JA
2915{
2916 struct io_ring_ctx *ctx = req->ctx;
2917
2918 /*
2919 * Track whether we have multiple files in our lists. This will impact
2920 * how we do polling eventually, not spinning if we're on potentially
2921 * different devices.
2922 */
540e32a0 2923 if (list_empty(&ctx->iopoll_list)) {
def596e9
JA
2924 ctx->poll_multi_file = false;
2925 } else if (!ctx->poll_multi_file) {
2926 struct io_kiocb *list_req;
2927
540e32a0 2928 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
d21ffe7e 2929 inflight_entry);
9adbd45d 2930 if (list_req->file != req->file)
def596e9
JA
2931 ctx->poll_multi_file = true;
2932 }
2933
2934 /*
2935 * For fast devices, IO may have already completed. If it has, add
2936 * it to the front so we find it first.
2937 */
65a6543d 2938 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2939 list_add(&req->inflight_entry, &ctx->iopoll_list);
def596e9 2940 else
d21ffe7e 2941 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
bdcd3eab 2942
2e9dbe90
XW
2943 /*
2944 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2945 * task context or in io worker task context. If the current task context is
2946 * sq thread, we don't need to check whether we should wake up the sq thread.
2947 */
2948 if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
534ca6d6
JA
2949 wq_has_sleeper(&ctx->sq_data->wait))
2950 wake_up(&ctx->sq_data->wait);
def596e9
JA
2951}
2952
9f13c35b
PB
2953static inline void io_state_file_put(struct io_submit_state *state)
2954{
02b23a9a
PB
2955 if (state->file_refs) {
2956 fput_many(state->file, state->file_refs);
2957 state->file_refs = 0;
2958 }
9a56a232
JA
2959}
2960
2961/*
2962 * Get as many references to a file as we have IOs left in this submission,
2963 * assuming most submissions are for one file, or at least that each file
2964 * has more than one submission.
2965 */
8da11c19 2966static struct file *__io_file_get(struct io_submit_state *state, int fd)
9a56a232
JA
2967{
2968 if (!state)
2969 return fget(fd);
2970
6e1271e6 2971 if (state->file_refs) {
9a56a232 2972 if (state->fd == fd) {
6e1271e6 2973 state->file_refs--;
9a56a232
JA
2974 return state->file;
2975 }
02b23a9a 2976 io_state_file_put(state);
9a56a232
JA
2977 }
2978 state->file = fget_many(fd, state->ios_left);
6e1271e6 2979 if (unlikely(!state->file))
9a56a232
JA
2980 return NULL;
2981
2982 state->fd = fd;
6e1271e6 2983 state->file_refs = state->ios_left - 1;
9a56a232
JA
2984 return state->file;
2985}
2986
4503b767
JA
2987static bool io_bdev_nowait(struct block_device *bdev)
2988{
9ba0d0c8 2989 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
2990}
2991
2b188cc1
JA
2992/*
2993 * If we tracked the file through the SCM inflight mechanism, we could support
2994 * any file. For now, just ensure that anything potentially problematic is done
2995 * inline.
2996 */
af197f50 2997static bool io_file_supports_async(struct file *file, int rw)
2b188cc1
JA
2998{
2999 umode_t mode = file_inode(file)->i_mode;
3000
4503b767 3001 if (S_ISBLK(mode)) {
4e7b5671
CH
3002 if (IS_ENABLED(CONFIG_BLOCK) &&
3003 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
3004 return true;
3005 return false;
3006 }
3007 if (S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1 3008 return true;
4503b767 3009 if (S_ISREG(mode)) {
4e7b5671
CH
3010 if (IS_ENABLED(CONFIG_BLOCK) &&
3011 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
3012 file->f_op != &io_uring_fops)
3013 return true;
3014 return false;
3015 }
2b188cc1 3016
c5b85625
JA
3017 /* any ->read/write should understand O_NONBLOCK */
3018 if (file->f_flags & O_NONBLOCK)
3019 return true;
3020
af197f50
JA
3021 if (!(file->f_mode & FMODE_NOWAIT))
3022 return false;
3023
3024 if (rw == READ)
3025 return file->f_op->read_iter != NULL;
3026
3027 return file->f_op->write_iter != NULL;
2b188cc1
JA
3028}
3029
a88fc400 3030static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 3031{
def596e9 3032 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 3033 struct kiocb *kiocb = &req->rw.kiocb;
75c668cd 3034 struct file *file = req->file;
09bb8394
JA
3035 unsigned ioprio;
3036 int ret;
2b188cc1 3037
75c668cd 3038 if (S_ISREG(file_inode(file)->i_mode))
491381ce
JA
3039 req->flags |= REQ_F_ISREG;
3040
2b188cc1 3041 kiocb->ki_pos = READ_ONCE(sqe->off);
75c668cd 3042 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
ba04291e 3043 req->flags |= REQ_F_CUR_POS;
75c668cd 3044 kiocb->ki_pos = file->f_pos;
ba04291e 3045 }
2b188cc1 3046 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
3047 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
3048 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
3049 if (unlikely(ret))
3050 return ret;
2b188cc1 3051
75c668cd
PB
3052 /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
3053 if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
3054 req->flags |= REQ_F_NOWAIT;
3055
2b188cc1
JA
3056 ioprio = READ_ONCE(sqe->ioprio);
3057 if (ioprio) {
3058 ret = ioprio_check_cap(ioprio);
3059 if (ret)
09bb8394 3060 return ret;
2b188cc1
JA
3061
3062 kiocb->ki_ioprio = ioprio;
3063 } else
3064 kiocb->ki_ioprio = get_current_ioprio();
3065
def596e9 3066 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
3067 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
3068 !kiocb->ki_filp->f_op->iopoll)
09bb8394 3069 return -EOPNOTSUPP;
2b188cc1 3070
def596e9
JA
3071 kiocb->ki_flags |= IOCB_HIPRI;
3072 kiocb->ki_complete = io_complete_rw_iopoll;
65a6543d 3073 req->iopoll_completed = 0;
def596e9 3074 } else {
09bb8394
JA
3075 if (kiocb->ki_flags & IOCB_HIPRI)
3076 return -EINVAL;
def596e9
JA
3077 kiocb->ki_complete = io_complete_rw;
3078 }
9adbd45d 3079
3529d8c2
JA
3080 req->rw.addr = READ_ONCE(sqe->addr);
3081 req->rw.len = READ_ONCE(sqe->len);
4f4eeba8 3082 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 3083 return 0;
2b188cc1
JA
3084}
3085
3086static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3087{
3088 switch (ret) {
3089 case -EIOCBQUEUED:
3090 break;
3091 case -ERESTARTSYS:
3092 case -ERESTARTNOINTR:
3093 case -ERESTARTNOHAND:
3094 case -ERESTART_RESTARTBLOCK:
3095 /*
3096 * We can't just restart the syscall, since previously
3097 * submitted sqes may already be in progress. Just fail this
3098 * IO with EINTR.
3099 */
3100 ret = -EINTR;
df561f66 3101 fallthrough;
2b188cc1
JA
3102 default:
3103 kiocb->ki_complete(kiocb, ret, 0);
3104 }
3105}
3106
a1d7c393 3107static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
889fca73 3108 unsigned int issue_flags)
ba816ad6 3109{
ba04291e 3110 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
e8c2bc1f 3111 struct io_async_rw *io = req->async_data;
ba04291e 3112
227c0c96 3113 /* add previously done IO, if any */
e8c2bc1f 3114 if (io && io->bytes_done > 0) {
227c0c96 3115 if (ret < 0)
e8c2bc1f 3116 ret = io->bytes_done;
227c0c96 3117 else
e8c2bc1f 3118 ret += io->bytes_done;
227c0c96
JA
3119 }
3120
ba04291e
JA
3121 if (req->flags & REQ_F_CUR_POS)
3122 req->file->f_pos = kiocb->ki_pos;
bcaec089 3123 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
889fca73 3124 __io_complete_rw(req, ret, 0, issue_flags);
ba816ad6
JA
3125 else
3126 io_rw_done(kiocb, ret);
3127}
3128
847595de 3129static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
edafccee 3130{
9adbd45d
JA
3131 struct io_ring_ctx *ctx = req->ctx;
3132 size_t len = req->rw.len;
edafccee 3133 struct io_mapped_ubuf *imu;
4be1c615 3134 u16 index, buf_index = req->buf_index;
edafccee
JA
3135 size_t offset;
3136 u64 buf_addr;
3137
edafccee
JA
3138 if (unlikely(buf_index >= ctx->nr_user_bufs))
3139 return -EFAULT;
edafccee
JA
3140 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3141 imu = &ctx->user_bufs[index];
9adbd45d 3142 buf_addr = req->rw.addr;
edafccee
JA
3143
3144 /* overflow */
3145 if (buf_addr + len < buf_addr)
3146 return -EFAULT;
3147 /* not inside the mapped region */
3148 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
3149 return -EFAULT;
3150
3151 /*
3152 * May not be the start of the buffer; set the size appropriately
3153 * and advance to the beginning.
3154 */
3155 offset = buf_addr - imu->ubuf;
3156 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
3157
3158 if (offset) {
3159 /*
3160 * Don't use iov_iter_advance() here, as it's really slow for
3161 * using the latter parts of a big fixed buffer - it iterates
3162 * over each segment manually. We can cheat a bit here, because
3163 * we know that:
3164 *
3165 * 1) it's a BVEC iter, we set it up
3166 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3167 * first and last bvec
3168 *
3169 * So just find our index, and adjust the iterator afterwards.
3170 * If the offset is within the first bvec (or is the whole first
3171 * bvec), just use iov_iter_advance(). This makes it easier
3172 * since we can just skip the first segment, which may not
3173 * be PAGE_SIZE aligned.
3174 */
3175 const struct bio_vec *bvec = imu->bvec;
3176
3177 if (offset <= bvec->bv_len) {
3178 iov_iter_advance(iter, offset);
3179 } else {
3180 unsigned long seg_skip;
3181
3182 /* skip first vec */
3183 offset -= bvec->bv_len;
3184 seg_skip = 1 + (offset >> PAGE_SHIFT);
3185
3186 iter->bvec = bvec + seg_skip;
3187 iter->nr_segs -= seg_skip;
99c79f66 3188 iter->count -= bvec->bv_len + offset;
bd11b3a3 3189 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
3190 }
3191 }
3192
847595de 3193 return 0;
edafccee
JA
3194}
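
The comment above works because every middle bvec is page sized, so the target segment and the intra-segment offset can be computed directly instead of walking segments one by one. A standalone sketch of that arithmetic, assuming 4 KiB pages and a made-up short first segment:

#include <stdio.h>

#define PG_SHIFT 12
#define PG_SIZE  (1ul << PG_SHIFT)

int main(void)
{
	unsigned long first_len = 0x600;	/* the first segment may be short */
	unsigned long offset = 0x4321;		/* byte offset into the buffer */

	if (offset <= first_len) {
		printf("stay in segment 0 at offset %#lx\n", offset);
	} else {
		unsigned long seg, in_seg;

		offset -= first_len;			/* skip the first segment */
		seg = 1 + (offset >> PG_SHIFT);		/* whole pages to skip */
		in_seg = offset & (PG_SIZE - 1);	/* remainder within that page */
		printf("land in segment %lu at offset %#lx\n", seg, in_seg);
	}
	return 0;
}
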
3195
bcda7baa
JA
3196static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3197{
3198 if (needs_lock)
3199 mutex_unlock(&ctx->uring_lock);
3200}
3201
3202static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3203{
3204 /*
3205 * "Normal" inline submissions always hold the uring_lock, since we
3206 * grab it from the system call. Same is true for the SQPOLL offload.
3207 * The only exception is when we've detached the request and issue it
3208 * from an async worker thread; grab the lock for that case.
3209 */
3210 if (needs_lock)
3211 mutex_lock(&ctx->uring_lock);
3212}
3213
3214static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3215 int bgid, struct io_buffer *kbuf,
3216 bool needs_lock)
3217{
3218 struct io_buffer *head;
3219
3220 if (req->flags & REQ_F_BUFFER_SELECTED)
3221 return kbuf;
3222
3223 io_ring_submit_lock(req->ctx, needs_lock);
3224
3225 lockdep_assert_held(&req->ctx->uring_lock);
3226
3227 head = idr_find(&req->ctx->io_buffer_idr, bgid);
3228 if (head) {
3229 if (!list_empty(&head->list)) {
3230 kbuf = list_last_entry(&head->list, struct io_buffer,
3231 list);
3232 list_del(&kbuf->list);
3233 } else {
3234 kbuf = head;
3235 idr_remove(&req->ctx->io_buffer_idr, bgid);
3236 }
3237 if (*len > kbuf->len)
3238 *len = kbuf->len;
3239 } else {
3240 kbuf = ERR_PTR(-ENOBUFS);
3241 }
3242
3243 io_ring_submit_unlock(req->ctx, needs_lock);
3244
3245 return kbuf;
3246}
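
The buffer group that io_buffer_select() searches is populated from userspace with IORING_OP_PROVIDE_BUFFERS. A hedged liburing sketch follows, assuming a liburing version that has io_uring_prep_provide_buffers(); the group and buffer IDs are arbitrary.

#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *pool = malloc(4 * 4096);		/* backing for 4 buffers of 4 KiB */

	if (!pool || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	sqe = io_uring_get_sqe(&ring);
	/* add 4 buffers of 4096 bytes to group 1, starting at buffer id 0 */
	io_uring_prep_provide_buffers(sqe, pool, 4096, 4, 1, 0);
	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("provide_buffers result: %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	free(pool);
	return 0;
}
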
3247
4d954c25
JA
3248static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3249 bool needs_lock)
3250{
3251 struct io_buffer *kbuf;
4f4eeba8 3252 u16 bgid;
4d954c25
JA
3253
3254 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
4f4eeba8 3255 bgid = req->buf_index;
4d954c25
JA
3256 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
3257 if (IS_ERR(kbuf))
3258 return kbuf;
3259 req->rw.addr = (u64) (unsigned long) kbuf;
3260 req->flags |= REQ_F_BUFFER_SELECTED;
3261 return u64_to_user_ptr(kbuf->addr);
3262}
3263
3264#ifdef CONFIG_COMPAT
3265static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3266 bool needs_lock)
3267{
3268 struct compat_iovec __user *uiov;
3269 compat_ssize_t clen;
3270 void __user *buf;
3271 ssize_t len;
3272
3273 uiov = u64_to_user_ptr(req->rw.addr);
3274 if (!access_ok(uiov, sizeof(*uiov)))
3275 return -EFAULT;
3276 if (__get_user(clen, &uiov->iov_len))
3277 return -EFAULT;
3278 if (clen < 0)
3279 return -EINVAL;
3280
3281 len = clen;
3282 buf = io_rw_buffer_select(req, &len, needs_lock);
3283 if (IS_ERR(buf))
3284 return PTR_ERR(buf);
3285 iov[0].iov_base = buf;
3286 iov[0].iov_len = (compat_size_t) len;
3287 return 0;
3288}
3289#endif
3290
3291static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3292 bool needs_lock)
3293{
3294 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3295 void __user *buf;
3296 ssize_t len;
3297
3298 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3299 return -EFAULT;
3300
3301 len = iov[0].iov_len;
3302 if (len < 0)
3303 return -EINVAL;
3304 buf = io_rw_buffer_select(req, &len, needs_lock);
3305 if (IS_ERR(buf))
3306 return PTR_ERR(buf);
3307 iov[0].iov_base = buf;
3308 iov[0].iov_len = len;
3309 return 0;
3310}
3311
3312static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3313 bool needs_lock)
3314{
dddb3e26
JA
3315 if (req->flags & REQ_F_BUFFER_SELECTED) {
3316 struct io_buffer *kbuf;
3317
3318 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3319 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3320 iov[0].iov_len = kbuf->len;
4d954c25 3321 return 0;
dddb3e26 3322 }
dd201662 3323 if (req->rw.len != 1)
4d954c25
JA
3324 return -EINVAL;
3325
3326#ifdef CONFIG_COMPAT
3327 if (req->ctx->compat)
3328 return io_compat_import(req, iov, needs_lock);
3329#endif
3330
3331 return __io_iov_buffer_select(req, iov, needs_lock);
3332}
3333
847595de
PB
3334static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3335 struct iov_iter *iter, bool needs_lock)
2b188cc1 3336{
9adbd45d
JA
3337 void __user *buf = u64_to_user_ptr(req->rw.addr);
3338 size_t sqe_len = req->rw.len;
847595de 3339 u8 opcode = req->opcode;
4d954c25 3340 ssize_t ret;
edafccee 3341
7d009165 3342 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 3343 *iovec = NULL;
9adbd45d 3344 return io_import_fixed(req, rw, iter);
edafccee 3345 }
2b188cc1 3346
bcda7baa 3347 /* buffer index only valid with fixed read/write, or buffer select */
4f4eeba8 3348 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
3349 return -EINVAL;
3350
3a6820f2 3351 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3352 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25 3353 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
867a23ea 3354 if (IS_ERR(buf))
4d954c25 3355 return PTR_ERR(buf);
3f9d6441 3356 req->rw.len = sqe_len;
bcda7baa
JA
3357 }
3358
3a6820f2
JA
3359 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3360 *iovec = NULL;
10fc72e4 3361 return ret;
3a6820f2
JA
3362 }
3363
4d954c25
JA
3364 if (req->flags & REQ_F_BUFFER_SELECT) {
3365 ret = io_iov_buffer_select(req, *iovec, needs_lock);
847595de
PB
3366 if (!ret)
3367 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
4d954c25
JA
3368 *iovec = NULL;
3369 return ret;
3370 }
3371
89cd35c5
CH
3372 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3373 req->ctx->compat);
2b188cc1
JA
3374}
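
For the IORING_OP_READV path handled above, userspace simply hands the kernel an iovec array. A hedged liburing example (assumes /etc/hostname exists and liburing is linked with -luring; error handling is minimal):

#include <liburing.h>
#include <fcntl.h>
#include <sys/uio.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_readv(sqe, fd, &iov, 1, 0);	/* one iovec, offset 0 */
	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("readv returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
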
3375
0fef9483
JA
3376static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3377{
5b09e37e 3378 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3379}
3380
31b51510 3381/*
32960613
JA
3382 * For files that don't have ->read_iter() and ->write_iter(), handle them
3383 * by looping over ->read() or ->write() manually.
31b51510 3384 */
4017eb91 3385static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3386{
4017eb91
JA
3387 struct kiocb *kiocb = &req->rw.kiocb;
3388 struct file *file = req->file;
32960613
JA
3389 ssize_t ret = 0;
3390
3391 /*
3392 * Don't support polled IO through this interface, and we can't
3393 * support non-blocking either. For the latter, this just causes
3394 * the kiocb to be handled from an async context.
3395 */
3396 if (kiocb->ki_flags & IOCB_HIPRI)
3397 return -EOPNOTSUPP;
3398 if (kiocb->ki_flags & IOCB_NOWAIT)
3399 return -EAGAIN;
3400
3401 while (iov_iter_count(iter)) {
311ae9e1 3402 struct iovec iovec;
32960613
JA
3403 ssize_t nr;
3404
311ae9e1
PB
3405 if (!iov_iter_is_bvec(iter)) {
3406 iovec = iov_iter_iovec(iter);
3407 } else {
4017eb91
JA
3408 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3409 iovec.iov_len = req->rw.len;
311ae9e1
PB
3410 }
3411
32960613
JA
3412 if (rw == READ) {
3413 nr = file->f_op->read(file, iovec.iov_base,
0fef9483 3414 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3415 } else {
3416 nr = file->f_op->write(file, iovec.iov_base,
0fef9483 3417 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3418 }
3419
3420 if (nr < 0) {
3421 if (!ret)
3422 ret = nr;
3423 break;
3424 }
3425 ret += nr;
3426 if (nr != iovec.iov_len)
3427 break;
4017eb91
JA
3428 req->rw.len -= nr;
3429 req->rw.addr += nr;
32960613
JA
3430 iov_iter_advance(iter, nr);
3431 }
3432
3433 return ret;
3434}
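
loop_rw_iter() drives plain ->read()/->write() until the iterator is drained or a short transfer stops it. A userspace analogue using read(2) on stdin, with the same stop-on-short-transfer behaviour; the helper name read_full() is made up for the sketch.

#include <unistd.h>
#include <stdio.h>

/* illustrative helper: keep issuing read() until 'len' bytes arrive,
 * stopping on error, EOF, or a short read, like loop_rw_iter() does */
static ssize_t read_full(int fd, char *buf, size_t len)
{
	ssize_t done = 0;

	while ((size_t)done < len) {
		size_t want = len - (size_t)done;
		ssize_t nr = read(fd, buf + done, want);

		if (nr < 0)
			return done ? done : nr;	/* keep partial progress */
		done += nr;
		if ((size_t)nr != want)
			break;				/* EOF or short read */
	}
	return done;
}

int main(void)
{
	char buf[64];

	printf("read %zd bytes from stdin\n", read_full(0, buf, sizeof(buf)));
	return 0;
}
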
3435
ff6165b2
JA
3436static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3437 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3438{
e8c2bc1f 3439 struct io_async_rw *rw = req->async_data;
b64e3444 3440
ff6165b2 3441 memcpy(&rw->iter, iter, sizeof(*iter));
afb87658 3442 rw->free_iovec = iovec;
227c0c96 3443 rw->bytes_done = 0;
ff6165b2 3444 /* can only be fixed buffers, no need to do anything */
9c3a205c 3445 if (iov_iter_is_bvec(iter))
ff6165b2 3446 return;
b64e3444 3447 if (!iovec) {
ff6165b2
JA
3448 unsigned iov_off = 0;
3449
3450 rw->iter.iov = rw->fast_iov;
3451 if (iter->iov != fast_iov) {
3452 iov_off = iter->iov - fast_iov;
3453 rw->iter.iov += iov_off;
3454 }
3455 if (rw->fast_iov != fast_iov)
3456 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
45097dae 3457 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3458 } else {
3459 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3460 }
3461}
3462
e8c2bc1f 3463static inline int __io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3464{
e8c2bc1f
JA
3465 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3466 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3467 return req->async_data == NULL;
3d9932a8
XW
3468}
3469
e8c2bc1f 3470static int io_alloc_async_data(struct io_kiocb *req)
f67676d1 3471{
e8c2bc1f 3472 if (!io_op_defs[req->opcode].needs_async_data)
d3656344 3473 return 0;
3d9932a8 3474
e8c2bc1f 3475 return __io_alloc_async_data(req);
b7bb4f7d
JA
3476}
3477
ff6165b2
JA
3478static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3479 const struct iovec *fast_iov,
227c0c96 3480 struct iov_iter *iter, bool force)
b7bb4f7d 3481{
e8c2bc1f 3482 if (!force && !io_op_defs[req->opcode].needs_async_data)
74566df3 3483 return 0;
e8c2bc1f 3484 if (!req->async_data) {
6bf985dc
PB
3485 if (__io_alloc_async_data(req)) {
3486 kfree(iovec);
5d204bcf 3487 return -ENOMEM;
6bf985dc 3488 }
b7bb4f7d 3489
ff6165b2 3490 io_req_map_rw(req, iovec, fast_iov, iter);
5d204bcf 3491 }
b7bb4f7d 3492 return 0;
f67676d1
JA
3493}
3494
73debe68 3495static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3496{
e8c2bc1f 3497 struct io_async_rw *iorw = req->async_data;
f4bff104 3498 struct iovec *iov = iorw->fast_iov;
847595de 3499 int ret;
c3e330a4 3500
2846c481 3501 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
c3e330a4
PB
3502 if (unlikely(ret < 0))
3503 return ret;
3504
ab0b196c
PB
3505 iorw->bytes_done = 0;
3506 iorw->free_iovec = iov;
3507 if (iov)
3508 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
3509 return 0;
3510}
3511
73debe68 3512static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3513{
3529d8c2
JA
3514 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3515 return -EBADF;
93642ef8 3516 return io_prep_rw(req, sqe);
f67676d1
JA
3517}
3518
c1dd91d1
JA
3519/*
3520 * This is our waitqueue callback handler, registered through lock_page_async()
3521 * when we initially tried to do the IO with the iocb and armed our waitqueue.
3522 * This gets called when the page is unlocked, and we generally expect that to
3523 * happen when the page IO is completed and the page is now uptodate. This will
3524 * queue a task_work based retry of the operation, attempting to copy the data
3525 * again. If the latter fails because the page was NOT uptodate, then we will
3526 * do a thread based blocking retry of the operation. That's the unexpected
3527 * slow path.
3528 */
bcf5a063
JA
3529static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3530 int sync, void *arg)
3531{
3532 struct wait_page_queue *wpq;
3533 struct io_kiocb *req = wait->private;
bcf5a063 3534 struct wait_page_key *key = arg;
bcf5a063
JA
3535
3536 wpq = container_of(wait, struct wait_page_queue, wait);
3537
cdc8fcb4
LT
3538 if (!wake_page_match(wpq, key))
3539 return 0;
3540
c8d317aa 3541 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063
JA
3542 list_del_init(&wait->entry);
3543
bcf5a063
JA
3544 /* submit ref gets dropped, acquire a new one */
3545 refcount_inc(&req->refs);
921b9054 3546 io_req_task_queue(req);
bcf5a063
JA
3547 return 1;
3548}
3549
c1dd91d1
JA
3550/*
3551 * This controls whether a given IO request should be armed for async page
3552 * based retry. If we return false here, the request is handed to the async
3553 * worker threads for retry. If we're doing buffered reads on a regular file,
3554 * we prepare a private wait_page_queue entry and retry the operation. This
3555 * will either succeed because the page is now uptodate and unlocked, or it
3556 * will register a callback when the page is unlocked at IO completion. Through
3557 * that callback, io_uring uses task_work to setup a retry of the operation.
3558 * That retry will attempt the buffered read again. The retry will generally
3559 * succeed, or in rare cases where it fails, we then fall back to using the
3560 * async worker threads for a blocking retry.
3561 */
227c0c96 3562static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3563{
e8c2bc1f
JA
3564 struct io_async_rw *rw = req->async_data;
3565 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3566 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3567
bcf5a063
JA
3568 /* never retry for NOWAIT, we just complete with -EAGAIN */
3569 if (req->flags & REQ_F_NOWAIT)
3570 return false;
f67676d1 3571
227c0c96 3572 /* Only for buffered IO */
3b2a4439 3573 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3574 return false;
3b2a4439 3575
bcf5a063
JA
3576 /*
3577 * just use poll if we can, and don't attempt if the fs doesn't
3578 * support callback-based unlocks
3579 */
3580 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3581 return false;
f67676d1 3582
3b2a4439
JA
3583 wait->wait.func = io_async_buf_func;
3584 wait->wait.private = req;
3585 wait->wait.flags = 0;
3586 INIT_LIST_HEAD(&wait->wait.entry);
3587 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3588 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3589 kiocb->ki_waitq = wait;
3b2a4439 3590 return true;
bcf5a063
JA
3591}
3592
3593static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3594{
3595 if (req->file->f_op->read_iter)
3596 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3597 else if (req->file->f_op->read)
4017eb91 3598 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3599 else
3600 return -EINVAL;
f67676d1
JA
3601}
3602
889fca73 3603static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3604{
3605 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3606 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3607 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3608 struct io_async_rw *rw = req->async_data;
227c0c96 3609 ssize_t io_size, ret, ret2;
45d189c6 3610 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ff6165b2 3611
2846c481 3612 if (rw) {
e8c2bc1f 3613 iter = &rw->iter;
2846c481
PB
3614 iovec = NULL;
3615 } else {
3616 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3617 if (ret < 0)
3618 return ret;
3619 }
632546c4 3620 io_size = iov_iter_count(iter);
fa15bafb 3621 req->result = io_size;
2b188cc1 3622
fd6c2e4c
JA
3623 /* Ensure we clear previously set non-block flag */
3624 if (!force_nonblock)
29de5f6a 3625 kiocb->ki_flags &= ~IOCB_NOWAIT;
a88fc400
PB
3626 else
3627 kiocb->ki_flags |= IOCB_NOWAIT;
3628
24c74678 3629 /* If the file doesn't support async, just async punt */
6713e7a6
PB
3630 if (force_nonblock && !io_file_supports_async(req->file, READ)) {
3631 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc 3632 return ret ?: -EAGAIN;
6713e7a6 3633 }
9e645e11 3634
632546c4 3635 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
5ea5dd45
PB
3636 if (unlikely(ret)) {
3637 kfree(iovec);
3638 return ret;
3639 }
2b188cc1 3640
227c0c96 3641 ret = io_iter_do_read(req, iter);
32960613 3642
57cd657b 3643 if (ret == -EIOCBQUEUED) {
fe1cdd55 3644 goto out_free;
227c0c96 3645 } else if (ret == -EAGAIN) {
eefdf30f
JA
3646 /* IOPOLL retry should happen for io-wq threads */
3647 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3648 goto done;
75c668cd
PB
3649 /* no retry on NONBLOCK nor RWF_NOWAIT */
3650 if (req->flags & REQ_F_NOWAIT)
355afaeb 3651 goto done;
84216315 3652 /* some cases will consume bytes even on error returns */
632546c4 3653 iov_iter_revert(iter, io_size - iov_iter_count(iter));
f38c7e3a 3654 ret = 0;
7335e3bf 3655 } else if (ret <= 0 || ret == io_size || !force_nonblock ||
75c668cd 3656 (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
7335e3bf 3657 /* read all, failed, already did sync or don't want to retry */
00d23d51 3658 goto done;
227c0c96
JA
3659 }
3660
227c0c96 3661 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc
PB
3662 if (ret2)
3663 return ret2;
3664
fe1cdd55 3665 iovec = NULL;
e8c2bc1f 3666 rw = req->async_data;
227c0c96 3667 /* now use our persistent iterator, if we aren't already */
e8c2bc1f 3668 iter = &rw->iter;
227c0c96 3669
b23df91b
PB
3670 do {
3671 io_size -= ret;
3672 rw->bytes_done += ret;
3673 /* if we can retry, do so with the callbacks armed */
3674 if (!io_rw_should_retry(req)) {
3675 kiocb->ki_flags &= ~IOCB_WAITQ;
3676 return -EAGAIN;
3677 }
3678
3679 /*
3680 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3681 * we get -EIOCBQUEUED, then we'll get a notification when the
3682 * desired page gets unlocked. We can also get a partial read
3683 * here, and if we do, then just retry at the new offset.
3684 */
3685 ret = io_iter_do_read(req, iter);
3686 if (ret == -EIOCBQUEUED)
3687 return 0;
227c0c96 3688 /* we got some bytes, but not all. retry. */
b23df91b 3689 } while (ret > 0 && ret < io_size);
227c0c96 3690done:
889fca73 3691 kiocb_done(kiocb, ret, issue_flags);
fe1cdd55
PB
3692out_free:
3693 /* it's faster to check for NULL here than to delegate to kfree */
3694 if (iovec)
3695 kfree(iovec);
5ea5dd45 3696 return 0;
2b188cc1
JA
3697}
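
/*
 * Illustrative userspace sketch, not part of this kernel file: a minimal
 * buffered read submitted through the ring, using liburing's helpers
 * (io_uring_queue_init/io_uring_prep_readv/io_uring_wait_cqe). Error
 * handling is pared down and the path argument is a placeholder.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/uio.h>

int read_example(const char *path)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        char buf[4096];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        int fd, ret;

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;
        if (io_uring_queue_init(8, &ring, 0) < 0) {
                close(fd);
                return -1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_readv(sqe, fd, &iov, 1, 0);       /* serviced by io_read() above */
        io_uring_submit(&ring);

        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret) {
                printf("read returned %d\n", cqe->res);  /* byte count or -errno */
                io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        close(fd);
        return 0;
}
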
3698
73debe68 3699static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3700{
3529d8c2
JA
3701 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3702 return -EBADF;
93642ef8 3703 return io_prep_rw(req, sqe);
f67676d1
JA
3704}
3705
889fca73 3706static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3707{
3708 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3709 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3710 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3711 struct io_async_rw *rw = req->async_data;
fa15bafb 3712 ssize_t ret, ret2, io_size;
45d189c6 3713 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
2b188cc1 3714
2846c481 3715 if (rw) {
e8c2bc1f 3716 iter = &rw->iter;
2846c481
PB
3717 iovec = NULL;
3718 } else {
3719 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3720 if (ret < 0)
3721 return ret;
3722 }
632546c4 3723 io_size = iov_iter_count(iter);
fa15bafb 3724 req->result = io_size;
2b188cc1 3725
fd6c2e4c
JA
3726 /* Ensure we clear previously set non-block flag */
3727 if (!force_nonblock)
a88fc400
PB
3728 kiocb->ki_flags &= ~IOCB_NOWAIT;
3729 else
3730 kiocb->ki_flags |= IOCB_NOWAIT;
fd6c2e4c 3731
24c74678 3732 /* If the file doesn't support async, just async punt */
af197f50 3733 if (force_nonblock && !io_file_supports_async(req->file, WRITE))
f67676d1 3734 goto copy_iov;
31b51510 3735
10d59345
JA
3736 /* file path doesn't support NOWAIT for non-direct_IO */
3737 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3738 (req->flags & REQ_F_ISREG))
f67676d1 3739 goto copy_iov;
31b51510 3740
632546c4 3741 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
fa15bafb
PB
3742 if (unlikely(ret))
3743 goto out_free;
4ed734b0 3744
fa15bafb
PB
3745 /*
3746 * Open-code file_start_write here to grab freeze protection,
3747 * which will be released by another thread in
3748 * io_complete_rw(). Fool lockdep by telling it the lock got
3749 * released so that it doesn't complain about the held lock when
3750 * we return to userspace.
3751 */
3752 if (req->flags & REQ_F_ISREG) {
8a3c84b6 3753 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
3754 __sb_writers_release(file_inode(req->file)->i_sb,
3755 SB_FREEZE_WRITE);
3756 }
3757 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 3758
fa15bafb 3759 if (req->file->f_op->write_iter)
ff6165b2 3760 ret2 = call_write_iter(req->file, kiocb, iter);
2dd2111d 3761 else if (req->file->f_op->write)
4017eb91 3762 ret2 = loop_rw_iter(WRITE, req, iter);
2dd2111d
GH
3763 else
3764 ret2 = -EINVAL;
4ed734b0 3765
fa15bafb
PB
3766 /*
3767 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3768 * retry them without IOCB_NOWAIT.
3769 */
3770 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3771 ret2 = -EAGAIN;
75c668cd
PB
3772 /* no retry on NONBLOCK nor RWF_NOWAIT */
3773 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 3774 goto done;
fa15bafb 3775 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f
JA
3776 /* IOPOLL retry should happen for io-wq threads */
3777 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3778 goto copy_iov;
355afaeb 3779done:
889fca73 3780 kiocb_done(kiocb, ret2, issue_flags);
fa15bafb 3781 } else {
f67676d1 3782copy_iov:
84216315 3783 /* some cases will consume bytes even on error returns */
632546c4 3784 iov_iter_revert(iter, io_size - iov_iter_count(iter));
227c0c96 3785 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
6bf985dc 3786 return ret ?: -EAGAIN;
2b188cc1 3787 }
31b51510 3788out_free:
f261c168 3789 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 3790 if (iovec)
6f2cc166 3791 kfree(iovec);
2b188cc1
JA
3792 return ret;
3793}
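
/*
 * Illustrative userspace sketch, not part of this kernel file: a write
 * followed by a linked fsync, so the fsync is only started once the write
 * has completed. Assumes an already-initialised struct io_uring and that
 * buf stays valid until the write completes.
 */
#include <liburing.h>

static int write_then_fsync(struct io_uring *ring, int fd,
                            const void *buf, unsigned int len)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, fd, buf, len, 0);      /* handled by io_write() above */
        sqe->flags |= IOSQE_IO_LINK;                    /* order the fsync after the write */

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);

        return io_uring_submit(ring);                   /* two SQEs, one syscall */
}
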
3794
80a261fd
JA
3795static int io_renameat_prep(struct io_kiocb *req,
3796 const struct io_uring_sqe *sqe)
3797{
3798 struct io_rename *ren = &req->rename;
3799 const char __user *oldf, *newf;
3800
3801 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3802 return -EBADF;
3803
3804 ren->old_dfd = READ_ONCE(sqe->fd);
3805 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3806 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3807 ren->new_dfd = READ_ONCE(sqe->len);
3808 ren->flags = READ_ONCE(sqe->rename_flags);
3809
3810 ren->oldpath = getname(oldf);
3811 if (IS_ERR(ren->oldpath))
3812 return PTR_ERR(ren->oldpath);
3813
3814 ren->newpath = getname(newf);
3815 if (IS_ERR(ren->newpath)) {
3816 putname(ren->oldpath);
3817 return PTR_ERR(ren->newpath);
3818 }
3819
3820 req->flags |= REQ_F_NEED_CLEANUP;
3821 return 0;
3822}
3823
45d189c6 3824static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
3825{
3826 struct io_rename *ren = &req->rename;
3827 int ret;
3828
45d189c6 3829 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
3830 return -EAGAIN;
3831
3832 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3833 ren->newpath, ren->flags);
3834
3835 req->flags &= ~REQ_F_NEED_CLEANUP;
3836 if (ret < 0)
3837 req_set_fail_links(req);
3838 io_req_complete(req, ret);
3839 return 0;
3840}
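
/*
 * Illustrative userspace sketch, not part of this kernel file: queueing an
 * IORING_OP_RENAMEAT request with liburing (2.0 or newer is assumed for the
 * prep helper). The helper fills in the sqe fields that io_renameat_prep()
 * above reads back out.
 */
#include <liburing.h>
#include <fcntl.h>              /* AT_FDCWD */

static int rename_async(struct io_uring *ring, const char *from, const char *to)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_renameat(sqe, AT_FDCWD, from, AT_FDCWD, to, 0);
        return io_uring_submit(ring);
}
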
3841
14a1143b
JA
3842static int io_unlinkat_prep(struct io_kiocb *req,
3843 const struct io_uring_sqe *sqe)
3844{
3845 struct io_unlink *un = &req->unlink;
3846 const char __user *fname;
3847
3848 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3849 return -EBADF;
3850
3851 un->dfd = READ_ONCE(sqe->fd);
3852
3853 un->flags = READ_ONCE(sqe->unlink_flags);
3854 if (un->flags & ~AT_REMOVEDIR)
3855 return -EINVAL;
3856
3857 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3858 un->filename = getname(fname);
3859 if (IS_ERR(un->filename))
3860 return PTR_ERR(un->filename);
3861
3862 req->flags |= REQ_F_NEED_CLEANUP;
3863 return 0;
3864}
3865
45d189c6 3866static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
3867{
3868 struct io_unlink *un = &req->unlink;
3869 int ret;
3870
45d189c6 3871 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
3872 return -EAGAIN;
3873
3874 if (un->flags & AT_REMOVEDIR)
3875 ret = do_rmdir(un->dfd, un->filename);
3876 else
3877 ret = do_unlinkat(un->dfd, un->filename);
3878
3879 req->flags &= ~REQ_F_NEED_CLEANUP;
3880 if (ret < 0)
3881 req_set_fail_links(req);
3882 io_req_complete(req, ret);
3883 return 0;
3884}
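
/*
 * Illustrative userspace sketch, not part of this kernel file: unlink (or
 * rmdir, via AT_REMOVEDIR) through the ring. Assumes an initialised ring and
 * liburing's io_uring_prep_unlinkat() helper.
 */
#include <liburing.h>
#include <fcntl.h>              /* AT_FDCWD, AT_REMOVEDIR */
#include <stdbool.h>

static int unlink_async(struct io_uring *ring, const char *path, bool rmdir)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_unlinkat(sqe, AT_FDCWD, path, rmdir ? AT_REMOVEDIR : 0);
        return io_uring_submit(ring);
}
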
3885
36f4fa68
JA
3886static int io_shutdown_prep(struct io_kiocb *req,
3887 const struct io_uring_sqe *sqe)
3888{
3889#if defined(CONFIG_NET)
3890 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3891 return -EINVAL;
3892 if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3893 sqe->buf_index)
3894 return -EINVAL;
3895
3896 req->shutdown.how = READ_ONCE(sqe->len);
3897 return 0;
3898#else
3899 return -EOPNOTSUPP;
3900#endif
3901}
3902
45d189c6 3903static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
3904{
3905#if defined(CONFIG_NET)
3906 struct socket *sock;
3907 int ret;
3908
45d189c6 3909 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
3910 return -EAGAIN;
3911
48aba79b 3912 sock = sock_from_file(req->file);
36f4fa68 3913 if (unlikely(!sock))
48aba79b 3914 return -ENOTSOCK;
36f4fa68
JA
3915
3916 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d
JA
3917 if (ret < 0)
3918 req_set_fail_links(req);
36f4fa68
JA
3919 io_req_complete(req, ret);
3920 return 0;
3921#else
3922 return -EOPNOTSUPP;
3923#endif
3924}
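
/*
 * Illustrative userspace sketch, not part of this kernel file: asynchronous
 * socket shutdown. Assumes an initialised ring and liburing's
 * io_uring_prep_shutdown() helper; SHUT_RDWR is just an example value.
 */
#include <liburing.h>
#include <sys/socket.h>

static int shutdown_async(struct io_uring *ring, int sockfd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_shutdown(sqe, sockfd, SHUT_RDWR);
        return io_uring_submit(ring);
}
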
3925
f2a8d5c7
PB
3926static int __io_splice_prep(struct io_kiocb *req,
3927 const struct io_uring_sqe *sqe)
7d67af2c
PB
3928{
3929 struct io_splice* sp = &req->splice;
3930 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 3931
3232dd02
PB
3932 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3933 return -EINVAL;
7d67af2c
PB
3934
3935 sp->file_in = NULL;
7d67af2c
PB
3936 sp->len = READ_ONCE(sqe->len);
3937 sp->flags = READ_ONCE(sqe->splice_flags);
3938
3939 if (unlikely(sp->flags & ~valid_flags))
3940 return -EINVAL;
3941
8371adf5
PB
3942 sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3943 (sp->flags & SPLICE_F_FD_IN_FIXED));
3944 if (!sp->file_in)
3945 return -EBADF;
7d67af2c
PB
3946 req->flags |= REQ_F_NEED_CLEANUP;
3947
7cdaf587
XW
3948 if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3949 /*
3950 * Splice operations will be punted async, and here we need to
3951 * modify io_wq_work.flags, so initialize io_wq_work first.
3952 */
3953 io_req_init_async(req);
7d67af2c 3954 req->work.flags |= IO_WQ_WORK_UNBOUND;
7cdaf587 3955 }
7d67af2c
PB
3956
3957 return 0;
3958}
3959
f2a8d5c7
PB
3960static int io_tee_prep(struct io_kiocb *req,
3961 const struct io_uring_sqe *sqe)
3962{
3963 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3964 return -EINVAL;
3965 return __io_splice_prep(req, sqe);
3966}
3967
45d189c6 3968static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
3969{
3970 struct io_splice *sp = &req->splice;
3971 struct file *in = sp->file_in;
3972 struct file *out = sp->file_out;
3973 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3974 long ret = 0;
3975
45d189c6 3976 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7
PB
3977 return -EAGAIN;
3978 if (sp->len)
3979 ret = do_tee(in, out, sp->len, flags);
3980
3981 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3982 req->flags &= ~REQ_F_NEED_CLEANUP;
3983
f2a8d5c7
PB
3984 if (ret != sp->len)
3985 req_set_fail_links(req);
e1e16097 3986 io_req_complete(req, ret);
f2a8d5c7
PB
3987 return 0;
3988}
3989
3990static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3991{
3992 struct io_splice* sp = &req->splice;
3993
3994 sp->off_in = READ_ONCE(sqe->splice_off_in);
3995 sp->off_out = READ_ONCE(sqe->off);
3996 return __io_splice_prep(req, sqe);
3997}
3998
45d189c6 3999static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
4000{
4001 struct io_splice *sp = &req->splice;
4002 struct file *in = sp->file_in;
4003 struct file *out = sp->file_out;
4004 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4005 loff_t *poff_in, *poff_out;
c9687426 4006 long ret = 0;
7d67af2c 4007
45d189c6 4008 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 4009 return -EAGAIN;
7d67af2c
PB
4010
4011 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4012 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 4013
948a7749 4014 if (sp->len)
c9687426 4015 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c
PB
4016
4017 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
4018 req->flags &= ~REQ_F_NEED_CLEANUP;
4019
7d67af2c
PB
4020 if (ret != sp->len)
4021 req_set_fail_links(req);
e1e16097 4022 io_req_complete(req, ret);
7d67af2c
PB
4023 return 0;
4024}
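
/*
 * Illustrative userspace sketch, not part of this kernel file: splice between
 * two file descriptors through the ring. Passing -1 for the offsets means
 * "use the current file position", matching the off_in/off_out == -1 handling
 * in io_splice() above. Assumes an initialised ring.
 */
#include <liburing.h>

static int splice_async(struct io_uring *ring, int fd_in, int fd_out,
                        unsigned int len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_splice(sqe, fd_in, -1, fd_out, -1, len, 0);
        return io_uring_submit(ring);
}
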
4025
2b188cc1
JA
4026/*
4027 * IORING_OP_NOP just posts a completion event, nothing else.
4028 */
889fca73 4029static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
4030{
4031 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 4032
def596e9
JA
4033 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4034 return -EINVAL;
4035
889fca73 4036 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
4037 return 0;
4038}
4039
1155c76a 4040static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 4041{
6b06314c 4042 struct io_ring_ctx *ctx = req->ctx;
c992fe29 4043
09bb8394
JA
4044 if (!req->file)
4045 return -EBADF;
c992fe29 4046
6b06314c 4047 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 4048 return -EINVAL;
edafccee 4049 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
4050 return -EINVAL;
4051
8ed8d3c3
JA
4052 req->sync.flags = READ_ONCE(sqe->fsync_flags);
4053 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4054 return -EINVAL;
4055
4056 req->sync.off = READ_ONCE(sqe->off);
4057 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
4058 return 0;
4059}
4060
45d189c6 4061static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4062{
8ed8d3c3 4063 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
4064 int ret;
4065
ac45abc0 4066 /* fsync always requires a blocking context */
45d189c6 4067 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4068 return -EAGAIN;
4069
9adbd45d 4070 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
4071 end > 0 ? end : LLONG_MAX,
4072 req->sync.flags & IORING_FSYNC_DATASYNC);
4073 if (ret < 0)
4074 req_set_fail_links(req);
e1e16097 4075 io_req_complete(req, ret);
c992fe29
CH
4076 return 0;
4077}
4078
d63d1b5e
JA
4079static int io_fallocate_prep(struct io_kiocb *req,
4080 const struct io_uring_sqe *sqe)
4081{
4082 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
4083 return -EINVAL;
3232dd02
PB
4084 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4085 return -EINVAL;
d63d1b5e
JA
4086
4087 req->sync.off = READ_ONCE(sqe->off);
4088 req->sync.len = READ_ONCE(sqe->addr);
4089 req->sync.mode = READ_ONCE(sqe->len);
4090 return 0;
4091}
4092
45d189c6 4093static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 4094{
ac45abc0
PB
4095 int ret;
4096
d63d1b5e 4097 /* fallocate always requires a blocking context */
45d189c6 4098 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 4099 return -EAGAIN;
ac45abc0
PB
4100 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4101 req->sync.len);
ac45abc0
PB
4102 if (ret < 0)
4103 req_set_fail_links(req);
e1e16097 4104 io_req_complete(req, ret);
5d17b4a4
JA
4105 return 0;
4106}
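
/*
 * Illustrative userspace sketch, not part of this kernel file: preallocating
 * space through the ring. fallocate is always executed from a blocking
 * context, so the request is simply punted to the async workers. Assumes an
 * initialised ring and liburing's io_uring_prep_fallocate() helper.
 */
#include <liburing.h>
#include <sys/types.h>

static int preallocate_async(struct io_uring *ring, int fd, off_t len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_fallocate(sqe, fd, 0, 0, len);    /* mode 0, offset 0 */
        return io_uring_submit(ring);
}
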
4107
ec65fea5 4108static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 4109{
f8748881 4110 const char __user *fname;
15b71abe 4111 int ret;
b7bb4f7d 4112
ec65fea5 4113 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 4114 return -EINVAL;
ec65fea5 4115 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 4116 return -EBADF;
03b1230c 4117
ec65fea5
PB
4118 /* open.how should be already initialised */
4119 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 4120 req->open.how.flags |= O_LARGEFILE;
3529d8c2 4121
25e72d10
PB
4122 req->open.dfd = READ_ONCE(sqe->fd);
4123 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 4124 req->open.filename = getname(fname);
15b71abe
JA
4125 if (IS_ERR(req->open.filename)) {
4126 ret = PTR_ERR(req->open.filename);
4127 req->open.filename = NULL;
4128 return ret;
4129 }
4022e7af 4130 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 4131 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 4132 return 0;
03b1230c
JA
4133}
4134
ec65fea5
PB
4135static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4136{
4137 u64 flags, mode;
4138
14587a46 4139 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 4140 return -EINVAL;
ec65fea5
PB
4141 mode = READ_ONCE(sqe->len);
4142 flags = READ_ONCE(sqe->open_flags);
4143 req->open.how = build_open_how(flags, mode);
4144 return __io_openat_prep(req, sqe);
4145}
4146
cebdb986 4147static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 4148{
cebdb986 4149 struct open_how __user *how;
cebdb986 4150 size_t len;
0fa03c62
JA
4151 int ret;
4152
14587a46 4153 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 4154 return -EINVAL;
cebdb986
JA
4155 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4156 len = READ_ONCE(sqe->len);
cebdb986
JA
4157 if (len < OPEN_HOW_SIZE_VER0)
4158 return -EINVAL;
3529d8c2 4159
cebdb986
JA
4160 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4161 len);
4162 if (ret)
4163 return ret;
3529d8c2 4164
ec65fea5 4165 return __io_openat_prep(req, sqe);
cebdb986
JA
4166}
4167
45d189c6 4168static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
4169{
4170 struct open_flags op;
15b71abe 4171 struct file *file;
3a81fd02
JA
4172 bool nonblock_set;
4173 bool resolve_nonblock;
15b71abe
JA
4174 int ret;
4175
cebdb986 4176 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
4177 if (ret)
4178 goto err;
3a81fd02
JA
4179 nonblock_set = op.open_flag & O_NONBLOCK;
4180 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 4181 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
4182 /*
4183 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4184 * it'll always return -EAGAIN
4185 */
4186 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4187 return -EAGAIN;
4188 op.lookup_flags |= LOOKUP_CACHED;
4189 op.open_flag |= O_NONBLOCK;
4190 }
15b71abe 4191
4022e7af 4192 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
15b71abe
JA
4193 if (ret < 0)
4194 goto err;
4195
4196 file = do_filp_open(req->open.dfd, req->open.filename, &op);
3a81fd02 4197 /* only retry if RESOLVE_CACHED wasn't already set by application */
45d189c6
PB
4198 if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
4199 file == ERR_PTR(-EAGAIN)) {
944d1444 4200 /*
3a81fd02
JA
4201 * We could hang on to this 'fd', but it seems like marginal
4202 * gain for something that is now known to be a slower path.
4203 * So just put it, and we'll get a new one when we retry.
944d1444 4204 */
3a81fd02
JA
4205 put_unused_fd(ret);
4206 return -EAGAIN;
4207 }
4208
15b71abe
JA
4209 if (IS_ERR(file)) {
4210 put_unused_fd(ret);
4211 ret = PTR_ERR(file);
4212 } else {
45d189c6 4213 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3a81fd02 4214 file->f_flags &= ~O_NONBLOCK;
15b71abe
JA
4215 fsnotify_open(file);
4216 fd_install(ret, file);
4217 }
4218err:
4219 putname(req->open.filename);
8fef80bf 4220 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe
JA
4221 if (ret < 0)
4222 req_set_fail_links(req);
e1e16097 4223 io_req_complete(req, ret);
15b71abe
JA
4224 return 0;
4225}
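
/*
 * Illustrative userspace sketch, not part of this kernel file: openat2
 * through the ring, asking for the cached-lookup fast path. Assumes a kernel
 * and headers new enough to define RESOLVE_CACHED, plus liburing's
 * io_uring_prep_openat2() helper. The open_how struct must stay valid at
 * least until the SQE has been submitted.
 */
#include <liburing.h>
#include <linux/openat2.h>
#include <fcntl.h>
#include <string.h>

static int open_cached(struct io_uring *ring, const char *path,
                       struct open_how *how)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        memset(how, 0, sizeof(*how));
        how->flags = O_RDONLY;
        how->resolve = RESOLVE_CACHED;  /* maps to LOOKUP_CACHED above */
        io_uring_prep_openat2(sqe, AT_FDCWD, path, how);
        return io_uring_submit(ring);   /* cqe->res is the new fd or -errno */
}
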
4226
45d189c6 4227static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 4228{
45d189c6 4229 return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
cebdb986
JA
4230}
4231
067524e9
JA
4232static int io_remove_buffers_prep(struct io_kiocb *req,
4233 const struct io_uring_sqe *sqe)
4234{
4235 struct io_provide_buf *p = &req->pbuf;
4236 u64 tmp;
4237
4238 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
4239 return -EINVAL;
4240
4241 tmp = READ_ONCE(sqe->fd);
4242 if (!tmp || tmp > USHRT_MAX)
4243 return -EINVAL;
4244
4245 memset(p, 0, sizeof(*p));
4246 p->nbufs = tmp;
4247 p->bgid = READ_ONCE(sqe->buf_group);
4248 return 0;
4249}
4250
4251static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4252 int bgid, unsigned nbufs)
4253{
4254 unsigned i = 0;
4255
4256 /* shouldn't happen */
4257 if (!nbufs)
4258 return 0;
4259
4260 /* the head kbuf is the list itself */
4261 while (!list_empty(&buf->list)) {
4262 struct io_buffer *nxt;
4263
4264 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4265 list_del(&nxt->list);
4266 kfree(nxt);
4267 if (++i == nbufs)
4268 return i;
4269 }
4270 i++;
4271 kfree(buf);
4272 idr_remove(&ctx->io_buffer_idr, bgid);
4273
4274 return i;
4275}
4276
889fca73 4277static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
4278{
4279 struct io_provide_buf *p = &req->pbuf;
4280 struct io_ring_ctx *ctx = req->ctx;
4281 struct io_buffer *head;
4282 int ret = 0;
45d189c6 4283 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
067524e9
JA
4284
4285 io_ring_submit_lock(ctx, !force_nonblock);
4286
4287 lockdep_assert_held(&ctx->uring_lock);
4288
4289 ret = -ENOENT;
4290 head = idr_find(&ctx->io_buffer_idr, p->bgid);
4291 if (head)
4292 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
067524e9
JA
4293 if (ret < 0)
4294 req_set_fail_links(req);
067524e9 4295
31bff9a5
PB
4296 /* need to hold the lock to complete IOPOLL requests */
4297 if (ctx->flags & IORING_SETUP_IOPOLL) {
889fca73 4298 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5
PB
4299 io_ring_submit_unlock(ctx, !force_nonblock);
4300 } else {
4301 io_ring_submit_unlock(ctx, !force_nonblock);
889fca73 4302 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5 4303 }
067524e9
JA
4304 return 0;
4305}
4306
ddf0322d
JA
4307static int io_provide_buffers_prep(struct io_kiocb *req,
4308 const struct io_uring_sqe *sqe)
4309{
4310 struct io_provide_buf *p = &req->pbuf;
4311 u64 tmp;
4312
4313 if (sqe->ioprio || sqe->rw_flags)
4314 return -EINVAL;
4315
4316 tmp = READ_ONCE(sqe->fd);
4317 if (!tmp || tmp > USHRT_MAX)
4318 return -E2BIG;
4319 p->nbufs = tmp;
4320 p->addr = READ_ONCE(sqe->addr);
4321 p->len = READ_ONCE(sqe->len);
4322
efe68c1c 4323 if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
ddf0322d
JA
4324 return -EFAULT;
4325
4326 p->bgid = READ_ONCE(sqe->buf_group);
4327 tmp = READ_ONCE(sqe->off);
4328 if (tmp > USHRT_MAX)
4329 return -E2BIG;
4330 p->bid = tmp;
4331 return 0;
4332}
4333
4334static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4335{
4336 struct io_buffer *buf;
4337 u64 addr = pbuf->addr;
4338 int i, bid = pbuf->bid;
4339
4340 for (i = 0; i < pbuf->nbufs; i++) {
4341 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4342 if (!buf)
4343 break;
4344
4345 buf->addr = addr;
4346 buf->len = pbuf->len;
4347 buf->bid = bid;
4348 addr += pbuf->len;
4349 bid++;
4350 if (!*head) {
4351 INIT_LIST_HEAD(&buf->list);
4352 *head = buf;
4353 } else {
4354 list_add_tail(&buf->list, &(*head)->list);
4355 }
4356 }
4357
4358 return i ? i : -ENOMEM;
4359}
4360
889fca73 4361static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4362{
4363 struct io_provide_buf *p = &req->pbuf;
4364 struct io_ring_ctx *ctx = req->ctx;
4365 struct io_buffer *head, *list;
4366 int ret = 0;
45d189c6 4367 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ddf0322d
JA
4368
4369 io_ring_submit_lock(ctx, !force_nonblock);
4370
4371 lockdep_assert_held(&ctx->uring_lock);
4372
4373 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
4374
4375 ret = io_add_buffers(p, &head);
4376 if (ret < 0)
4377 goto out;
4378
4379 if (!list) {
4380 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
4381 GFP_KERNEL);
4382 if (ret < 0) {
067524e9 4383 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d
JA
4384 goto out;
4385 }
4386 }
4387out:
ddf0322d
JA
4388 if (ret < 0)
4389 req_set_fail_links(req);
31bff9a5
PB
4390
4391 /* need to hold the lock to complete IOPOLL requests */
4392 if (ctx->flags & IORING_SETUP_IOPOLL) {
889fca73 4393 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5
PB
4394 io_ring_submit_unlock(ctx, !force_nonblock);
4395 } else {
4396 io_ring_submit_unlock(ctx, !force_nonblock);
889fca73 4397 __io_req_complete(req, issue_flags, ret, 0);
31bff9a5 4398 }
ddf0322d 4399 return 0;
cebdb986
JA
4400}
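
/*
 * Illustrative userspace sketch, not part of this kernel file: hand a group
 * of buffers to the kernel and then issue a recv that lets the kernel pick
 * one of them (IOSQE_BUFFER_SELECT). Group and buffer ids are arbitrary
 * example values; assumes an initialised ring.
 */
#include <liburing.h>

#define EX_BGID         1
#define EX_NR_BUFS      16
#define EX_BUF_LEN      4096

static char ex_bufs[EX_NR_BUFS][EX_BUF_LEN];

static int recv_with_provided_bufs(struct io_uring *ring, int sockfd)
{
        struct io_uring_sqe *sqe;

        /* give EX_NR_BUFS buffers of EX_BUF_LEN bytes to group EX_BGID, ids from 0 */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_provide_buffers(sqe, ex_bufs, EX_BUF_LEN, EX_NR_BUFS,
                                      EX_BGID, 0);

        /* no buffer of our own; the kernel picks one from the group at recv time */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_recv(sqe, sockfd, NULL, EX_BUF_LEN, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;
        sqe->buf_group = EX_BGID;

        /* on completion, cqe->flags >> IORING_CQE_BUFFER_SHIFT is the chosen buffer id */
        return io_uring_submit(ring);
}
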
4401
3e4827b0
JA
4402static int io_epoll_ctl_prep(struct io_kiocb *req,
4403 const struct io_uring_sqe *sqe)
4404{
4405#if defined(CONFIG_EPOLL)
4406 if (sqe->ioprio || sqe->buf_index)
4407 return -EINVAL;
6ca56f84 4408 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3232dd02 4409 return -EINVAL;
3e4827b0
JA
4410
4411 req->epoll.epfd = READ_ONCE(sqe->fd);
4412 req->epoll.op = READ_ONCE(sqe->len);
4413 req->epoll.fd = READ_ONCE(sqe->off);
4414
4415 if (ep_op_has_event(req->epoll.op)) {
4416 struct epoll_event __user *ev;
4417
4418 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4419 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4420 return -EFAULT;
4421 }
4422
4423 return 0;
4424#else
4425 return -EOPNOTSUPP;
4426#endif
4427}
4428
889fca73 4429static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4430{
4431#if defined(CONFIG_EPOLL)
4432 struct io_epoll *ie = &req->epoll;
4433 int ret;
45d189c6 4434 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4435
4436 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4437 if (force_nonblock && ret == -EAGAIN)
4438 return -EAGAIN;
4439
4440 if (ret < 0)
4441 req_set_fail_links(req);
889fca73 4442 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4443 return 0;
4444#else
4445 return -EOPNOTSUPP;
4446#endif
4447}
4448
c1ca757b
JA
4449static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4450{
4451#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4452 if (sqe->ioprio || sqe->buf_index || sqe->off)
4453 return -EINVAL;
3232dd02
PB
4454 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4455 return -EINVAL;
c1ca757b
JA
4456
4457 req->madvise.addr = READ_ONCE(sqe->addr);
4458 req->madvise.len = READ_ONCE(sqe->len);
4459 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4460 return 0;
4461#else
4462 return -EOPNOTSUPP;
4463#endif
4464}
4465
45d189c6 4466static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4467{
4468#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4469 struct io_madvise *ma = &req->madvise;
4470 int ret;
4471
45d189c6 4472 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4473 return -EAGAIN;
4474
0726b01e 4475 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b
JA
4476 if (ret < 0)
4477 req_set_fail_links(req);
e1e16097 4478 io_req_complete(req, ret);
c1ca757b
JA
4479 return 0;
4480#else
4481 return -EOPNOTSUPP;
4482#endif
4483}
4484
4840e418
JA
4485static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4486{
4487 if (sqe->ioprio || sqe->buf_index || sqe->addr)
4488 return -EINVAL;
3232dd02
PB
4489 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4490 return -EINVAL;
4840e418
JA
4491
4492 req->fadvise.offset = READ_ONCE(sqe->off);
4493 req->fadvise.len = READ_ONCE(sqe->len);
4494 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4495 return 0;
4496}
4497
45d189c6 4498static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
4499{
4500 struct io_fadvise *fa = &req->fadvise;
4501 int ret;
4502
45d189c6 4503 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
4504 switch (fa->advice) {
4505 case POSIX_FADV_NORMAL:
4506 case POSIX_FADV_RANDOM:
4507 case POSIX_FADV_SEQUENTIAL:
4508 break;
4509 default:
4510 return -EAGAIN;
4511 }
4512 }
4840e418
JA
4513
4514 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4515 if (ret < 0)
4516 req_set_fail_links(req);
e1e16097 4517 io_req_complete(req, ret);
4840e418
JA
4518 return 0;
4519}
4520
eddc7ef5
JA
4521static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4522{
6ca56f84 4523 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
3232dd02 4524 return -EINVAL;
eddc7ef5
JA
4525 if (sqe->ioprio || sqe->buf_index)
4526 return -EINVAL;
9c280f90 4527 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4528 return -EBADF;
eddc7ef5 4529
1d9e1288
BM
4530 req->statx.dfd = READ_ONCE(sqe->fd);
4531 req->statx.mask = READ_ONCE(sqe->len);
e62753e4 4532 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
4533 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4534 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5
JA
4535
4536 return 0;
4537}
4538
45d189c6 4539static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 4540{
1d9e1288 4541 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
4542 int ret;
4543
45d189c6 4544 if (issue_flags & IO_URING_F_NONBLOCK) {
5b0bbee4
JA
4545 /* only need file table for an actual valid fd */
4546 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4547 req->flags |= REQ_F_NO_FILE_TABLE;
eddc7ef5 4548 return -EAGAIN;
5b0bbee4 4549 }
eddc7ef5 4550
e62753e4
BM
4551 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4552 ctx->buffer);
eddc7ef5 4553
eddc7ef5
JA
4554 if (ret < 0)
4555 req_set_fail_links(req);
e1e16097 4556 io_req_complete(req, ret);
eddc7ef5
JA
4557 return 0;
4558}
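
/*
 * Illustrative userspace sketch, not part of this kernel file: statx via the
 * ring. The statx buffer is filled asynchronously, so it must outlive the
 * request. Assumes an initialised ring and liburing's io_uring_prep_statx().
 */
#define _GNU_SOURCE
#include <liburing.h>
#include <fcntl.h>
#include <sys/stat.h>

static int statx_async(struct io_uring *ring, const char *path,
                       struct statx *stx)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_statx(sqe, AT_FDCWD, path, 0, STATX_BASIC_STATS, stx);
        return io_uring_submit(ring);
}
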
4559
b5dba59e
JA
4560static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4561{
14587a46 4562 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4563 return -EINVAL;
b5dba59e
JA
4564 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4565 sqe->rw_flags || sqe->buf_index)
4566 return -EINVAL;
9c280f90 4567 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4568 return -EBADF;
b5dba59e
JA
4569
4570 req->close.fd = READ_ONCE(sqe->fd);
b5dba59e 4571 return 0;
b5dba59e
JA
4572}
4573
889fca73 4574static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 4575{
9eac1904 4576 struct files_struct *files = current->files;
3af73b28 4577 struct io_close *close = &req->close;
9eac1904
JA
4578 struct fdtable *fdt;
4579 struct file *file;
b5dba59e
JA
4580 int ret;
4581
9eac1904
JA
4582 file = NULL;
4583 ret = -EBADF;
4584 spin_lock(&files->file_lock);
4585 fdt = files_fdtable(files);
4586 if (close->fd >= fdt->max_fds) {
4587 spin_unlock(&files->file_lock);
4588 goto err;
4589 }
4590 file = fdt->fd[close->fd];
4591 if (!file) {
4592 spin_unlock(&files->file_lock);
4593 goto err;
4594 }
4595
4596 if (file->f_op == &io_uring_fops) {
4597 spin_unlock(&files->file_lock);
4598 file = NULL;
4599 goto err;
3af73b28 4600 }
b5dba59e
JA
4601
4602 /* if the file has a flush method, be safe and punt to async */
45d189c6 4603 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 4604 spin_unlock(&files->file_lock);
0bf0eefd 4605 return -EAGAIN;
a2100672 4606 }
b5dba59e 4607
9eac1904
JA
4608 ret = __close_fd_get_file(close->fd, &file);
4609 spin_unlock(&files->file_lock);
4610 if (ret < 0) {
4611 if (ret == -ENOENT)
4612 ret = -EBADF;
4613 goto err;
4614 }
4615
3af73b28 4616 /* No ->flush() or already async, safely close from here */
9eac1904
JA
4617 ret = filp_close(file, current->files);
4618err:
3af73b28
PB
4619 if (ret < 0)
4620 req_set_fail_links(req);
9eac1904
JA
4621 if (file)
4622 fput(file);
889fca73 4623 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 4624 return 0;
b5dba59e
JA
4625}
4626
1155c76a 4627static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
4628{
4629 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 4630
5d17b4a4
JA
4631 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4632 return -EINVAL;
4633 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4634 return -EINVAL;
4635
8ed8d3c3
JA
4636 req->sync.off = READ_ONCE(sqe->off);
4637 req->sync.len = READ_ONCE(sqe->len);
4638 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
4639 return 0;
4640}
4641
45d189c6 4642static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4643{
8ed8d3c3
JA
4644 int ret;
4645
ac45abc0 4646 /* sync_file_range always requires a blocking context */
45d189c6 4647 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4648 return -EAGAIN;
4649
9adbd45d 4650 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
4651 req->sync.flags);
4652 if (ret < 0)
4653 req_set_fail_links(req);
e1e16097 4654 io_req_complete(req, ret);
5d17b4a4
JA
4655 return 0;
4656}
4657
469956e8 4658#if defined(CONFIG_NET)
02d27d89
PB
4659static int io_setup_async_msg(struct io_kiocb *req,
4660 struct io_async_msghdr *kmsg)
4661{
e8c2bc1f
JA
4662 struct io_async_msghdr *async_msg = req->async_data;
4663
4664 if (async_msg)
02d27d89 4665 return -EAGAIN;
e8c2bc1f 4666 if (io_alloc_async_data(req)) {
257e84a5 4667 kfree(kmsg->free_iov);
02d27d89
PB
4668 return -ENOMEM;
4669 }
e8c2bc1f 4670 async_msg = req->async_data;
02d27d89 4671 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 4672 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 4673 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
4674 /* if we were using fast_iov, set it to the new one */
4675 if (!async_msg->free_iov)
4676 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4677
02d27d89
PB
4678 return -EAGAIN;
4679}
4680
2ae523ed
PB
4681static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4682 struct io_async_msghdr *iomsg)
4683{
2ae523ed 4684 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 4685 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 4686 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 4687 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
4688}
4689
93642ef8
PB
4690static int io_sendmsg_prep_async(struct io_kiocb *req)
4691{
4692 int ret;
4693
4694 if (!io_op_defs[req->opcode].needs_async_data)
4695 return 0;
4696 ret = io_sendmsg_copy_hdr(req, req->async_data);
4697 if (!ret)
4698 req->flags |= REQ_F_NEED_CLEANUP;
4699 return ret;
4700}
4701
3529d8c2 4702static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 4703{
e47293fd 4704 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 4705
d2b6f48b
PB
4706 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4707 return -EINVAL;
4708
e47293fd 4709 sr->msg_flags = READ_ONCE(sqe->msg_flags);
270a5940 4710 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 4711 sr->len = READ_ONCE(sqe->len);
3529d8c2 4712
d8768362
JA
4713#ifdef CONFIG_COMPAT
4714 if (req->ctx->compat)
4715 sr->msg_flags |= MSG_CMSG_COMPAT;
4716#endif
93642ef8 4717 return 0;
03b1230c
JA
4718}
4719
889fca73 4720static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4721{
6b754c8b 4722 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 4723 struct socket *sock;
7a7cacba 4724 unsigned flags;
0fa03c62
JA
4725 int ret;
4726
dba4a925 4727 sock = sock_from_file(req->file);
7a7cacba 4728 if (unlikely(!sock))
dba4a925 4729 return -ENOTSOCK;
3529d8c2 4730
257e84a5
PB
4731 kmsg = req->async_data;
4732 if (!kmsg) {
7a7cacba
PB
4733 ret = io_sendmsg_copy_hdr(req, &iomsg);
4734 if (ret)
4735 return ret;
4736 kmsg = &iomsg;
0fa03c62 4737 }
0fa03c62 4738
7a7cacba
PB
4739 flags = req->sr_msg.msg_flags;
4740 if (flags & MSG_DONTWAIT)
4741 req->flags |= REQ_F_NOWAIT;
45d189c6 4742 else if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4743 flags |= MSG_DONTWAIT;
e47293fd 4744
7a7cacba 4745 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
45d189c6 4746 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4747 return io_setup_async_msg(req, kmsg);
4748 if (ret == -ERESTARTSYS)
4749 ret = -EINTR;
0fa03c62 4750
257e84a5
PB
4751 /* fast path, check for non-NULL to avoid function call */
4752 if (kmsg->free_iov)
4753 kfree(kmsg->free_iov);
99bc4c38 4754 req->flags &= ~REQ_F_NEED_CLEANUP;
4e88d6e7
JA
4755 if (ret < 0)
4756 req_set_fail_links(req);
889fca73 4757 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 4758 return 0;
03b1230c 4759}
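
/*
 * Illustrative userspace sketch, not part of this kernel file: an async
 * sendmsg. The payload must stay valid until the request completes; the
 * msghdr and iovec are copied by the kernel when the request is set up (see
 * io_sendmsg_copy_hdr() above). Assumes an initialised ring.
 */
#include <liburing.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int sendmsg_async(struct io_uring *ring, int sockfd,
                         struct iovec *iov, struct msghdr *msg)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        memset(msg, 0, sizeof(*msg));
        msg->msg_iov = iov;             /* iov already points at the payload */
        msg->msg_iovlen = 1;
        io_uring_prep_sendmsg(sqe, sockfd, msg, 0);
        return io_uring_submit(ring);
}
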
aa1fa28f 4760
889fca73 4761static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4762{
7a7cacba
PB
4763 struct io_sr_msg *sr = &req->sr_msg;
4764 struct msghdr msg;
4765 struct iovec iov;
fddaface 4766 struct socket *sock;
7a7cacba 4767 unsigned flags;
fddaface
JA
4768 int ret;
4769
dba4a925 4770 sock = sock_from_file(req->file);
7a7cacba 4771 if (unlikely(!sock))
dba4a925 4772 return -ENOTSOCK;
fddaface 4773
7a7cacba
PB
4774 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4775 if (unlikely(ret))
14db8411 4776 return ret;
fddaface 4777
7a7cacba
PB
4778 msg.msg_name = NULL;
4779 msg.msg_control = NULL;
4780 msg.msg_controllen = 0;
4781 msg.msg_namelen = 0;
fddaface 4782
7a7cacba
PB
4783 flags = req->sr_msg.msg_flags;
4784 if (flags & MSG_DONTWAIT)
4785 req->flags |= REQ_F_NOWAIT;
45d189c6 4786 else if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4787 flags |= MSG_DONTWAIT;
fddaface 4788
7a7cacba
PB
4789 msg.msg_flags = flags;
4790 ret = sock_sendmsg(sock, &msg);
45d189c6 4791 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4792 return -EAGAIN;
4793 if (ret == -ERESTARTSYS)
4794 ret = -EINTR;
fddaface 4795
fddaface
JA
4796 if (ret < 0)
4797 req_set_fail_links(req);
889fca73 4798 __io_req_complete(req, issue_flags, ret, 0);
fddaface 4799 return 0;
fddaface
JA
4800}
4801
1400e697
PB
4802static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4803 struct io_async_msghdr *iomsg)
52de1fe1
JA
4804{
4805 struct io_sr_msg *sr = &req->sr_msg;
4806 struct iovec __user *uiov;
4807 size_t iov_len;
4808 int ret;
4809
1400e697
PB
4810 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4811 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
4812 if (ret)
4813 return ret;
4814
4815 if (req->flags & REQ_F_BUFFER_SELECT) {
4816 if (iov_len > 1)
4817 return -EINVAL;
5476dfed 4818 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 4819 return -EFAULT;
5476dfed 4820 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 4821 iomsg->free_iov = NULL;
52de1fe1 4822 } else {
257e84a5 4823 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4824 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 4825 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 4826 false);
52de1fe1
JA
4827 if (ret > 0)
4828 ret = 0;
4829 }
4830
4831 return ret;
4832}
4833
4834#ifdef CONFIG_COMPAT
4835static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 4836 struct io_async_msghdr *iomsg)
52de1fe1
JA
4837{
4838 struct compat_msghdr __user *msg_compat;
4839 struct io_sr_msg *sr = &req->sr_msg;
4840 struct compat_iovec __user *uiov;
4841 compat_uptr_t ptr;
4842 compat_size_t len;
4843 int ret;
4844
270a5940 4845 msg_compat = (struct compat_msghdr __user *) sr->umsg;
1400e697 4846 ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
52de1fe1
JA
4847 &ptr, &len);
4848 if (ret)
4849 return ret;
4850
4851 uiov = compat_ptr(ptr);
4852 if (req->flags & REQ_F_BUFFER_SELECT) {
4853 compat_ssize_t clen;
4854
4855 if (len > 1)
4856 return -EINVAL;
4857 if (!access_ok(uiov, sizeof(*uiov)))
4858 return -EFAULT;
4859 if (__get_user(clen, &uiov->iov_len))
4860 return -EFAULT;
4861 if (clen < 0)
4862 return -EINVAL;
2d280bc8 4863 sr->len = clen;
257e84a5 4864 iomsg->free_iov = NULL;
52de1fe1 4865 } else {
257e84a5 4866 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4867 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 4868 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 4869 &iomsg->msg.msg_iter, true);
52de1fe1
JA
4870 if (ret < 0)
4871 return ret;
4872 }
4873
4874 return 0;
4875}
4876#endif
4877
1400e697
PB
4878static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4879 struct io_async_msghdr *iomsg)
52de1fe1 4880{
1400e697 4881 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
4882
4883#ifdef CONFIG_COMPAT
4884 if (req->ctx->compat)
1400e697 4885 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 4886#endif
52de1fe1 4887
1400e697 4888 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
4889}
4890
bcda7baa 4891static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
7fbb1b54 4892 bool needs_lock)
bcda7baa
JA
4893{
4894 struct io_sr_msg *sr = &req->sr_msg;
4895 struct io_buffer *kbuf;
4896
bcda7baa
JA
4897 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4898 if (IS_ERR(kbuf))
4899 return kbuf;
4900
4901 sr->kbuf = kbuf;
4902 req->flags |= REQ_F_BUFFER_SELECTED;
bcda7baa 4903 return kbuf;
fddaface
JA
4904}
4905
7fbb1b54
PB
4906static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4907{
4908 return io_put_kbuf(req, req->sr_msg.kbuf);
4909}
4910
93642ef8 4911static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 4912{
99bc4c38 4913 int ret;
3529d8c2 4914
93642ef8
PB
4915 if (!io_op_defs[req->opcode].needs_async_data)
4916 return 0;
4917 ret = io_recvmsg_copy_hdr(req, req->async_data);
4918 if (!ret)
4919 req->flags |= REQ_F_NEED_CLEANUP;
4920 return ret;
4921}
4922
4923static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4924{
4925 struct io_sr_msg *sr = &req->sr_msg;
4926
d2b6f48b
PB
4927 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4928 return -EINVAL;
4929
3529d8c2 4930 sr->msg_flags = READ_ONCE(sqe->msg_flags);
270a5940 4931 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 4932 sr->len = READ_ONCE(sqe->len);
bcda7baa 4933 sr->bgid = READ_ONCE(sqe->buf_group);
06b76d44 4934
d8768362
JA
4935#ifdef CONFIG_COMPAT
4936 if (req->ctx->compat)
4937 sr->msg_flags |= MSG_CMSG_COMPAT;
4938#endif
93642ef8 4939 return 0;
aa1fa28f
JA
4940}
4941
889fca73 4942static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4943{
6b754c8b 4944 struct io_async_msghdr iomsg, *kmsg;
03b1230c 4945 struct socket *sock;
7fbb1b54 4946 struct io_buffer *kbuf;
7a7cacba 4947 unsigned flags;
52de1fe1 4948 int ret, cflags = 0;
45d189c6 4949 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 4950
dba4a925 4951 sock = sock_from_file(req->file);
7a7cacba 4952 if (unlikely(!sock))
dba4a925 4953 return -ENOTSOCK;
3529d8c2 4954
257e84a5
PB
4955 kmsg = req->async_data;
4956 if (!kmsg) {
7a7cacba
PB
4957 ret = io_recvmsg_copy_hdr(req, &iomsg);
4958 if (ret)
681fda8d 4959 return ret;
7a7cacba
PB
4960 kmsg = &iomsg;
4961 }
03b1230c 4962
bc02ef33 4963 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4964 kbuf = io_recv_buffer_select(req, !force_nonblock);
bc02ef33 4965 if (IS_ERR(kbuf))
52de1fe1 4966 return PTR_ERR(kbuf);
7a7cacba 4967 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
4968 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4969 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
4970 1, req->sr_msg.len);
4971 }
52de1fe1 4972
7a7cacba
PB
4973 flags = req->sr_msg.msg_flags;
4974 if (flags & MSG_DONTWAIT)
4975 req->flags |= REQ_F_NOWAIT;
4976 else if (force_nonblock)
4977 flags |= MSG_DONTWAIT;
e47293fd 4978
7a7cacba
PB
4979 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4980 kmsg->uaddr, flags);
0e1b6fe3
PB
4981 if (force_nonblock && ret == -EAGAIN)
4982 return io_setup_async_msg(req, kmsg);
7a7cacba
PB
4983 if (ret == -ERESTARTSYS)
4984 ret = -EINTR;
03b1230c 4985
7fbb1b54
PB
4986 if (req->flags & REQ_F_BUFFER_SELECTED)
4987 cflags = io_put_recv_kbuf(req);
257e84a5
PB
4988 /* fast path, check for non-NULL to avoid function call */
4989 if (kmsg->free_iov)
4990 kfree(kmsg->free_iov);
99bc4c38 4991 req->flags &= ~REQ_F_NEED_CLEANUP;
4e88d6e7
JA
4992 if (ret < 0)
4993 req_set_fail_links(req);
889fca73 4994 __io_req_complete(req, issue_flags, ret, cflags);
03b1230c 4995 return 0;
0fa03c62 4996}
5d17b4a4 4997
889fca73 4998static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4999{
6b754c8b 5000 struct io_buffer *kbuf;
7a7cacba
PB
5001 struct io_sr_msg *sr = &req->sr_msg;
5002 struct msghdr msg;
5003 void __user *buf = sr->buf;
fddaface 5004 struct socket *sock;
7a7cacba
PB
5005 struct iovec iov;
5006 unsigned flags;
bcda7baa 5007 int ret, cflags = 0;
45d189c6 5008 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 5009
dba4a925 5010 sock = sock_from_file(req->file);
7a7cacba 5011 if (unlikely(!sock))
dba4a925 5012 return -ENOTSOCK;
fddaface 5013
bc02ef33 5014 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 5015 kbuf = io_recv_buffer_select(req, !force_nonblock);
bcda7baa
JA
5016 if (IS_ERR(kbuf))
5017 return PTR_ERR(kbuf);
7a7cacba 5018 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 5019 }
bcda7baa 5020
7a7cacba 5021 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
5022 if (unlikely(ret))
5023 goto out_free;
fddaface 5024
7a7cacba
PB
5025 msg.msg_name = NULL;
5026 msg.msg_control = NULL;
5027 msg.msg_controllen = 0;
5028 msg.msg_namelen = 0;
5029 msg.msg_iocb = NULL;
5030 msg.msg_flags = 0;
fddaface 5031
7a7cacba
PB
5032 flags = req->sr_msg.msg_flags;
5033 if (flags & MSG_DONTWAIT)
5034 req->flags |= REQ_F_NOWAIT;
5035 else if (force_nonblock)
5036 flags |= MSG_DONTWAIT;
5037
5038 ret = sock_recvmsg(sock, &msg, flags);
5039 if (force_nonblock && ret == -EAGAIN)
5040 return -EAGAIN;
5041 if (ret == -ERESTARTSYS)
5042 ret = -EINTR;
14c32eee 5043out_free:
7fbb1b54
PB
5044 if (req->flags & REQ_F_BUFFER_SELECTED)
5045 cflags = io_put_recv_kbuf(req);
fddaface
JA
5046 if (ret < 0)
5047 req_set_fail_links(req);
889fca73 5048 __io_req_complete(req, issue_flags, ret, cflags);
fddaface 5049 return 0;
fddaface
JA
5050}
5051
3529d8c2 5052static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 5053{
8ed8d3c3
JA
5054 struct io_accept *accept = &req->accept;
5055
14587a46 5056 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 5057 return -EINVAL;
8042d6ce 5058 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
5059 return -EINVAL;
5060
d55e5f5b
JA
5061 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5062 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 5063 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 5064 accept->nofile = rlimit(RLIMIT_NOFILE);
8ed8d3c3 5065 return 0;
8ed8d3c3 5066}
17f2fe35 5067
889fca73 5068static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
5069{
5070 struct io_accept *accept = &req->accept;
45d189c6 5071 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 5072 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
8ed8d3c3
JA
5073 int ret;
5074
e697deed
JX
5075 if (req->file->f_flags & O_NONBLOCK)
5076 req->flags |= REQ_F_NOWAIT;
5077
8ed8d3c3 5078 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
09952e3e
JA
5079 accept->addr_len, accept->flags,
5080 accept->nofile);
8ed8d3c3 5081 if (ret == -EAGAIN && force_nonblock)
17f2fe35 5082 return -EAGAIN;
ac45abc0
PB
5083 if (ret < 0) {
5084 if (ret == -ERESTARTSYS)
5085 ret = -EINTR;
4e88d6e7 5086 req_set_fail_links(req);
ac45abc0 5087 }
889fca73 5088 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 5089 return 0;
8ed8d3c3
JA
5090}
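
/*
 * Illustrative userspace sketch, not part of this kernel file: an async
 * accept on a listening socket. The sockaddr/socklen_t storage is written at
 * completion time, so it must outlive the request; cqe->res carries the new
 * file descriptor (or -errno). Assumes an initialised ring.
 */
#include <liburing.h>
#include <sys/socket.h>

static int accept_async(struct io_uring *ring, int listen_fd,
                        struct sockaddr_storage *addr, socklen_t *addrlen)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        *addrlen = sizeof(*addr);
        io_uring_prep_accept(sqe, listen_fd, (struct sockaddr *)addr, addrlen, 0);
        return io_uring_submit(ring);
}
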
5091
93642ef8
PB
5092static int io_connect_prep_async(struct io_kiocb *req)
5093{
5094 struct io_async_connect *io = req->async_data;
5095 struct io_connect *conn = &req->connect;
5096
5097 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5098}
5099
3529d8c2 5100static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 5101{
3529d8c2 5102 struct io_connect *conn = &req->connect;
f499a021 5103
14587a46 5104 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1
JA
5105 return -EINVAL;
5106 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
5107 return -EINVAL;
5108
3529d8c2
JA
5109 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5110 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 5111 return 0;
f499a021
JA
5112}
5113
889fca73 5114static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 5115{
e8c2bc1f 5116 struct io_async_connect __io, *io;
f8e85cf2 5117 unsigned file_flags;
3fbb51c1 5118 int ret;
45d189c6 5119 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 5120
e8c2bc1f
JA
5121 if (req->async_data) {
5122 io = req->async_data;
f499a021 5123 } else {
3529d8c2
JA
5124 ret = move_addr_to_kernel(req->connect.addr,
5125 req->connect.addr_len,
e8c2bc1f 5126 &__io.address);
f499a021
JA
5127 if (ret)
5128 goto out;
5129 io = &__io;
5130 }
5131
3fbb51c1
JA
5132 file_flags = force_nonblock ? O_NONBLOCK : 0;
5133
e8c2bc1f 5134 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 5135 req->connect.addr_len, file_flags);
87f80d62 5136 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
e8c2bc1f 5137 if (req->async_data)
b7bb4f7d 5138 return -EAGAIN;
e8c2bc1f 5139 if (io_alloc_async_data(req)) {
f499a021
JA
5140 ret = -ENOMEM;
5141 goto out;
5142 }
e8c2bc1f
JA
5143 io = req->async_data;
5144 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 5145 return -EAGAIN;
f499a021 5146 }
f8e85cf2
JA
5147 if (ret == -ERESTARTSYS)
5148 ret = -EINTR;
f499a021 5149out:
4e88d6e7
JA
5150 if (ret < 0)
5151 req_set_fail_links(req);
889fca73 5152 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 5153 return 0;
469956e8
Y
5154}
5155#else /* !CONFIG_NET */
99a10081
JA
5156#define IO_NETOP_FN(op) \
5157static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5158{ \
5159 return -EOPNOTSUPP; \
5160}
5161
5162#define IO_NETOP_PREP(op) \
5163IO_NETOP_FN(op) \
5164static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5165{ \
5166 return -EOPNOTSUPP; \
5167} \
5168
5169#define IO_NETOP_PREP_ASYNC(op) \
5170IO_NETOP_PREP(op) \
5171static int io_##op##_prep_async(struct io_kiocb *req) \
5172{ \
5173 return -EOPNOTSUPP; \
5174}
5175
5176IO_NETOP_PREP_ASYNC(sendmsg);
5177IO_NETOP_PREP_ASYNC(recvmsg);
5178IO_NETOP_PREP_ASYNC(connect);
5179IO_NETOP_PREP(accept);
5180IO_NETOP_FN(send);
5181IO_NETOP_FN(recv);
469956e8 5182#endif /* CONFIG_NET */
f8e85cf2 5183
d7718a9d
JA
5184struct io_poll_table {
5185 struct poll_table_struct pt;
5186 struct io_kiocb *req;
5187 int error;
5188};
ce593a6c 5189
d7718a9d
JA
5190static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5191 __poll_t mask, task_work_func_t func)
5192{
aa96bf8a 5193 int ret;
d7718a9d
JA
5194
5195 /* for instances that support it check for an event match first: */
5196 if (mask && !(mask & poll->events))
5197 return 0;
5198
5199 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5200
5201 list_del_init(&poll->wait.entry);
5202
d7718a9d 5203 req->result = mask;
7cbf1722 5204 req->task_work.func = func;
6d816e08
JA
5205 percpu_ref_get(&req->ctx->refs);
5206
d7718a9d 5207 /*
e3aabf95
JA
5208 * If this fails, then the task is exiting. When a task exits, the
5209 * work gets canceled, so just cancel this request as well instead
 5210 * of executing it. We can't safely execute it anyway, as we may not
 5211 * have the state needed to run it.
d7718a9d 5212 */
355fb9e2 5213 ret = io_req_task_work_add(req);
aa96bf8a 5214 if (unlikely(ret)) {
e3aabf95 5215 WRITE_ONCE(poll->canceled, true);
eab30c4d 5216 io_req_task_work_add_fallback(req, func);
aa96bf8a 5217 }
d7718a9d
JA
5218 return 1;
5219}
5220
74ce6ce4
JA
5221static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
5222 __acquires(&req->ctx->completion_lock)
5223{
5224 struct io_ring_ctx *ctx = req->ctx;
5225
5226 if (!req->result && !READ_ONCE(poll->canceled)) {
5227 struct poll_table_struct pt = { ._key = poll->events };
5228
5229 req->result = vfs_poll(req->file, &pt) & poll->events;
5230 }
5231
5232 spin_lock_irq(&ctx->completion_lock);
5233 if (!req->result && !READ_ONCE(poll->canceled)) {
5234 add_wait_queue(poll->head, &poll->wait);
5235 return true;
5236 }
5237
5238 return false;
5239}
5240
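/*
 * A file may register two waitqueues for poll (e.g. one for read and one
 * for write). The primary io_poll_iocb lives in the request itself; the
 * optional second one ("double poll") is allocated on demand, and these
 * helpers locate it for the plain poll and async poll cases.
 */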
d4e7cd36 5241static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 5242{
e8c2bc1f 5243 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 5244 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 5245 return req->async_data;
d4e7cd36
JA
5246 return req->apoll->double_poll;
5247}
5248
5249static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5250{
5251 if (req->opcode == IORING_OP_POLL_ADD)
5252 return &req->poll;
5253 return &req->apoll->poll;
5254}
5255
5256static void io_poll_remove_double(struct io_kiocb *req)
5257{
5258 struct io_poll_iocb *poll = io_poll_get_double(req);
18bceab1
JA
5259
5260 lockdep_assert_held(&req->ctx->completion_lock);
5261
5262 if (poll && poll->head) {
5263 struct wait_queue_head *head = poll->head;
5264
5265 spin_lock(&head->lock);
5266 list_del_init(&poll->wait.entry);
5267 if (poll->wait.private)
5268 refcount_dec(&req->refs);
5269 poll->head = NULL;
5270 spin_unlock(&head->lock);
5271 }
5272}
5273
5274static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
5275{
5276 struct io_ring_ctx *ctx = req->ctx;
5277
d4e7cd36 5278 io_poll_remove_double(req);
18bceab1
JA
5279 req->poll.done = true;
5280 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
5281 io_commit_cqring(ctx);
5282}
5283
dd221f46 5284static void io_poll_task_func(struct callback_head *cb)
18bceab1 5285{
dd221f46 5286 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
18bceab1 5287 struct io_ring_ctx *ctx = req->ctx;
dd221f46 5288 struct io_kiocb *nxt;
18bceab1
JA
5289
5290 if (io_poll_rewait(req, &req->poll)) {
5291 spin_unlock_irq(&ctx->completion_lock);
dd221f46
PB
5292 } else {
5293 hash_del(&req->hash_node);
5294 io_poll_complete(req, req->result, 0);
5295 spin_unlock_irq(&ctx->completion_lock);
18bceab1 5296
dd221f46
PB
5297 nxt = io_put_req_find_next(req);
5298 io_cqring_ev_posted(ctx);
5299 if (nxt)
5300 __io_req_task_submit(nxt);
5301 }
18bceab1 5302
6d816e08 5303 percpu_ref_put(&ctx->refs);
18bceab1
JA
5304}
5305
5306static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5307 int sync, void *key)
5308{
5309 struct io_kiocb *req = wait->private;
d4e7cd36 5310 struct io_poll_iocb *poll = io_poll_get_single(req);
18bceab1
JA
5311 __poll_t mask = key_to_poll(key);
5312
5313 /* for instances that support it check for an event match first: */
5314 if (mask && !(mask & poll->events))
5315 return 0;
5316
8706e04e
JA
5317 list_del_init(&wait->entry);
5318
807abcb0 5319 if (poll && poll->head) {
18bceab1
JA
5320 bool done;
5321
807abcb0
JA
5322 spin_lock(&poll->head->lock);
5323 done = list_empty(&poll->wait.entry);
18bceab1 5324 if (!done)
807abcb0 5325 list_del_init(&poll->wait.entry);
d4e7cd36
JA
5326 /* make sure double remove sees this as being gone */
5327 wait->private = NULL;
807abcb0 5328 spin_unlock(&poll->head->lock);
c8b5e260
JA
5329 if (!done) {
 5330 /* use the wait func handler, so it matches the request type */
5331 poll->wait.func(&poll->wait, mode, sync, key);
5332 }
18bceab1
JA
5333 }
5334 refcount_dec(&req->refs);
5335 return 1;
5336}
5337
5338static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5339 wait_queue_func_t wake_func)
5340{
5341 poll->head = NULL;
5342 poll->done = false;
5343 poll->canceled = false;
5344 poll->events = events;
5345 INIT_LIST_HEAD(&poll->wait.entry);
5346 init_waitqueue_func_entry(&poll->wait, wake_func);
5347}
5348
5349static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
5350 struct wait_queue_head *head,
5351 struct io_poll_iocb **poll_ptr)
18bceab1
JA
5352{
5353 struct io_kiocb *req = pt->req;
5354
5355 /*
5356 * If poll->head is already set, it's because the file being polled
 5357 * uses multiple waitqueues for poll handling (e.g. one for read, one
 5358 * for write). Set up a separate io_poll_iocb if this happens.
5359 */
5360 if (unlikely(poll->head)) {
58852d4d
PB
5361 struct io_poll_iocb *poll_one = poll;
5362
18bceab1 5363 /* already have a 2nd entry, fail a third attempt */
807abcb0 5364 if (*poll_ptr) {
18bceab1
JA
5365 pt->error = -EINVAL;
5366 return;
5367 }
5368 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5369 if (!poll) {
5370 pt->error = -ENOMEM;
5371 return;
5372 }
58852d4d 5373 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
18bceab1
JA
5374 refcount_inc(&req->refs);
5375 poll->wait.private = req;
807abcb0 5376 *poll_ptr = poll;
18bceab1
JA
5377 }
5378
5379 pt->error = 0;
5380 poll->head = head;
a31eb4a2
JX
5381
5382 if (poll->events & EPOLLEXCLUSIVE)
5383 add_wait_queue_exclusive(head, &poll->wait);
5384 else
5385 add_wait_queue(head, &poll->wait);
18bceab1
JA
5386}
5387
5388static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5389 struct poll_table_struct *p)
5390{
5391 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
807abcb0 5392 struct async_poll *apoll = pt->req->apoll;
18bceab1 5393
807abcb0 5394 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
18bceab1
JA
5395}
5396
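/*
 * task_work callback for internally armed (async) poll: re-check the file
 * via io_poll_rewait(); if the poll must keep waiting, just return.
 * Otherwise resubmit the request now that the file is ready (or cancel it
 * with -ECANCELED if the poll was canceled) and free the async_poll data.
 */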
d7718a9d
JA
5397static void io_async_task_func(struct callback_head *cb)
5398{
5399 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5400 struct async_poll *apoll = req->apoll;
5401 struct io_ring_ctx *ctx = req->ctx;
5402
5403 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5404
74ce6ce4 5405 if (io_poll_rewait(req, &apoll->poll)) {
d7718a9d 5406 spin_unlock_irq(&ctx->completion_lock);
6d816e08 5407 percpu_ref_put(&ctx->refs);
74ce6ce4 5408 return;
d7718a9d
JA
5409 }
5410
31067255 5411 /* If req is still hashed, it cannot have been canceled. Don't check. */
0be0b0e3 5412 if (hash_hashed(&req->hash_node))
74ce6ce4 5413 hash_del(&req->hash_node);
2bae047e 5414
d4e7cd36 5415 io_poll_remove_double(req);
74ce6ce4
JA
5416 spin_unlock_irq(&ctx->completion_lock);
5417
0be0b0e3
PB
5418 if (!READ_ONCE(apoll->poll.canceled))
5419 __io_req_task_submit(req);
5420 else
5421 __io_req_task_cancel(req, -ECANCELED);
aa340845 5422
6d816e08 5423 percpu_ref_put(&ctx->refs);
807abcb0 5424 kfree(apoll->double_poll);
31067255 5425 kfree(apoll);
d7718a9d
JA
5426}
5427
5428static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5429 void *key)
5430{
5431 struct io_kiocb *req = wait->private;
5432 struct io_poll_iocb *poll = &req->apoll->poll;
5433
5434 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5435 key_to_poll(key));
5436
5437 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5438}
5439
5440static void io_poll_req_insert(struct io_kiocb *req)
5441{
5442 struct io_ring_ctx *ctx = req->ctx;
5443 struct hlist_head *list;
5444
5445 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5446 hlist_add_head(&req->hash_node, list);
5447}
5448
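/*
 * Arm a poll handler on req->file: vfs_poll() is called with our queue
 * proc so the wait entry gets registered, then under ->completion_lock we
 * decide whether the event already triggered (nonzero mask returned), the
 * setup failed, or the request should be hashed for later lookup/cancel.
 */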
5449static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5450 struct io_poll_iocb *poll,
5451 struct io_poll_table *ipt, __poll_t mask,
5452 wait_queue_func_t wake_func)
5453 __acquires(&ctx->completion_lock)
5454{
5455 struct io_ring_ctx *ctx = req->ctx;
5456 bool cancel = false;
5457
4d52f338 5458 INIT_HLIST_NODE(&req->hash_node);
18bceab1 5459 io_init_poll_iocb(poll, mask, wake_func);
b90cd197 5460 poll->file = req->file;
18bceab1 5461 poll->wait.private = req;
d7718a9d
JA
5462
5463 ipt->pt._key = mask;
5464 ipt->req = req;
5465 ipt->error = -EINVAL;
5466
d7718a9d
JA
5467 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5468
5469 spin_lock_irq(&ctx->completion_lock);
5470 if (likely(poll->head)) {
5471 spin_lock(&poll->head->lock);
5472 if (unlikely(list_empty(&poll->wait.entry))) {
5473 if (ipt->error)
5474 cancel = true;
5475 ipt->error = 0;
5476 mask = 0;
5477 }
5478 if (mask || ipt->error)
5479 list_del_init(&poll->wait.entry);
5480 else if (cancel)
5481 WRITE_ONCE(poll->canceled, true);
5482 else if (!poll->done) /* actually waiting for an event */
5483 io_poll_req_insert(req);
5484 spin_unlock(&poll->head->lock);
5485 }
5486
5487 return mask;
5488}
5489
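/*
 * Fast path for pollable files that aren't ready yet: instead of punting
 * the request to io-wq, arm an internal poll handler and resubmit from
 * task_work once the file signals readiness. Only tried once per request
 * (REQ_F_POLLED), and only for opcodes marked pollin/pollout.
 */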
5490static bool io_arm_poll_handler(struct io_kiocb *req)
5491{
5492 const struct io_op_def *def = &io_op_defs[req->opcode];
5493 struct io_ring_ctx *ctx = req->ctx;
5494 struct async_poll *apoll;
5495 struct io_poll_table ipt;
5496 __poll_t mask, ret;
9dab14b8 5497 int rw;
d7718a9d
JA
5498
5499 if (!req->file || !file_can_poll(req->file))
5500 return false;
24c74678 5501 if (req->flags & REQ_F_POLLED)
d7718a9d 5502 return false;
9dab14b8
JA
5503 if (def->pollin)
5504 rw = READ;
5505 else if (def->pollout)
5506 rw = WRITE;
5507 else
5508 return false;
 5509 /* if we can't do a nonblocking try, there's no point in arming a poll handler */
5510 if (!io_file_supports_async(req->file, rw))
d7718a9d
JA
5511 return false;
5512
5513 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5514 if (unlikely(!apoll))
5515 return false;
807abcb0 5516 apoll->double_poll = NULL;
d7718a9d
JA
5517
5518 req->flags |= REQ_F_POLLED;
d7718a9d 5519 req->apoll = apoll;
d7718a9d 5520
8755d97a 5521 mask = 0;
d7718a9d 5522 if (def->pollin)
8755d97a 5523 mask |= POLLIN | POLLRDNORM;
d7718a9d
JA
5524 if (def->pollout)
5525 mask |= POLLOUT | POLLWRNORM;
901341bb
LH
5526
5527 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5528 if ((req->opcode == IORING_OP_RECVMSG) &&
5529 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5530 mask &= ~POLLIN;
5531
d7718a9d
JA
5532 mask |= POLLERR | POLLPRI;
5533
5534 ipt.pt._qproc = io_async_queue_proc;
5535
5536 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5537 io_async_wake);
a36da65c 5538 if (ret || ipt.error) {
d4e7cd36 5539 io_poll_remove_double(req);
d7718a9d 5540 spin_unlock_irq(&ctx->completion_lock);
807abcb0 5541 kfree(apoll->double_poll);
d7718a9d
JA
5542 kfree(apoll);
5543 return false;
5544 }
5545 spin_unlock_irq(&ctx->completion_lock);
5546 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5547 apoll->poll.events);
5548 return true;
5549}
5550
5551static bool __io_poll_remove_one(struct io_kiocb *req,
5552 struct io_poll_iocb *poll)
221c5eb2 5553{
b41e9852 5554 bool do_complete = false;
221c5eb2
JA
5555
5556 spin_lock(&poll->head->lock);
5557 WRITE_ONCE(poll->canceled, true);
392edb45
JA
5558 if (!list_empty(&poll->wait.entry)) {
5559 list_del_init(&poll->wait.entry);
b41e9852 5560 do_complete = true;
221c5eb2
JA
5561 }
5562 spin_unlock(&poll->head->lock);
3bfa5bcb 5563 hash_del(&req->hash_node);
d7718a9d
JA
5564 return do_complete;
5565}
5566
5567static bool io_poll_remove_one(struct io_kiocb *req)
5568{
5569 bool do_complete;
5570
d4e7cd36
JA
5571 io_poll_remove_double(req);
5572
d7718a9d
JA
5573 if (req->opcode == IORING_OP_POLL_ADD) {
5574 do_complete = __io_poll_remove_one(req, &req->poll);
5575 } else {
3bfa5bcb
JA
5576 struct async_poll *apoll = req->apoll;
5577
d7718a9d 5578 /* non-poll requests have submit ref still */
3bfa5bcb
JA
5579 do_complete = __io_poll_remove_one(req, &apoll->poll);
5580 if (do_complete) {
d7718a9d 5581 io_put_req(req);
807abcb0 5582 kfree(apoll->double_poll);
3bfa5bcb
JA
5583 kfree(apoll);
5584 }
b1f573bd
XW
5585 }
5586
b41e9852
JA
5587 if (do_complete) {
5588 io_cqring_fill_event(req, -ECANCELED);
5589 io_commit_cqring(req->ctx);
f254ac04 5590 req_set_fail_links(req);
216578e5 5591 io_put_req_deferred(req, 1);
b41e9852
JA
5592 }
5593
5594 return do_complete;
221c5eb2
JA
5595}
5596
76e1b642
JA
5597/*
5598 * Returns true if we found and killed one or more poll requests
5599 */
6b81928d
PB
5600static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5601 struct files_struct *files)
221c5eb2 5602{
78076bb6 5603 struct hlist_node *tmp;
221c5eb2 5604 struct io_kiocb *req;
8e2e1faf 5605 int posted = 0, i;
221c5eb2
JA
5606
5607 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
5608 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5609 struct hlist_head *list;
5610
5611 list = &ctx->cancel_hash[i];
f3606e3a 5612 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
6b81928d 5613 if (io_match_task(req, tsk, files))
f3606e3a
JA
5614 posted += io_poll_remove_one(req);
5615 }
221c5eb2
JA
5616 }
5617 spin_unlock_irq(&ctx->completion_lock);
b41e9852 5618
8e2e1faf
JA
5619 if (posted)
5620 io_cqring_ev_posted(ctx);
76e1b642
JA
5621
5622 return posted != 0;
221c5eb2
JA
5623}
5624
47f46768
JA
5625static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5626{
78076bb6 5627 struct hlist_head *list;
47f46768
JA
5628 struct io_kiocb *req;
5629
78076bb6
JA
5630 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5631 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
5632 if (sqe_addr != req->user_data)
5633 continue;
5634 if (io_poll_remove_one(req))
eac406c6 5635 return 0;
b41e9852 5636 return -EALREADY;
47f46768
JA
5637 }
5638
5639 return -ENOENT;
5640}
5641
3529d8c2
JA
5642static int io_poll_remove_prep(struct io_kiocb *req,
5643 const struct io_uring_sqe *sqe)
0969e783 5644{
0969e783
JA
5645 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5646 return -EINVAL;
5647 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5648 sqe->poll_events)
5649 return -EINVAL;
5650
018043be 5651 req->poll_remove.addr = READ_ONCE(sqe->addr);
0969e783
JA
5652 return 0;
5653}
5654
221c5eb2
JA
5655/*
5656 * Find a running poll command that matches one specified in sqe->addr,
5657 * and remove it if found.
5658 */
61e98203 5659static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
221c5eb2
JA
5660{
5661 struct io_ring_ctx *ctx = req->ctx;
47f46768 5662 int ret;
221c5eb2 5663
221c5eb2 5664 spin_lock_irq(&ctx->completion_lock);
018043be 5665 ret = io_poll_cancel(ctx, req->poll_remove.addr);
221c5eb2
JA
5666 spin_unlock_irq(&ctx->completion_lock);
5667
4e88d6e7
JA
5668 if (ret < 0)
5669 req_set_fail_links(req);
e1e16097 5670 io_req_complete(req, ret);
221c5eb2
JA
5671 return 0;
5672}
5673
221c5eb2
JA
5674static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5675 void *key)
5676{
c2f2eb7d
JA
5677 struct io_kiocb *req = wait->private;
5678 struct io_poll_iocb *poll = &req->poll;
221c5eb2 5679
d7718a9d 5680 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
5681}
5682
221c5eb2
JA
5683static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5684 struct poll_table_struct *p)
5685{
5686 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5687
e8c2bc1f 5688 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
eac406c6
JA
5689}
5690
3529d8c2 5691static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
5692{
5693 struct io_poll_iocb *poll = &req->poll;
5769a351 5694 u32 events;
221c5eb2
JA
5695
5696 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5697 return -EINVAL;
5698 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5699 return -EINVAL;
5700
5769a351
JX
5701 events = READ_ONCE(sqe->poll32_events);
5702#ifdef __BIG_ENDIAN
5703 events = swahw32(events);
5704#endif
a31eb4a2
JX
5705 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5706 (events & EPOLLEXCLUSIVE);
0969e783
JA
5707 return 0;
5708}
5709
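/*
 * IORING_OP_POLL_ADD: arm a single-shot poll on the request's file. If
 * the requested mask is already satisfied at arm time, complete inline
 * ("stolen"); otherwise the request stays in the cancel hash until the
 * wakeup or a POLL_REMOVE/cancel finds it.
 */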
61e98203 5710static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
5711{
5712 struct io_poll_iocb *poll = &req->poll;
5713 struct io_ring_ctx *ctx = req->ctx;
5714 struct io_poll_table ipt;
0969e783 5715 __poll_t mask;
0969e783 5716
d7718a9d 5717 ipt.pt._qproc = io_poll_queue_proc;
36703247 5718
d7718a9d
JA
5719 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5720 io_poll_wake);
221c5eb2 5721
8c838788 5722 if (mask) { /* no async, we'd stolen it */
221c5eb2 5723 ipt.error = 0;
b0dd8a41 5724 io_poll_complete(req, mask, 0);
221c5eb2 5725 }
221c5eb2
JA
5726 spin_unlock_irq(&ctx->completion_lock);
5727
8c838788
JA
5728 if (mask) {
5729 io_cqring_ev_posted(ctx);
014db007 5730 io_put_req(req);
221c5eb2 5731 }
8c838788 5732 return ipt.error;
221c5eb2
JA
5733}
5734
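/*
 * hrtimer callback for IORING_OP_TIMEOUT: runs in timer (irq) context, so
 * the completion is posted under ->completion_lock with IRQs saved; the
 * request is taken off the timeout list and completed with -ETIME.
 */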
5262f567
JA
5735static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5736{
ad8a48ac
JA
5737 struct io_timeout_data *data = container_of(timer,
5738 struct io_timeout_data, timer);
5739 struct io_kiocb *req = data->req;
5740 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
5741 unsigned long flags;
5742
5262f567 5743 spin_lock_irqsave(&ctx->completion_lock, flags);
a71976f3 5744 list_del_init(&req->timeout.list);
01cec8c1
PB
5745 atomic_set(&req->ctx->cq_timeouts,
5746 atomic_read(&req->ctx->cq_timeouts) + 1);
5747
78e19bbe 5748 io_cqring_fill_event(req, -ETIME);
5262f567
JA
5749 io_commit_cqring(ctx);
5750 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5751
5752 io_cqring_ev_posted(ctx);
4e88d6e7 5753 req_set_fail_links(req);
5262f567
JA
5754 io_put_req(req);
5755 return HRTIMER_NORESTART;
5756}
5757
fbd15848
PB
5758static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5759 __u64 user_data)
f254ac04 5760{
fbd15848 5761 struct io_timeout_data *io;
47f46768
JA
5762 struct io_kiocb *req;
5763 int ret = -ENOENT;
f254ac04 5764
135fcde8 5765 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
47f46768 5766 if (user_data == req->user_data) {
47f46768
JA
5767 ret = 0;
5768 break;
5769 }
5770 }
5771
5772 if (ret == -ENOENT)
fbd15848
PB
5773 return ERR_PTR(ret);
5774
5775 io = req->async_data;
e8c2bc1f 5776 ret = hrtimer_try_to_cancel(&io->timer);
f254ac04 5777 if (ret == -1)
fbd15848 5778 return ERR_PTR(-EALREADY);
a71976f3 5779 list_del_init(&req->timeout.list);
fbd15848
PB
5780 return req;
5781}
47f46768 5782
fbd15848
PB
5783static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5784{
5785 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5786
5787 if (IS_ERR(req))
5788 return PTR_ERR(req);
f254ac04
JA
5789
5790 req_set_fail_links(req);
f254ac04 5791 io_cqring_fill_event(req, -ECANCELED);
216578e5 5792 io_put_req_deferred(req, 1);
f254ac04
JA
5793 return 0;
5794}
5795
9c8e11b3
PB
5796static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5797 struct timespec64 *ts, enum hrtimer_mode mode)
47f46768 5798{
9c8e11b3
PB
5799 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5800 struct io_timeout_data *data;
47f46768 5801
9c8e11b3
PB
5802 if (IS_ERR(req))
5803 return PTR_ERR(req);
47f46768 5804
9c8e11b3
PB
5805 req->timeout.off = 0; /* noseq */
5806 data = req->async_data;
5807 list_add_tail(&req->timeout.list, &ctx->timeout_list);
5808 hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5809 data->timer.function = io_timeout_fn;
5810 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5811 return 0;
47f46768
JA
5812}
5813
3529d8c2
JA
5814static int io_timeout_remove_prep(struct io_kiocb *req,
5815 const struct io_uring_sqe *sqe)
b29472ee 5816{
9c8e11b3
PB
5817 struct io_timeout_rem *tr = &req->timeout_rem;
5818
b29472ee
JA
5819 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5820 return -EINVAL;
61710e43
DA
5821 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5822 return -EINVAL;
9c8e11b3 5823 if (sqe->ioprio || sqe->buf_index || sqe->len)
b29472ee
JA
5824 return -EINVAL;
5825
9c8e11b3
PB
5826 tr->addr = READ_ONCE(sqe->addr);
5827 tr->flags = READ_ONCE(sqe->timeout_flags);
5828 if (tr->flags & IORING_TIMEOUT_UPDATE) {
5829 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5830 return -EINVAL;
5831 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5832 return -EFAULT;
5833 } else if (tr->flags) {
5834 /* timeout removal doesn't support flags */
b29472ee 5835 return -EINVAL;
9c8e11b3 5836 }
b29472ee 5837
b29472ee
JA
5838 return 0;
5839}
5840
8662daec
PB
5841static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5842{
5843 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5844 : HRTIMER_MODE_REL;
5845}
5846
11365043
JA
5847/*
5848 * Remove or update an existing timeout command
5849 */
61e98203 5850static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 5851{
9c8e11b3 5852 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 5853 struct io_ring_ctx *ctx = req->ctx;
47f46768 5854 int ret;
11365043 5855
11365043 5856 spin_lock_irq(&ctx->completion_lock);
8662daec 5857 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
9c8e11b3 5858 ret = io_timeout_cancel(ctx, tr->addr);
8662daec
PB
5859 else
5860 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5861 io_translate_timeout_mode(tr->flags));
11365043 5862
47f46768 5863 io_cqring_fill_event(req, ret);
11365043
JA
5864 io_commit_cqring(ctx);
5865 spin_unlock_irq(&ctx->completion_lock);
5262f567 5866 io_cqring_ev_posted(ctx);
4e88d6e7
JA
5867 if (ret < 0)
5868 req_set_fail_links(req);
ec9c02ad 5869 io_put_req(req);
11365043 5870 return 0;
5262f567
JA
5871}
5872
3529d8c2 5873static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 5874 bool is_timeout_link)
5262f567 5875{
ad8a48ac 5876 struct io_timeout_data *data;
a41525ab 5877 unsigned flags;
56080b02 5878 u32 off = READ_ONCE(sqe->off);
5262f567 5879
ad8a48ac 5880 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 5881 return -EINVAL;
ad8a48ac 5882 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 5883 return -EINVAL;
56080b02 5884 if (off && is_timeout_link)
2d28390a 5885 return -EINVAL;
a41525ab
JA
5886 flags = READ_ONCE(sqe->timeout_flags);
5887 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 5888 return -EINVAL;
bdf20073 5889
bfe68a22 5890 req->timeout.off = off;
26a61679 5891
e8c2bc1f 5892 if (!req->async_data && io_alloc_async_data(req))
26a61679
JA
5893 return -ENOMEM;
5894
e8c2bc1f 5895 data = req->async_data;
ad8a48ac 5896 data->req = req;
ad8a48ac
JA
5897
5898 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
5899 return -EFAULT;
5900
8662daec 5901 data->mode = io_translate_timeout_mode(flags);
ad8a48ac
JA
5902 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5903 return 0;
5904}
5905
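/*
 * Queue a timeout. For a sequence-based timeout the target is computed as
 *
 *	tail       = cached_cq_tail - cq_timeouts;
 *	target_seq = tail + sqe->off;
 *
 * i.e. it fires after sqe->off further completions (timeouts excluded),
 * and the list is kept sorted so flushing can stop at the first entry
 * that hasn't expired yet.
 */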
61e98203 5906static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 5907{
ad8a48ac 5908 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 5909 struct io_timeout_data *data = req->async_data;
ad8a48ac 5910 struct list_head *entry;
bfe68a22 5911 u32 tail, off = req->timeout.off;
ad8a48ac 5912
733f5c95 5913 spin_lock_irq(&ctx->completion_lock);
93bd25bb 5914
5262f567
JA
5915 /*
 5916 * sqe->off holds how many events need to occur for this
93bd25bb
JA
5917 * timeout event to be satisfied. If it isn't set, then this is
 5918 * a pure timeout request; sequence isn't used.
5262f567 5919 */
8eb7e2d0 5920 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
5921 entry = ctx->timeout_list.prev;
5922 goto add;
5923 }
5262f567 5924
bfe68a22
PB
5925 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5926 req->timeout.target_seq = tail + off;
5262f567 5927
f010505b
MDG
5928 /* Update the last seq here in case io_flush_timeouts() hasn't.
5929 * This is safe because ->completion_lock is held, and submissions
5930 * and completions are never mixed in the same ->completion_lock section.
5931 */
5932 ctx->cq_last_tm_flush = tail;
5933
5262f567
JA
5934 /*
5935 * Insertion sort, ensuring the first entry in the list is always
5936 * the one we need first.
5937 */
5262f567 5938 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
5939 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5940 timeout.list);
5262f567 5941
8eb7e2d0 5942 if (io_is_timeout_noseq(nxt))
93bd25bb 5943 continue;
bfe68a22
PB
5944 /* nxt.seq is behind @tail, otherwise would've been completed */
5945 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
5946 break;
5947 }
93bd25bb 5948add:
135fcde8 5949 list_add(&req->timeout.list, entry);
ad8a48ac
JA
5950 data->timer.function = io_timeout_fn;
5951 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 5952 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
5953 return 0;
5954}
5262f567 5955
62755e35
JA
5956static bool io_cancel_cb(struct io_wq_work *work, void *data)
5957{
5958 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5959
5960 return req->user_data == (unsigned long) data;
5961}
5962
e977d6d3 5963static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 5964{
62755e35 5965 enum io_wq_cancel cancel_ret;
62755e35
JA
5966 int ret = 0;
5967
4f26bda1 5968 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
62755e35
JA
5969 switch (cancel_ret) {
5970 case IO_WQ_CANCEL_OK:
5971 ret = 0;
5972 break;
5973 case IO_WQ_CANCEL_RUNNING:
5974 ret = -EALREADY;
5975 break;
5976 case IO_WQ_CANCEL_NOTFOUND:
5977 ret = -ENOENT;
5978 break;
5979 }
5980
e977d6d3
JA
5981 return ret;
5982}
5983
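/*
 * Cancel by user_data, trying the possible homes of the target in order:
 * io-wq (queued or running there), then the timeout list, then the poll
 * hash. The outcome (0, -ENOENT or -EALREADY) is posted as the CQE result
 * of the request doing the canceling.
 */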
47f46768
JA
5984static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5985 struct io_kiocb *req, __u64 sqe_addr,
014db007 5986 int success_ret)
47f46768
JA
5987{
5988 unsigned long flags;
5989 int ret;
5990
5991 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5992 if (ret != -ENOENT) {
5993 spin_lock_irqsave(&ctx->completion_lock, flags);
5994 goto done;
5995 }
5996
5997 spin_lock_irqsave(&ctx->completion_lock, flags);
5998 ret = io_timeout_cancel(ctx, sqe_addr);
5999 if (ret != -ENOENT)
6000 goto done;
6001 ret = io_poll_cancel(ctx, sqe_addr);
6002done:
b0dd8a41
JA
6003 if (!ret)
6004 ret = success_ret;
47f46768
JA
6005 io_cqring_fill_event(req, ret);
6006 io_commit_cqring(ctx);
6007 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6008 io_cqring_ev_posted(ctx);
6009
4e88d6e7
JA
6010 if (ret < 0)
6011 req_set_fail_links(req);
014db007 6012 io_put_req(req);
47f46768
JA
6013}
6014
3529d8c2
JA
6015static int io_async_cancel_prep(struct io_kiocb *req,
6016 const struct io_uring_sqe *sqe)
e977d6d3 6017{
fbf23849 6018 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 6019 return -EINVAL;
61710e43
DA
6020 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6021 return -EINVAL;
6022 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
e977d6d3
JA
6023 return -EINVAL;
6024
fbf23849
JA
6025 req->cancel.addr = READ_ONCE(sqe->addr);
6026 return 0;
6027}
6028
61e98203 6029static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6030{
6031 struct io_ring_ctx *ctx = req->ctx;
fbf23849 6032
014db007 6033 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5262f567
JA
6034 return 0;
6035}
6036
269bbe5f 6037static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
6038 const struct io_uring_sqe *sqe)
6039{
6ca56f84
JA
6040 if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
6041 return -EINVAL;
61710e43
DA
6042 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6043 return -EINVAL;
6044 if (sqe->ioprio || sqe->rw_flags)
05f3fb3c
JA
6045 return -EINVAL;
6046
269bbe5f
BM
6047 req->rsrc_update.offset = READ_ONCE(sqe->off);
6048 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6049 if (!req->rsrc_update.nr_args)
05f3fb3c 6050 return -EINVAL;
269bbe5f 6051 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
6052 return 0;
6053}
6054
889fca73 6055static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6056{
6057 struct io_ring_ctx *ctx = req->ctx;
269bbe5f 6058 struct io_uring_rsrc_update up;
05f3fb3c 6059 int ret;
fbf23849 6060
45d189c6 6061 if (issue_flags & IO_URING_F_NONBLOCK)
05f3fb3c 6062 return -EAGAIN;
05f3fb3c 6063
269bbe5f
BM
6064 up.offset = req->rsrc_update.offset;
6065 up.data = req->rsrc_update.arg;
05f3fb3c
JA
6066
6067 mutex_lock(&ctx->uring_lock);
269bbe5f 6068 ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
05f3fb3c
JA
6069 mutex_unlock(&ctx->uring_lock);
6070
6071 if (ret < 0)
6072 req_set_fail_links(req);
889fca73 6073 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
6074 return 0;
6075}
6076
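/*
 * Per-opcode SQE validation/preparation, called once at submission time
 * before the request is issued or deferred. Reaching the fallthrough
 * below means an opcode was added without a prep handler.
 */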
bfe76559 6077static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 6078{
d625c6ee 6079 switch (req->opcode) {
e781573e 6080 case IORING_OP_NOP:
bfe76559 6081 return 0;
f67676d1
JA
6082 case IORING_OP_READV:
6083 case IORING_OP_READ_FIXED:
3a6820f2 6084 case IORING_OP_READ:
bfe76559 6085 return io_read_prep(req, sqe);
f67676d1
JA
6086 case IORING_OP_WRITEV:
6087 case IORING_OP_WRITE_FIXED:
3a6820f2 6088 case IORING_OP_WRITE:
bfe76559 6089 return io_write_prep(req, sqe);
0969e783 6090 case IORING_OP_POLL_ADD:
bfe76559 6091 return io_poll_add_prep(req, sqe);
0969e783 6092 case IORING_OP_POLL_REMOVE:
bfe76559 6093 return io_poll_remove_prep(req, sqe);
8ed8d3c3 6094 case IORING_OP_FSYNC:
1155c76a 6095 return io_fsync_prep(req, sqe);
8ed8d3c3 6096 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 6097 return io_sfr_prep(req, sqe);
03b1230c 6098 case IORING_OP_SENDMSG:
fddaface 6099 case IORING_OP_SEND:
bfe76559 6100 return io_sendmsg_prep(req, sqe);
03b1230c 6101 case IORING_OP_RECVMSG:
fddaface 6102 case IORING_OP_RECV:
bfe76559 6103 return io_recvmsg_prep(req, sqe);
f499a021 6104 case IORING_OP_CONNECT:
bfe76559 6105 return io_connect_prep(req, sqe);
2d28390a 6106 case IORING_OP_TIMEOUT:
bfe76559 6107 return io_timeout_prep(req, sqe, false);
b29472ee 6108 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 6109 return io_timeout_remove_prep(req, sqe);
fbf23849 6110 case IORING_OP_ASYNC_CANCEL:
bfe76559 6111 return io_async_cancel_prep(req, sqe);
2d28390a 6112 case IORING_OP_LINK_TIMEOUT:
bfe76559 6113 return io_timeout_prep(req, sqe, true);
8ed8d3c3 6114 case IORING_OP_ACCEPT:
bfe76559 6115 return io_accept_prep(req, sqe);
d63d1b5e 6116 case IORING_OP_FALLOCATE:
bfe76559 6117 return io_fallocate_prep(req, sqe);
15b71abe 6118 case IORING_OP_OPENAT:
bfe76559 6119 return io_openat_prep(req, sqe);
b5dba59e 6120 case IORING_OP_CLOSE:
bfe76559 6121 return io_close_prep(req, sqe);
05f3fb3c 6122 case IORING_OP_FILES_UPDATE:
269bbe5f 6123 return io_rsrc_update_prep(req, sqe);
eddc7ef5 6124 case IORING_OP_STATX:
bfe76559 6125 return io_statx_prep(req, sqe);
4840e418 6126 case IORING_OP_FADVISE:
bfe76559 6127 return io_fadvise_prep(req, sqe);
c1ca757b 6128 case IORING_OP_MADVISE:
bfe76559 6129 return io_madvise_prep(req, sqe);
cebdb986 6130 case IORING_OP_OPENAT2:
bfe76559 6131 return io_openat2_prep(req, sqe);
3e4827b0 6132 case IORING_OP_EPOLL_CTL:
bfe76559 6133 return io_epoll_ctl_prep(req, sqe);
7d67af2c 6134 case IORING_OP_SPLICE:
bfe76559 6135 return io_splice_prep(req, sqe);
ddf0322d 6136 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 6137 return io_provide_buffers_prep(req, sqe);
067524e9 6138 case IORING_OP_REMOVE_BUFFERS:
bfe76559 6139 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 6140 case IORING_OP_TEE:
bfe76559 6141 return io_tee_prep(req, sqe);
36f4fa68
JA
6142 case IORING_OP_SHUTDOWN:
6143 return io_shutdown_prep(req, sqe);
80a261fd
JA
6144 case IORING_OP_RENAMEAT:
6145 return io_renameat_prep(req, sqe);
14a1143b
JA
6146 case IORING_OP_UNLINKAT:
6147 return io_unlinkat_prep(req, sqe);
f67676d1
JA
6148 }
6149
bfe76559
PB
6150 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6151 req->opcode);
 6152 return -EINVAL;
6153}
6154
93642ef8 6155static int io_req_prep_async(struct io_kiocb *req)
bfe76559 6156{
93642ef8
PB
6157 switch (req->opcode) {
6158 case IORING_OP_READV:
6159 case IORING_OP_READ_FIXED:
6160 case IORING_OP_READ:
6161 return io_rw_prep_async(req, READ);
6162 case IORING_OP_WRITEV:
6163 case IORING_OP_WRITE_FIXED:
6164 case IORING_OP_WRITE:
6165 return io_rw_prep_async(req, WRITE);
6166 case IORING_OP_SENDMSG:
6167 case IORING_OP_SEND:
6168 return io_sendmsg_prep_async(req);
6169 case IORING_OP_RECVMSG:
6170 case IORING_OP_RECV:
6171 return io_recvmsg_prep_async(req);
6172 case IORING_OP_CONNECT:
6173 return io_connect_prep_async(req);
6174 }
6175 return 0;
6176}
6177
be7053b7 6178static int io_req_defer_prep(struct io_kiocb *req)
bfe76559 6179{
be7053b7 6180 if (!io_op_defs[req->opcode].needs_async_data)
bfe76559 6181 return 0;
be7053b7 6182 /* some opcodes init it during the initial prep */
93642ef8 6183 if (req->async_data)
be7053b7
PB
6184 return 0;
6185 if (__io_alloc_async_data(req))
bfe76559 6186 return -EAGAIN;
be7053b7 6187 return io_req_prep_async(req);
f67676d1
JA
6188}
6189
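/*
 * Sequence number used for IOSQE_IO_DRAIN ordering: the number of SQEs
 * submitted before this request's link chain, i.e. everything consumed
 * from the SQ ring so far minus the requests that make up this link.
 */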
9cf7c104
PB
6190static u32 io_get_sequence(struct io_kiocb *req)
6191{
6192 struct io_kiocb *pos;
6193 struct io_ring_ctx *ctx = req->ctx;
f2f87370 6194 u32 total_submitted, nr_reqs = 0;
9cf7c104 6195
f2f87370
PB
6196 io_for_each_link(pos, req)
6197 nr_reqs++;
9cf7c104
PB
6198
6199 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
6200 return total_submitted - nr_reqs;
6201}
6202
be7053b7 6203static int io_req_defer(struct io_kiocb *req)
de0617e4 6204{
a197f664 6205 struct io_ring_ctx *ctx = req->ctx;
27dc8338 6206 struct io_defer_entry *de;
f67676d1 6207 int ret;
9cf7c104 6208 u32 seq;
de0617e4 6209
9d858b21 6210 /* Still need to defer if there is a pending req in the defer list. */
9cf7c104
PB
6211 if (likely(list_empty_careful(&ctx->defer_list) &&
6212 !(req->flags & REQ_F_IO_DRAIN)))
6213 return 0;
6214
6215 seq = io_get_sequence(req);
6216 /* Still a chance to pass the sequence check */
6217 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
de0617e4
JA
6218 return 0;
6219
be7053b7
PB
6220 ret = io_req_defer_prep(req);
6221 if (ret)
6222 return ret;
cbdcb435 6223 io_prep_async_link(req);
27dc8338
PB
6224 de = kmalloc(sizeof(*de), GFP_KERNEL);
6225 if (!de)
6226 return -ENOMEM;
2d28390a 6227
de0617e4 6228 spin_lock_irq(&ctx->completion_lock);
9cf7c104 6229 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
de0617e4 6230 spin_unlock_irq(&ctx->completion_lock);
27dc8338 6231 kfree(de);
ae34817b
PB
6232 io_queue_async_work(req);
6233 return -EIOCBQUEUED;
de0617e4
JA
6234 }
6235
915967f6 6236 trace_io_uring_defer(ctx, req, req->user_data);
27dc8338 6237 de->req = req;
9cf7c104 6238 de->seq = seq;
27dc8338 6239 list_add_tail(&de->list, &ctx->defer_list);
de0617e4
JA
6240 spin_unlock_irq(&ctx->completion_lock);
6241 return -EIOCBQUEUED;
6242}
6243
3ca405eb 6244static void __io_clean_op(struct io_kiocb *req)
99bc4c38 6245{
0e1b6fe3
PB
6246 if (req->flags & REQ_F_BUFFER_SELECTED) {
6247 switch (req->opcode) {
6248 case IORING_OP_READV:
6249 case IORING_OP_READ_FIXED:
6250 case IORING_OP_READ:
bcda7baa 6251 kfree((void *)(unsigned long)req->rw.addr);
0e1b6fe3
PB
6252 break;
6253 case IORING_OP_RECVMSG:
6254 case IORING_OP_RECV:
bcda7baa 6255 kfree(req->sr_msg.kbuf);
0e1b6fe3
PB
6256 break;
6257 }
6258 req->flags &= ~REQ_F_BUFFER_SELECTED;
99bc4c38
PB
6259 }
6260
0e1b6fe3
PB
6261 if (req->flags & REQ_F_NEED_CLEANUP) {
6262 switch (req->opcode) {
6263 case IORING_OP_READV:
6264 case IORING_OP_READ_FIXED:
6265 case IORING_OP_READ:
6266 case IORING_OP_WRITEV:
6267 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
6268 case IORING_OP_WRITE: {
6269 struct io_async_rw *io = req->async_data;
6270 if (io->free_iovec)
6271 kfree(io->free_iovec);
0e1b6fe3 6272 break;
e8c2bc1f 6273 }
0e1b6fe3 6274 case IORING_OP_RECVMSG:
e8c2bc1f
JA
6275 case IORING_OP_SENDMSG: {
6276 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
6277
6278 kfree(io->free_iov);
0e1b6fe3 6279 break;
e8c2bc1f 6280 }
0e1b6fe3
PB
6281 case IORING_OP_SPLICE:
6282 case IORING_OP_TEE:
6283 io_put_file(req, req->splice.file_in,
6284 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
6285 break;
f3cd4850
JA
6286 case IORING_OP_OPENAT:
6287 case IORING_OP_OPENAT2:
6288 if (req->open.filename)
6289 putname(req->open.filename);
6290 break;
80a261fd
JA
6291 case IORING_OP_RENAMEAT:
6292 putname(req->rename.oldpath);
6293 putname(req->rename.newpath);
6294 break;
14a1143b
JA
6295 case IORING_OP_UNLINKAT:
6296 putname(req->unlink.filename);
6297 break;
0e1b6fe3
PB
6298 }
6299 req->flags &= ~REQ_F_NEED_CLEANUP;
99bc4c38 6300 }
99bc4c38
PB
6301}
6302
889fca73 6303static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 6304{
a197f664 6305 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 6306 int ret;
2b188cc1 6307
d625c6ee 6308 switch (req->opcode) {
2b188cc1 6309 case IORING_OP_NOP:
889fca73 6310 ret = io_nop(req, issue_flags);
2b188cc1
JA
6311 break;
6312 case IORING_OP_READV:
edafccee 6313 case IORING_OP_READ_FIXED:
3a6820f2 6314 case IORING_OP_READ:
889fca73 6315 ret = io_read(req, issue_flags);
edafccee 6316 break;
3529d8c2 6317 case IORING_OP_WRITEV:
edafccee 6318 case IORING_OP_WRITE_FIXED:
3a6820f2 6319 case IORING_OP_WRITE:
889fca73 6320 ret = io_write(req, issue_flags);
2b188cc1 6321 break;
c992fe29 6322 case IORING_OP_FSYNC:
45d189c6 6323 ret = io_fsync(req, issue_flags);
c992fe29 6324 break;
221c5eb2 6325 case IORING_OP_POLL_ADD:
61e98203 6326 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
6327 break;
6328 case IORING_OP_POLL_REMOVE:
61e98203 6329 ret = io_poll_remove(req, issue_flags);
221c5eb2 6330 break;
5d17b4a4 6331 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 6332 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 6333 break;
0fa03c62 6334 case IORING_OP_SENDMSG:
889fca73 6335 ret = io_sendmsg(req, issue_flags);
062d04d7 6336 break;
fddaface 6337 case IORING_OP_SEND:
889fca73 6338 ret = io_send(req, issue_flags);
0fa03c62 6339 break;
aa1fa28f 6340 case IORING_OP_RECVMSG:
889fca73 6341 ret = io_recvmsg(req, issue_flags);
062d04d7 6342 break;
fddaface 6343 case IORING_OP_RECV:
889fca73 6344 ret = io_recv(req, issue_flags);
aa1fa28f 6345 break;
5262f567 6346 case IORING_OP_TIMEOUT:
61e98203 6347 ret = io_timeout(req, issue_flags);
5262f567 6348 break;
11365043 6349 case IORING_OP_TIMEOUT_REMOVE:
61e98203 6350 ret = io_timeout_remove(req, issue_flags);
11365043 6351 break;
17f2fe35 6352 case IORING_OP_ACCEPT:
889fca73 6353 ret = io_accept(req, issue_flags);
17f2fe35 6354 break;
f8e85cf2 6355 case IORING_OP_CONNECT:
889fca73 6356 ret = io_connect(req, issue_flags);
f8e85cf2 6357 break;
62755e35 6358 case IORING_OP_ASYNC_CANCEL:
61e98203 6359 ret = io_async_cancel(req, issue_flags);
62755e35 6360 break;
d63d1b5e 6361 case IORING_OP_FALLOCATE:
45d189c6 6362 ret = io_fallocate(req, issue_flags);
d63d1b5e 6363 break;
15b71abe 6364 case IORING_OP_OPENAT:
45d189c6 6365 ret = io_openat(req, issue_flags);
15b71abe 6366 break;
b5dba59e 6367 case IORING_OP_CLOSE:
889fca73 6368 ret = io_close(req, issue_flags);
b5dba59e 6369 break;
05f3fb3c 6370 case IORING_OP_FILES_UPDATE:
889fca73 6371 ret = io_files_update(req, issue_flags);
05f3fb3c 6372 break;
eddc7ef5 6373 case IORING_OP_STATX:
45d189c6 6374 ret = io_statx(req, issue_flags);
eddc7ef5 6375 break;
4840e418 6376 case IORING_OP_FADVISE:
45d189c6 6377 ret = io_fadvise(req, issue_flags);
4840e418 6378 break;
c1ca757b 6379 case IORING_OP_MADVISE:
45d189c6 6380 ret = io_madvise(req, issue_flags);
c1ca757b 6381 break;
cebdb986 6382 case IORING_OP_OPENAT2:
45d189c6 6383 ret = io_openat2(req, issue_flags);
cebdb986 6384 break;
3e4827b0 6385 case IORING_OP_EPOLL_CTL:
889fca73 6386 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 6387 break;
7d67af2c 6388 case IORING_OP_SPLICE:
45d189c6 6389 ret = io_splice(req, issue_flags);
7d67af2c 6390 break;
ddf0322d 6391 case IORING_OP_PROVIDE_BUFFERS:
889fca73 6392 ret = io_provide_buffers(req, issue_flags);
ddf0322d 6393 break;
067524e9 6394 case IORING_OP_REMOVE_BUFFERS:
889fca73 6395 ret = io_remove_buffers(req, issue_flags);
3e4827b0 6396 break;
f2a8d5c7 6397 case IORING_OP_TEE:
45d189c6 6398 ret = io_tee(req, issue_flags);
f2a8d5c7 6399 break;
36f4fa68 6400 case IORING_OP_SHUTDOWN:
45d189c6 6401 ret = io_shutdown(req, issue_flags);
36f4fa68 6402 break;
80a261fd 6403 case IORING_OP_RENAMEAT:
45d189c6 6404 ret = io_renameat(req, issue_flags);
80a261fd 6405 break;
14a1143b 6406 case IORING_OP_UNLINKAT:
45d189c6 6407 ret = io_unlinkat(req, issue_flags);
14a1143b 6408 break;
2b188cc1
JA
6409 default:
6410 ret = -EINVAL;
6411 break;
6412 }
6413
def596e9
JA
6414 if (ret)
6415 return ret;
6416
b532576e
JA
6417 /* If the op doesn't have a file, we're not polling for it */
6418 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
11ba820b
JA
6419 const bool in_async = io_wq_current_is_worker();
6420
11ba820b
JA
6421 /* workqueue context doesn't hold uring_lock, grab it now */
6422 if (in_async)
6423 mutex_lock(&ctx->uring_lock);
6424
2e9dbe90 6425 io_iopoll_req_issued(req, in_async);
11ba820b
JA
6426
6427 if (in_async)
6428 mutex_unlock(&ctx->uring_lock);
def596e9
JA
6429 }
6430
6431 return 0;
2b188cc1
JA
6432}
6433
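/*
 * io-wq worker entry point: issue the request synchronously (blocking is
 * allowed here), retrying on -EAGAIN for polled IO since we can't wait on
 * request slots, and fail it via task_work on error to avoid taking locks
 * from worker context.
 */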
5280f7e5 6434static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
6435{
6436 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6df1db6b 6437 struct io_kiocb *timeout;
561fb04a 6438 int ret = 0;
2b188cc1 6439
6df1db6b
PB
6440 timeout = io_prep_linked_timeout(req);
6441 if (timeout)
6442 io_queue_linked_timeout(timeout);
d4c81f38 6443
4014d943 6444 if (work->flags & IO_WQ_WORK_CANCEL)
561fb04a 6445 ret = -ECANCELED;
31b51510 6446
561fb04a 6447 if (!ret) {
561fb04a 6448 do {
889fca73 6449 ret = io_issue_sqe(req, 0);
561fb04a
JA
6450 /*
6451 * We can get EAGAIN for polled IO even though we're
6452 * forcing a sync submission from here, since we can't
6453 * wait for request slots on the block side.
6454 */
6455 if (ret != -EAGAIN)
6456 break;
6457 cond_resched();
6458 } while (1);
6459 }
31b51510 6460
a3df7698 6461 /* avoid locking problems by failing it from a clean context */
561fb04a 6462 if (ret) {
a3df7698
PB
6463 /* io-wq is going to take one down */
6464 refcount_inc(&req->refs);
6465 io_req_task_queue_fail(req, ret);
edafccee 6466 }
2b188cc1
JA
6467}
6468
65e19f54
JA
6469static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6470 int index)
6471{
269bbe5f 6472 struct fixed_rsrc_table *table;
65e19f54 6473
05f3fb3c 6474 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
84695089 6475 return table->files[index & IORING_FILE_TABLE_MASK];
65e19f54
JA
6476}
6477
8371adf5
PB
6478static struct file *io_file_get(struct io_submit_state *state,
6479 struct io_kiocb *req, int fd, bool fixed)
09bb8394 6480{
a197f664 6481 struct io_ring_ctx *ctx = req->ctx;
8da11c19 6482 struct file *file;
09bb8394 6483
8da11c19 6484 if (fixed) {
479f517b 6485 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
8371adf5 6486 return NULL;
b7620121 6487 fd = array_index_nospec(fd, ctx->nr_user_files);
8da11c19 6488 file = io_file_from_index(ctx, fd);
36f72fe2 6489 io_set_resource_node(req);
09bb8394 6490 } else {
c826bd7a 6491 trace_io_uring_file_get(ctx, fd);
8da11c19 6492 file = __io_file_get(state, fd);
09bb8394
JA
6493 }
6494
ce3d5aae
PB
6495 if (file && unlikely(file->f_op == &io_uring_fops))
6496 io_req_track_inflight(req);
8371adf5 6497 return file;
09bb8394
JA
6498}
6499
2665abfd 6500static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 6501{
ad8a48ac
JA
6502 struct io_timeout_data *data = container_of(timer,
6503 struct io_timeout_data, timer);
90cd7e42 6504 struct io_kiocb *prev, *req = data->req;
2665abfd 6505 struct io_ring_ctx *ctx = req->ctx;
2665abfd 6506 unsigned long flags;
2665abfd
JA
6507
6508 spin_lock_irqsave(&ctx->completion_lock, flags);
90cd7e42
PB
6509 prev = req->timeout.head;
6510 req->timeout.head = NULL;
2665abfd
JA
6511
6512 /*
 6513 * We don't expect the list to be empty; that will only happen if we
6514 * race with the completion of the linked work.
6515 */
90cd7e42 6516 if (prev && refcount_inc_not_zero(&prev->refs))
f2f87370 6517 io_remove_next_linked(prev);
90cd7e42
PB
6518 else
6519 prev = NULL;
2665abfd
JA
6520 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6521
6522 if (prev) {
4e88d6e7 6523 req_set_fail_links(prev);
014db007 6524 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
9ae1f8dd 6525 io_put_req_deferred(prev, 1);
47f46768 6526 } else {
9ae1f8dd
PB
6527 io_req_complete_post(req, -ETIME, 0);
6528 io_put_req_deferred(req, 1);
2665abfd 6529 }
2665abfd
JA
6530 return HRTIMER_NORESTART;
6531}
6532
7271ef3a 6533static void __io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 6534{
76a46e06 6535 /*
f2f87370
PB
6536 * If the back reference is NULL, then our linked request finished
 6537 * before we got a chance to set up the timer
76a46e06 6538 */
90cd7e42 6539 if (req->timeout.head) {
e8c2bc1f 6540 struct io_timeout_data *data = req->async_data;
94ae5e77 6541
ad8a48ac
JA
6542 data->timer.function = io_link_timeout_fn;
6543 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6544 data->mode);
2665abfd 6545 }
7271ef3a
JA
6546}
6547
6548static void io_queue_linked_timeout(struct io_kiocb *req)
6549{
6550 struct io_ring_ctx *ctx = req->ctx;
6551
6552 spin_lock_irq(&ctx->completion_lock);
6553 __io_queue_linked_timeout(req);
76a46e06 6554 spin_unlock_irq(&ctx->completion_lock);
2665abfd 6555
2665abfd 6556 /* drop submission reference */
76a46e06
JA
6557 io_put_req(req);
6558}
2665abfd 6559
ad8a48ac 6560static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd 6561{
f2f87370 6562 struct io_kiocb *nxt = req->link;
2665abfd 6563
f2f87370
PB
6564 if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6565 nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 6566 return NULL;
2665abfd 6567
90cd7e42 6568 nxt->timeout.head = req;
900fad45 6569 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
76a46e06 6570 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 6571 return nxt;
2665abfd
JA
6572}
6573
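/*
 * Inline issue path: try a nonblocking issue first. On -EAGAIN, either
 * arm the internal poll handler or punt to io-wq; inline completions are
 * batched in the submit state's completion cache and flushed when full.
 */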
c5eef2b9 6574static void __io_queue_sqe(struct io_kiocb *req)
2b188cc1 6575{
d3d7298d 6576 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
193155c8 6577 const struct cred *old_creds = NULL;
e0c5c576 6578 int ret;
2b188cc1 6579
2e5aa6cb
PB
6580 if ((req->flags & REQ_F_WORK_INITIALIZED) &&
6581 (req->work.flags & IO_WQ_WORK_CREDS) &&
d3d7298d
PB
6582 req->work.identity->creds != current_cred())
6583 old_creds = override_creds(req->work.identity->creds);
193155c8 6584
c5eef2b9 6585 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 6586
d3d7298d
PB
6587 if (old_creds)
6588 revert_creds(old_creds);
491381ce
JA
6589
6590 /*
6591 * We async punt it if the file wasn't marked NOWAIT, or if the file
6592 * doesn't support non-blocking read/write attempts
6593 */
24c74678 6594 if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
f063c547 6595 if (!io_arm_poll_handler(req)) {
f063c547
PB
6596 /*
6597 * Queued up for async execution, worker will release
6598 * submit reference when the iocb is actually submitted.
6599 */
6600 io_queue_async_work(req);
2b188cc1 6601 }
0d63c148
PB
6602 } else if (likely(!ret)) {
6603 /* drop submission reference */
e342c807 6604 if (req->flags & REQ_F_COMPLETE_INLINE) {
c5eef2b9
PB
6605 struct io_ring_ctx *ctx = req->ctx;
6606 struct io_comp_state *cs = &ctx->submit_state.comp;
e65ef56d 6607
6dd0be1e 6608 cs->reqs[cs->nr++] = req;
d3d7298d 6609 if (cs->nr == ARRAY_SIZE(cs->reqs))
c5eef2b9 6610 io_submit_flush_completions(cs, ctx);
9affd664 6611 } else {
d3d7298d 6612 io_put_req(req);
0d63c148
PB
6613 }
6614 } else {
4e88d6e7 6615 req_set_fail_links(req);
e65ef56d 6616 io_put_req(req);
e1e16097 6617 io_req_complete(req, ret);
9e645e11 6618 }
d3d7298d
PB
6619 if (linked_timeout)
6620 io_queue_linked_timeout(linked_timeout);
2b188cc1
JA
6621}
6622
be7053b7 6623static void io_queue_sqe(struct io_kiocb *req)
4fe2c963
JL
6624{
6625 int ret;
6626
be7053b7 6627 ret = io_req_defer(req);
4fe2c963
JL
6628 if (ret) {
6629 if (ret != -EIOCBQUEUED) {
1118591a 6630fail_req:
4e88d6e7 6631 req_set_fail_links(req);
e1e16097
JA
6632 io_put_req(req);
6633 io_req_complete(req, ret);
4fe2c963 6634 }
2550878f 6635 } else if (req->flags & REQ_F_FORCE_ASYNC) {
be7053b7
PB
6636 ret = io_req_defer_prep(req);
6637 if (unlikely(ret))
6638 goto fail_req;
ce35a47a
JA
6639 io_queue_async_work(req);
6640 } else {
c5eef2b9 6641 __io_queue_sqe(req);
ce35a47a 6642 }
4fe2c963
JL
6643}
6644
b16fed66
PB
6645/*
6646 * Check SQE restrictions (opcode and flags).
6647 *
6648 * Returns 'true' if SQE is allowed, 'false' otherwise.
6649 */
6650static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6651 struct io_kiocb *req,
6652 unsigned int sqe_flags)
4fe2c963 6653{
b16fed66
PB
6654 if (!ctx->restricted)
6655 return true;
6656
6657 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6658 return false;
6659
6660 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6661 ctx->restrictions.sqe_flags_required)
6662 return false;
6663
6664 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6665 ctx->restrictions.sqe_flags_required))
6666 return false;
6667
6668 return true;
4fe2c963
JL
6669}
6670
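/*
 * One-time initialization of a request from its SQE: copy flags and
 * user_data, validate the opcode and flags against the ring's
 * restrictions, pick up personality credentials if requested, start a
 * block plug when batching read/write, and resolve the target file.
 */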
b16fed66
PB
6671static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6672 const struct io_uring_sqe *sqe)
6673{
6674 struct io_submit_state *state;
6675 unsigned int sqe_flags;
6676 int id, ret = 0;
6677
6678 req->opcode = READ_ONCE(sqe->opcode);
6679 /* same numerical values with corresponding REQ_F_*, safe to copy */
6680 req->flags = sqe_flags = READ_ONCE(sqe->flags);
6681 req->user_data = READ_ONCE(sqe->user_data);
6682 req->async_data = NULL;
6683 req->file = NULL;
6684 req->ctx = ctx;
6685 req->link = NULL;
6686 req->fixed_rsrc_refs = NULL;
6687 /* one is dropped after submission, the other at completion */
6688 refcount_set(&req->refs, 2);
6689 req->task = current;
6690 req->result = 0;
863e0560 6691
b16fed66 6692 /* enforce forwards compatibility on users */
ebf4a5db
PB
6693 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
6694 req->flags = 0;
b16fed66 6695 return -EINVAL;
ebf4a5db 6696 }
b16fed66
PB
6697
6698 if (unlikely(req->opcode >= IORING_OP_LAST))
6699 return -EINVAL;
6700
6701 if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
6702 return -EFAULT;
6703
6704 if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6705 return -EACCES;
6706
6707 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6708 !io_op_defs[req->opcode].buffer_select)
6709 return -EOPNOTSUPP;
6710
6711 id = READ_ONCE(sqe->personality);
6712 if (id) {
6713 struct io_identity *iod;
6714
6715 iod = idr_find(&ctx->personality_idr, id);
6716 if (unlikely(!iod))
6717 return -EINVAL;
6718 refcount_inc(&iod->count);
6719
6720 __io_req_init_async(req);
6721 get_cred(iod->creds);
6722 req->work.identity = iod;
6723 req->work.flags |= IO_WQ_WORK_CREDS;
6724 }
6725
6726 state = &ctx->submit_state;
6727
6728 /*
6729 * Plug now if we have more than 1 IO left after this, and the target
6730 * is potentially a read/write to block based storage.
6731 */
6732 if (!state->plug_started && state->ios_left > 1 &&
6733 io_op_defs[req->opcode].plug) {
6734 blk_start_plug(&state->plug);
6735 state->plug_started = true;
6736 }
6737
6738 if (io_op_defs[req->opcode].needs_file) {
6739 bool fixed = req->flags & REQ_F_FIXED_FILE;
6740
6741 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6742 if (unlikely(!req->file))
6743 ret = -EBADF;
6744 }
6745
6746 state->ios_left--;
6747 return ret;
6748}
6749
a6b8cadc 6750static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 6751 const struct io_uring_sqe *sqe)
9e645e11 6752{
a1ab7b35 6753 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 6754 int ret;
9e645e11 6755
a6b8cadc
PB
6756 ret = io_init_req(ctx, req, sqe);
6757 if (unlikely(ret)) {
6758fail_req:
6759 io_put_req(req);
6760 io_req_complete(req, ret);
de59bc10
PB
6761 if (link->head) {
6762 /* fail even hard links since we don't submit */
cf109604 6763 link->head->flags |= REQ_F_FAIL_LINK;
de59bc10
PB
6764 io_put_req(link->head);
6765 io_req_complete(link->head, -ECANCELED);
6766 link->head = NULL;
6767 }
a6b8cadc
PB
6768 return ret;
6769 }
be7053b7
PB
6770 ret = io_req_prep(req, sqe);
6771 if (unlikely(ret))
6772 goto fail_req;
a6b8cadc 6773
be7053b7 6774 /* don't need @sqe from now on */
a6b8cadc
PB
6775 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6776 true, ctx->flags & IORING_SETUP_SQPOLL);
6777
9e645e11
JA
6778 /*
6779 * If we already have a head request, queue this one for async
6780 * submittal once the head completes. If we don't have a head but
6781 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6782 * submitted sync once the chain is complete. If none of those
6783 * conditions are true (normal request), then just queue it.
6784 */
863e0560
PB
6785 if (link->head) {
6786 struct io_kiocb *head = link->head;
4e88d6e7 6787
8cdf2193
PB
6788 /*
 6789 * Since a link executes sequentially, draining both sides
 6790 * of the link also fulfills IOSQE_IO_DRAIN semantics for all
6791 * requests in the link. So, it drains the head and the
6792 * next after the link request. The last one is done via
6793 * drain_next flag to persist the effect across calls.
6794 */
ef4ff581 6795 if (req->flags & REQ_F_IO_DRAIN) {
711be031
PB
6796 head->flags |= REQ_F_IO_DRAIN;
6797 ctx->drain_next = 1;
6798 }
be7053b7 6799 ret = io_req_defer_prep(req);
cf109604 6800 if (unlikely(ret))
a6b8cadc 6801 goto fail_req;
9d76377f 6802 trace_io_uring_link(ctx, req, head);
f2f87370 6803 link->last->link = req;
863e0560 6804 link->last = req;
32fe525b
PB
6805
6806 /* last request of a link, enqueue the link */
ef4ff581 6807 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
de59bc10 6808 io_queue_sqe(head);
863e0560 6809 link->head = NULL;
32fe525b 6810 }
9e645e11 6811 } else {
711be031
PB
6812 if (unlikely(ctx->drain_next)) {
6813 req->flags |= REQ_F_IO_DRAIN;
ef4ff581 6814 ctx->drain_next = 0;
711be031 6815 }
ef4ff581 6816 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
863e0560
PB
6817 link->head = req;
6818 link->last = req;
711be031 6819 } else {
be7053b7 6820 io_queue_sqe(req);
711be031 6821 }
9e645e11 6822 }
2e6e1fde 6823
1d4240cc 6824 return 0;
9e645e11
JA
6825}
6826
9a56a232
JA
6827/*
6828 * Batched submission is done, ensure local IO is flushed out.
6829 */
ba88ff11
PB
6830static void io_submit_state_end(struct io_submit_state *state,
6831 struct io_ring_ctx *ctx)
9a56a232 6832{
a1ab7b35 6833 if (state->link.head)
de59bc10 6834 io_queue_sqe(state->link.head);
6dd0be1e 6835 if (state->comp.nr)
ba88ff11 6836 io_submit_flush_completions(&state->comp, ctx);
27926b68
JA
6837 if (state->plug_started)
6838 blk_finish_plug(&state->plug);
9f13c35b 6839 io_state_file_put(state);
9a56a232
JA
6840}
6841
6842/*
6843 * Start submission side cache.
6844 */
6845static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 6846 unsigned int max_ios)
9a56a232 6847{
27926b68 6848 state->plug_started = false;
9a56a232 6849 state->ios_left = max_ios;
a1ab7b35
PB
6850 /* set only head, no need to init link_last in advance */
6851 state->link.head = NULL;
9a56a232
JA
6852}
6853
2b188cc1
JA
6854static void io_commit_sqring(struct io_ring_ctx *ctx)
6855{
75b28aff 6856 struct io_rings *rings = ctx->rings;
2b188cc1 6857
caf582c6
PB
6858 /*
6859 * Ensure any loads from the SQEs are done at this point,
6860 * since once we write the new head, the application could
6861 * write new data to them.
6862 */
6863 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
6864}
6865
2b188cc1 6866/*
3529d8c2 6867 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
6868 * that is mapped by userspace. This means that care needs to be taken to
6869 * ensure that reads are stable, as we cannot rely on userspace always
6870 * being a good citizen. If members of the sqe are validated and then later
6871 * used, it's important that those reads are done through READ_ONCE() to
6872 * prevent a re-load down the line.
6873 */
709b302f 6874static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 6875{
75b28aff 6876 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
6877 unsigned head;
6878
6879 /*
6880 * The cached sq head (or cq tail) serves two purposes:
6881 *
6882 * 1) allows us to batch the cost of updating the user visible
6883 * head updates.
6884 * 2) allows the kernel side to track the head on its own, even
6885 * though the application is the one updating it.
6886 */
4fccfcbb 6887 head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
709b302f
PB
6888 if (likely(head < ctx->sq_entries))
6889 return &ctx->sq_sqes[head];
2b188cc1
JA
6890
6891 /* drop invalid entries */
498ccd9e 6892 ctx->cached_sq_dropped++;
ee7d46d9 6893 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
709b302f
PB
6894 return NULL;
6895}
6896
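The rule the comment above describes, as one self-contained kernel-style sketch (not an io_uring function; assumes the kernel's READ_ONCE() and u32 from headers this file already pulls in): load the user-writable value exactly once, validate that copy, and only ever use the validated copy.

/* 'shared' is writable by an untrusted party; 'nr' is the trusted bound */
static int read_validated_index(const u32 *shared, u32 slot, u32 nr, u32 *out)
{
	u32 v = READ_ONCE(shared[slot]);	/* single load, never re-read */

	if (v >= nr)
		return -EINVAL;
	*out = v;				/* use only the validated copy */
	return 0;
}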
0f212204 6897static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6c271ce2 6898{
46c4e16a 6899 int submitted = 0;
6c271ce2 6900
c4a2ed72 6901 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8 6902 if (test_bit(0, &ctx->sq_check_overflow)) {
6c503150 6903 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
ad3eb2c8
JA
6904 return -EBUSY;
6905 }
6c271ce2 6906
ee7d46d9
PB
6907 /* make sure SQ entry isn't read before tail */
6908 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 6909
2b85edfc
PB
6910 if (!percpu_ref_tryget_many(&ctx->refs, nr))
6911 return -EAGAIN;
6c271ce2 6912
d8a6df10 6913 percpu_counter_add(&current->io_uring->inflight, nr);
faf7b51c 6914 refcount_add(nr, &current->usage);
ba88ff11 6915 io_submit_state_start(&ctx->submit_state, nr);
b14cca0c 6916
46c4e16a 6917 while (submitted < nr) {
3529d8c2 6918 const struct io_uring_sqe *sqe;
196be95c 6919 struct io_kiocb *req;
fb5ccc98 6920
258b29a9 6921 req = io_alloc_req(ctx);
196be95c
PB
6922 if (unlikely(!req)) {
6923 if (!submitted)
6924 submitted = -EAGAIN;
fb5ccc98 6925 break;
196be95c 6926 }
4fccfcbb
PB
6927 sqe = io_get_sqe(ctx);
6928 if (unlikely(!sqe)) {
6929 kmem_cache_free(req_cachep, req);
6930 break;
6931 }
d3656344
JA
6932 /* will complete beyond this point, count as submitted */
6933 submitted++;
a1ab7b35 6934 if (io_submit_sqe(ctx, req, sqe))
196be95c 6935 break;
6c271ce2
JA
6936 }
6937
9466f437
PB
6938 if (unlikely(submitted != nr)) {
6939 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
d8a6df10
JA
6940 struct io_uring_task *tctx = current->io_uring;
6941 int unused = nr - ref_used;
9466f437 6942
d8a6df10
JA
6943 percpu_ref_put_many(&ctx->refs, unused);
6944 percpu_counter_sub(&tctx->inflight, unused);
6945 put_task_struct_many(current, unused);
9466f437 6946 }
6c271ce2 6947
a1ab7b35 6948 io_submit_state_end(&ctx->submit_state, ctx);
ae9428ca
PB
6949 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6950 io_commit_sqring(ctx);
6951
6c271ce2
JA
6952 return submitted;
6953}
6954
23b3628e
XW
6955static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6956{
6957 /* Tell userspace we may need a wakeup call */
6958 spin_lock_irq(&ctx->completion_lock);
6959 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6960 spin_unlock_irq(&ctx->completion_lock);
6961}
6962
6963static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6964{
6965 spin_lock_irq(&ctx->completion_lock);
6966 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6967 spin_unlock_irq(&ctx->completion_lock);
6968}
6969
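These helpers publish IORING_SQ_NEED_WAKEUP in the shared SQ flags while the poll thread sleeps. A hedged sketch of the matching userspace side, which checks the flag after updating the SQ tail and kicks the thread via IORING_ENTER_SQ_WAKEUP (raw syscall; assumes __NR_io_uring_enter is defined and that sq_flags points into the mmap'ed SQ ring):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int kick_sqpoll_if_needed(int ring_fd, const volatile unsigned *sq_flags)
{
	/* a full barrier after the tail store must precede this load */
	__sync_synchronize();
	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
		return (int)syscall(__NR_io_uring_enter, ring_fd, 0, 0,
				    IORING_ENTER_SQ_WAKEUP, NULL, 0);
	return 0;
}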
08369246 6970static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 6971{
c8d1ba58 6972 unsigned int to_submit;
bdcd3eab 6973 int ret = 0;
6c271ce2 6974
c8d1ba58 6975 to_submit = io_sqring_entries(ctx);
e95eee2d
JA
6976 /* if we're handling multiple rings, cap submit size for fairness */
6977 if (cap_entries && to_submit > 8)
6978 to_submit = 8;
6979
906a3c6f 6980 if (!list_empty(&ctx->iopoll_list) || to_submit) {
c8d1ba58 6981 unsigned nr_events = 0;
a4c0b3de 6982
c8d1ba58 6983 mutex_lock(&ctx->uring_lock);
906a3c6f 6984 if (!list_empty(&ctx->iopoll_list))
c8d1ba58 6985 io_do_iopoll(ctx, &nr_events, 0);
906a3c6f 6986
d9d05217
PB
6987 if (to_submit && !ctx->sqo_dead &&
6988 likely(!percpu_ref_is_dying(&ctx->refs)))
08369246 6989 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58
JA
6990 mutex_unlock(&ctx->uring_lock);
6991 }
6c271ce2 6992
90554200
JA
6993 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6994 wake_up(&ctx->sqo_sq_wait);
6c271ce2 6995
08369246
XW
6996 return ret;
6997}
6c271ce2 6998
08369246
XW
6999static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7000{
7001 struct io_ring_ctx *ctx;
7002 unsigned sq_thread_idle = 0;
6c271ce2 7003
08369246
XW
7004 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7005 if (sq_thread_idle < ctx->sq_thread_idle)
7006 sq_thread_idle = ctx->sq_thread_idle;
c8d1ba58 7007 }
c1edbf5f 7008
08369246 7009 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 7010}
6c271ce2 7011
69fb2131
JA
7012static void io_sqd_init_new(struct io_sq_data *sqd)
7013{
7014 struct io_ring_ctx *ctx;
7015
7016 while (!list_empty(&sqd->ctx_new_list)) {
7017 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
69fb2131
JA
7018 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
7019 complete(&ctx->sq_thread_comp);
7020 }
08369246
XW
7021
7022 io_sqd_update_thread_idle(sqd);
69fb2131
JA
7023}
7024
c8d1ba58
JA
7025static int io_sq_thread(void *data)
7026{
91d8f519 7027 struct cgroup_subsys_state *cur_css = NULL;
28cea78a
JA
7028 struct files_struct *old_files = current->files;
7029 struct nsproxy *old_nsproxy = current->nsproxy;
69fb2131
JA
7030 const struct cred *old_cred = NULL;
7031 struct io_sq_data *sqd = data;
7032 struct io_ring_ctx *ctx;
a0d9205f 7033 unsigned long timeout = 0;
08369246 7034 DEFINE_WAIT(wait);
6c271ce2 7035
28cea78a
JA
7036 task_lock(current);
7037 current->files = NULL;
7038 current->nsproxy = NULL;
7039 task_unlock(current);
6c271ce2 7040
69fb2131 7041 while (!kthread_should_stop()) {
08369246
XW
7042 int ret;
7043 bool cap_entries, sqt_spin, needs_sched;
c1edbf5f
JA
7044
7045 /*
69fb2131
JA
7046 * Any changes to the sqd lists are synchronized through the
 7047 * kthread parking. This synchronizes the thread with its users;
 7048 * the users themselves are synchronized on the sqd->ctx_lock.
c1edbf5f 7049 */
65b2b213 7050 if (kthread_should_park()) {
69fb2131 7051 kthread_parkme();
65b2b213
XW
7052 /*
 7053 * When the sq thread is unparked, the previous park may have come
 7054 * from io_put_sq_data(), which means the thread is about to be
 7055 * stopped, so check for that here.
7056 */
7057 if (kthread_should_stop())
7058 break;
7059 }
7143b5ac 7060
08369246 7061 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
69fb2131 7062 io_sqd_init_new(sqd);
08369246
XW
7063 timeout = jiffies + sqd->sq_thread_idle;
7064 }
6c271ce2 7065
08369246 7066 sqt_spin = false;
e95eee2d 7067 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131
JA
7068 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7069 if (current->cred != ctx->creds) {
7070 if (old_cred)
7071 revert_creds(old_cred);
7072 old_cred = override_creds(ctx->creds);
bdcd3eab 7073 }
91d8f519 7074 io_sq_thread_associate_blkcg(ctx, &cur_css);
4ea33a97
JA
7075#ifdef CONFIG_AUDIT
7076 current->loginuid = ctx->loginuid;
7077 current->sessionid = ctx->sessionid;
7078#endif
bdcd3eab 7079
08369246
XW
7080 ret = __io_sq_thread(ctx, cap_entries);
7081 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
7082 sqt_spin = true;
6c271ce2 7083
28cea78a 7084 io_sq_thread_drop_mm_files();
69fb2131 7085 }
6c271ce2 7086
08369246 7087 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 7088 io_run_task_work();
d434ab6d 7089 io_sq_thread_drop_mm_files();
c8d1ba58 7090 cond_resched();
08369246
XW
7091 if (sqt_spin)
7092 timeout = jiffies + sqd->sq_thread_idle;
7093 continue;
7094 }
7095
08369246
XW
7096 needs_sched = true;
7097 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7098 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7099 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7100 !list_empty_careful(&ctx->iopoll_list)) {
7101 needs_sched = false;
7102 break;
7103 }
7104 if (io_sqring_entries(ctx)) {
7105 needs_sched = false;
7106 break;
7107 }
7108 }
7109
8b28fdf2 7110 if (needs_sched && !kthread_should_park()) {
69fb2131
JA
7111 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7112 io_ring_set_wakeup_flag(ctx);
08369246 7113
69fb2131 7114 schedule();
69fb2131
JA
7115 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7116 io_ring_clear_wakeup_flag(ctx);
6c271ce2 7117 }
08369246
XW
7118
7119 finish_wait(&sqd->wait, &wait);
7120 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2
JA
7121 }
7122
4c6e277c 7123 io_run_task_work();
d434ab6d 7124 io_sq_thread_drop_mm_files();
b41e9852 7125
91d8f519
DZ
7126 if (cur_css)
7127 io_sq_thread_unassociate_blkcg();
69fb2131
JA
7128 if (old_cred)
7129 revert_creds(old_cred);
06058632 7130
28cea78a
JA
7131 task_lock(current);
7132 current->files = old_files;
7133 current->nsproxy = old_nsproxy;
7134 task_unlock(current);
7135
2bbcd6d3 7136 kthread_parkme();
06058632 7137
6c271ce2
JA
7138 return 0;
7139}
7140
bda52162
JA
7141struct io_wait_queue {
7142 struct wait_queue_entry wq;
7143 struct io_ring_ctx *ctx;
7144 unsigned to_wait;
7145 unsigned nr_timeouts;
7146};
7147
6c503150 7148static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
7149{
7150 struct io_ring_ctx *ctx = iowq->ctx;
7151
7152 /*
d195a66e 7153 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
7154 * started waiting. For timeouts, we always want to return to userspace,
7155 * regardless of event count.
7156 */
6c503150 7157 return io_cqring_events(ctx) >= iowq->to_wait ||
bda52162
JA
7158 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7159}
7160
7161static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7162 int wake_flags, void *key)
7163{
7164 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7165 wq);
7166
6c503150
PB
7167 /*
 7168 * Cannot safely flush overflowed CQEs from here, so ensure we wake
 7169 * up the task; the next invocation will do the flush.
7170 */
7171 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
7172 return autoremove_wake_function(curr, mode, wake_flags, key);
7173 return -1;
bda52162
JA
7174}
7175
af9c1a44
JA
7176static int io_run_task_work_sig(void)
7177{
7178 if (io_run_task_work())
7179 return 1;
7180 if (!signal_pending(current))
7181 return 0;
792ee0f6
JA
7182 if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
7183 return -ERESTARTSYS;
af9c1a44
JA
7184 return -EINTR;
7185}
7186
eeb60b9a
PB
7187/* when returns >0, the caller should retry */
7188static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7189 struct io_wait_queue *iowq,
7190 signed long *timeout)
7191{
7192 int ret;
7193
7194 /* make sure we run task_work before checking for signals */
7195 ret = io_run_task_work_sig();
7196 if (ret || io_should_wake(iowq))
7197 return ret;
7198 /* let the caller flush overflows, retry */
7199 if (test_bit(0, &ctx->cq_check_overflow))
7200 return 1;
7201
7202 *timeout = schedule_timeout(*timeout);
7203 return !*timeout ? -ETIME : 1;
7204}
7205
2b188cc1
JA
7206/*
7207 * Wait until events become available, if we don't already have some. The
7208 * application must reap them itself, as they reside on the shared cq ring.
7209 */
7210static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
7211 const sigset_t __user *sig, size_t sigsz,
7212 struct __kernel_timespec __user *uts)
2b188cc1 7213{
bda52162
JA
7214 struct io_wait_queue iowq = {
7215 .wq = {
7216 .private = current,
7217 .func = io_wake_function,
7218 .entry = LIST_HEAD_INIT(iowq.wq.entry),
7219 },
7220 .ctx = ctx,
7221 .to_wait = min_events,
7222 };
75b28aff 7223 struct io_rings *rings = ctx->rings;
c1d5a224
PB
7224 signed long timeout = MAX_SCHEDULE_TIMEOUT;
7225 int ret;
2b188cc1 7226
b41e9852 7227 do {
6c503150
PB
7228 io_cqring_overflow_flush(ctx, false, NULL, NULL);
7229 if (io_cqring_events(ctx) >= min_events)
b41e9852 7230 return 0;
4c6e277c 7231 if (!io_run_task_work())
b41e9852 7232 break;
b41e9852 7233 } while (1);
2b188cc1
JA
7234
7235 if (sig) {
9e75ad5d
AB
7236#ifdef CONFIG_COMPAT
7237 if (in_compat_syscall())
7238 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 7239 sigsz);
9e75ad5d
AB
7240 else
7241#endif
b772434b 7242 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 7243
2b188cc1
JA
7244 if (ret)
7245 return ret;
7246 }
7247
c73ebb68 7248 if (uts) {
c1d5a224
PB
7249 struct timespec64 ts;
7250
c73ebb68
HX
7251 if (get_timespec64(&ts, uts))
7252 return -EFAULT;
7253 timeout = timespec64_to_jiffies(&ts);
7254 }
7255
bda52162 7256 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 7257 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 7258 do {
6c503150 7259 io_cqring_overflow_flush(ctx, false, NULL, NULL);
bda52162
JA
7260 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
7261 TASK_INTERRUPTIBLE);
eeb60b9a
PB
7262 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7263 finish_wait(&ctx->wait, &iowq.wq);
7264 } while (ret > 0);
bda52162 7265
b7db41c9 7266 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 7267
75b28aff 7268 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
7269}
7270
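io_cqring_wait() backs the IORING_ENTER_GETEVENTS path of io_uring_enter(2). A minimal userspace sketch of blocking until at least min_events completions are available (raw syscall; assumes __NR_io_uring_enter is defined):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int wait_for_cqes(int ring_fd, unsigned min_events)
{
	/* submit nothing, just wait for min_events completions */
	return (int)syscall(__NR_io_uring_enter, ring_fd, 0, min_events,
			    IORING_ENTER_GETEVENTS, NULL, 0);
}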
6b06314c
JA
7271static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7272{
7273#if defined(CONFIG_UNIX)
7274 if (ctx->ring_sock) {
7275 struct sock *sock = ctx->ring_sock->sk;
7276 struct sk_buff *skb;
7277
7278 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7279 kfree_skb(skb);
7280 }
7281#else
7282 int i;
7283
65e19f54
JA
7284 for (i = 0; i < ctx->nr_user_files; i++) {
7285 struct file *file;
7286
7287 file = io_file_from_index(ctx, i);
7288 if (file)
7289 fput(file);
7290 }
6b06314c
JA
7291#endif
7292}
7293
00835dce 7294static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
05f3fb3c 7295{
269bbe5f 7296 struct fixed_rsrc_data *data;
05f3fb3c 7297
269bbe5f 7298 data = container_of(ref, struct fixed_rsrc_data, refs);
05f3fb3c
JA
7299 complete(&data->done);
7300}
7301
2a63b2d9 7302static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
1642b445 7303{
2a63b2d9 7304 spin_lock_bh(&ctx->rsrc_ref_lock);
1642b445
PB
7305}
7306
2a63b2d9 7307static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
6b06314c 7308{
2a63b2d9
BM
7309 spin_unlock_bh(&ctx->rsrc_ref_lock);
7310}
65e19f54 7311
d67d2263
BM
7312static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
7313 struct fixed_rsrc_data *rsrc_data,
269bbe5f 7314 struct fixed_rsrc_ref_node *ref_node)
1642b445 7315{
2a63b2d9 7316 io_rsrc_ref_lock(ctx);
269bbe5f 7317 rsrc_data->node = ref_node;
d67d2263 7318 list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
2a63b2d9 7319 io_rsrc_ref_unlock(ctx);
269bbe5f 7320 percpu_ref_get(&rsrc_data->refs);
1642b445
PB
7321}
7322
8bad28d8 7323static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6b06314c 7324{
8bad28d8 7325 struct fixed_rsrc_ref_node *ref_node = NULL;
6b06314c 7326
2a63b2d9 7327 io_rsrc_ref_lock(ctx);
1e5d770b 7328 ref_node = data->node;
e6cb007c 7329 data->node = NULL;
2a63b2d9 7330 io_rsrc_ref_unlock(ctx);
05589553
XW
7331 if (ref_node)
7332 percpu_ref_kill(&ref_node->refs);
8bad28d8 7333}
05589553 7334
8bad28d8
HX
7335static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
7336 struct io_ring_ctx *ctx,
f2303b1f
PB
7337 void (*rsrc_put)(struct io_ring_ctx *ctx,
7338 struct io_rsrc_put *prsrc))
8bad28d8 7339{
f2303b1f 7340 struct fixed_rsrc_ref_node *backup_node;
8bad28d8 7341 int ret;
05589553 7342
8bad28d8
HX
7343 if (data->quiesce)
7344 return -ENXIO;
05589553 7345
8bad28d8 7346 data->quiesce = true;
1ffc5422 7347 do {
f2303b1f
PB
7348 ret = -ENOMEM;
7349 backup_node = alloc_fixed_rsrc_ref_node(ctx);
7350 if (!backup_node)
7351 break;
7352 backup_node->rsrc_data = data;
7353 backup_node->rsrc_put = rsrc_put;
7354
8bad28d8
HX
7355 io_sqe_rsrc_kill_node(ctx, data);
7356 percpu_ref_kill(&data->refs);
7357 flush_delayed_work(&ctx->rsrc_put_work);
7358
1ffc5422 7359 ret = wait_for_completion_interruptible(&data->done);
88f171ab 7360 if (!ret || !io_refs_resurrect(&data->refs, &data->done))
1ffc5422 7361 break;
8bad28d8 7362
8bad28d8
HX
7363 io_sqe_rsrc_set_node(ctx, data, backup_node);
7364 backup_node = NULL;
8bad28d8 7365 mutex_unlock(&ctx->uring_lock);
1ffc5422 7366 ret = io_run_task_work_sig();
8bad28d8 7367 mutex_lock(&ctx->uring_lock);
f2303b1f 7368 } while (ret >= 0);
8bad28d8 7369 data->quiesce = false;
05f3fb3c 7370
8bad28d8
HX
7371 if (backup_node)
7372 destroy_fixed_rsrc_ref_node(backup_node);
7373 return ret;
d7954b2b
BM
7374}
7375
1ad555c6
BM
7376static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7377{
7378 struct fixed_rsrc_data *data;
7379
7380 data = kzalloc(sizeof(*data), GFP_KERNEL);
7381 if (!data)
7382 return NULL;
7383
00835dce 7384 if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
1ad555c6
BM
7385 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7386 kfree(data);
7387 return NULL;
7388 }
7389 data->ctx = ctx;
7390 init_completion(&data->done);
7391 return data;
7392}
7393
7394static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
7395{
7396 percpu_ref_exit(&data->refs);
7397 kfree(data->table);
7398 kfree(data);
7399}
7400
d7954b2b
BM
7401static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7402{
7403 struct fixed_rsrc_data *data = ctx->file_data;
d7954b2b
BM
7404 unsigned nr_tables, i;
7405 int ret;
7406
8bad28d8
HX
7407 /*
 7408 * percpu_ref_is_dying() stops parallel file unregistration, since
 7409 * we may drop the uring lock later in this function in order to
 7410 * run task work.
7411 */
7412 if (!data || percpu_ref_is_dying(&data->refs))
d7954b2b 7413 return -ENXIO;
f2303b1f 7414 ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
d7954b2b
BM
7415 if (ret)
7416 return ret;
7417
6b06314c 7418 __io_sqe_files_unregister(ctx);
65e19f54
JA
7419 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7420 for (i = 0; i < nr_tables; i++)
05f3fb3c 7421 kfree(data->table[i].files);
1ad555c6 7422 free_fixed_rsrc_data(data);
05f3fb3c 7423 ctx->file_data = NULL;
6b06314c
JA
7424 ctx->nr_user_files = 0;
7425 return 0;
7426}
7427
534ca6d6 7428static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 7429{
534ca6d6 7430 if (refcount_dec_and_test(&sqd->refs)) {
2bbcd6d3
RP
7431 /*
 7432 * The park is a bit of a work-around; without it we get
7433 * warning spews on shutdown with SQPOLL set and affinity
7434 * set to a single CPU.
7435 */
534ca6d6
JA
7436 if (sqd->thread) {
7437 kthread_park(sqd->thread);
7438 kthread_stop(sqd->thread);
7439 }
7440
7441 kfree(sqd);
7442 }
7443}
7444
aa06165d
JA
7445static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7446{
7447 struct io_ring_ctx *ctx_attach;
7448 struct io_sq_data *sqd;
7449 struct fd f;
7450
7451 f = fdget(p->wq_fd);
7452 if (!f.file)
7453 return ERR_PTR(-ENXIO);
7454 if (f.file->f_op != &io_uring_fops) {
7455 fdput(f);
7456 return ERR_PTR(-EINVAL);
7457 }
7458
7459 ctx_attach = f.file->private_data;
7460 sqd = ctx_attach->sq_data;
7461 if (!sqd) {
7462 fdput(f);
7463 return ERR_PTR(-EINVAL);
7464 }
7465
7466 refcount_inc(&sqd->refs);
7467 fdput(f);
7468 return sqd;
7469}
7470
534ca6d6
JA
7471static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7472{
7473 struct io_sq_data *sqd;
7474
aa06165d
JA
7475 if (p->flags & IORING_SETUP_ATTACH_WQ)
7476 return io_attach_sq_data(p);
7477
534ca6d6
JA
7478 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7479 if (!sqd)
7480 return ERR_PTR(-ENOMEM);
7481
7482 refcount_set(&sqd->refs, 1);
69fb2131
JA
7483 INIT_LIST_HEAD(&sqd->ctx_list);
7484 INIT_LIST_HEAD(&sqd->ctx_new_list);
7485 mutex_init(&sqd->ctx_lock);
7486 mutex_init(&sqd->lock);
534ca6d6
JA
7487 init_waitqueue_head(&sqd->wait);
7488 return sqd;
7489}
7490
69fb2131
JA
7491static void io_sq_thread_unpark(struct io_sq_data *sqd)
7492 __releases(&sqd->lock)
7493{
7494 if (!sqd->thread)
7495 return;
7496 kthread_unpark(sqd->thread);
7497 mutex_unlock(&sqd->lock);
7498}
7499
7500static void io_sq_thread_park(struct io_sq_data *sqd)
7501 __acquires(&sqd->lock)
7502{
7503 if (!sqd->thread)
7504 return;
7505 mutex_lock(&sqd->lock);
7506 kthread_park(sqd->thread);
7507}
7508
534ca6d6
JA
7509static void io_sq_thread_stop(struct io_ring_ctx *ctx)
7510{
7511 struct io_sq_data *sqd = ctx->sq_data;
7512
7513 if (sqd) {
7514 if (sqd->thread) {
7515 /*
7516 * We may arrive here from the error branch in
7517 * io_sq_offload_create() where the kthread is created
 7518 * without being woken up, so wake it up now to make
7519 * sure the wait will complete.
7520 */
7521 wake_up_process(sqd->thread);
7522 wait_for_completion(&ctx->sq_thread_comp);
69fb2131
JA
7523
7524 io_sq_thread_park(sqd);
7525 }
7526
7527 mutex_lock(&sqd->ctx_lock);
7528 list_del(&ctx->sqd_list);
08369246 7529 io_sqd_update_thread_idle(sqd);
69fb2131
JA
7530 mutex_unlock(&sqd->ctx_lock);
7531
08369246 7532 if (sqd->thread)
69fb2131 7533 io_sq_thread_unpark(sqd);
534ca6d6
JA
7534
7535 io_put_sq_data(sqd);
7536 ctx->sq_data = NULL;
6c271ce2
JA
7537 }
7538}
7539
6b06314c
JA
7540static void io_finish_async(struct io_ring_ctx *ctx)
7541{
6c271ce2
JA
7542 io_sq_thread_stop(ctx);
7543
561fb04a
JA
7544 if (ctx->io_wq) {
7545 io_wq_destroy(ctx->io_wq);
7546 ctx->io_wq = NULL;
6b06314c
JA
7547 }
7548}
7549
7550#if defined(CONFIG_UNIX)
6b06314c
JA
7551/*
7552 * Ensure the UNIX gc is aware of our file set, so we are certain that
7553 * the io_uring can be safely unregistered on process exit, even if we have
7554 * loops in the file referencing.
7555 */
7556static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7557{
7558 struct sock *sk = ctx->ring_sock->sk;
7559 struct scm_fp_list *fpl;
7560 struct sk_buff *skb;
08a45173 7561 int i, nr_files;
6b06314c 7562
6b06314c
JA
7563 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7564 if (!fpl)
7565 return -ENOMEM;
7566
7567 skb = alloc_skb(0, GFP_KERNEL);
7568 if (!skb) {
7569 kfree(fpl);
7570 return -ENOMEM;
7571 }
7572
7573 skb->sk = sk;
6b06314c 7574
08a45173 7575 nr_files = 0;
6b06314c
JA
7576 fpl->user = get_uid(ctx->user);
7577 for (i = 0; i < nr; i++) {
65e19f54
JA
7578 struct file *file = io_file_from_index(ctx, i + offset);
7579
7580 if (!file)
08a45173 7581 continue;
65e19f54 7582 fpl->fp[nr_files] = get_file(file);
08a45173
JA
7583 unix_inflight(fpl->user, fpl->fp[nr_files]);
7584 nr_files++;
6b06314c
JA
7585 }
7586
08a45173
JA
7587 if (nr_files) {
7588 fpl->max = SCM_MAX_FD;
7589 fpl->count = nr_files;
7590 UNIXCB(skb).fp = fpl;
05f3fb3c 7591 skb->destructor = unix_destruct_scm;
08a45173
JA
7592 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7593 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 7594
08a45173
JA
7595 for (i = 0; i < nr_files; i++)
7596 fput(fpl->fp[i]);
7597 } else {
7598 kfree_skb(skb);
7599 kfree(fpl);
7600 }
6b06314c
JA
7601
7602 return 0;
7603}
7604
7605/*
7606 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7607 * causes regular reference counting to break down. We rely on the UNIX
7608 * garbage collection to take care of this problem for us.
7609 */
7610static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7611{
7612 unsigned left, total;
7613 int ret = 0;
7614
7615 total = 0;
7616 left = ctx->nr_user_files;
7617 while (left) {
7618 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
7619
7620 ret = __io_sqe_files_scm(ctx, this_files, total);
7621 if (ret)
7622 break;
7623 left -= this_files;
7624 total += this_files;
7625 }
7626
7627 if (!ret)
7628 return 0;
7629
7630 while (total < ctx->nr_user_files) {
65e19f54
JA
7631 struct file *file = io_file_from_index(ctx, total);
7632
7633 if (file)
7634 fput(file);
6b06314c
JA
7635 total++;
7636 }
7637
7638 return ret;
7639}
7640#else
7641static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7642{
7643 return 0;
7644}
7645#endif
7646
269bbe5f 7647static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
5398ae69 7648 unsigned nr_tables, unsigned nr_files)
65e19f54
JA
7649{
7650 int i;
7651
7652 for (i = 0; i < nr_tables; i++) {
269bbe5f 7653 struct fixed_rsrc_table *table = &file_data->table[i];
65e19f54
JA
7654 unsigned this_files;
7655
7656 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7657 table->files = kcalloc(this_files, sizeof(struct file *),
7658 GFP_KERNEL);
7659 if (!table->files)
7660 break;
7661 nr_files -= this_files;
7662 }
7663
7664 if (i == nr_tables)
7665 return 0;
7666
7667 for (i = 0; i < nr_tables; i++) {
269bbe5f 7668 struct fixed_rsrc_table *table = &file_data->table[i];
65e19f54
JA
7669 kfree(table->files);
7670 }
7671 return 1;
7672}
7673
50238531 7674static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 7675{
50238531 7676 struct file *file = prsrc->file;
05f3fb3c
JA
7677#if defined(CONFIG_UNIX)
7678 struct sock *sock = ctx->ring_sock->sk;
7679 struct sk_buff_head list, *head = &sock->sk_receive_queue;
7680 struct sk_buff *skb;
7681 int i;
7682
7683 __skb_queue_head_init(&list);
7684
7685 /*
7686 * Find the skb that holds this file in its SCM_RIGHTS. When found,
7687 * remove this entry and rearrange the file array.
7688 */
7689 skb = skb_dequeue(head);
7690 while (skb) {
7691 struct scm_fp_list *fp;
7692
7693 fp = UNIXCB(skb).fp;
7694 for (i = 0; i < fp->count; i++) {
7695 int left;
7696
7697 if (fp->fp[i] != file)
7698 continue;
7699
7700 unix_notinflight(fp->user, fp->fp[i]);
7701 left = fp->count - 1 - i;
7702 if (left) {
7703 memmove(&fp->fp[i], &fp->fp[i + 1],
7704 left * sizeof(struct file *));
7705 }
7706 fp->count--;
7707 if (!fp->count) {
7708 kfree_skb(skb);
7709 skb = NULL;
7710 } else {
7711 __skb_queue_tail(&list, skb);
7712 }
7713 fput(file);
7714 file = NULL;
7715 break;
7716 }
7717
7718 if (!file)
7719 break;
7720
7721 __skb_queue_tail(&list, skb);
7722
7723 skb = skb_dequeue(head);
7724 }
7725
7726 if (skb_peek(&list)) {
7727 spin_lock_irq(&head->lock);
7728 while ((skb = __skb_dequeue(&list)) != NULL)
7729 __skb_queue_tail(head, skb);
7730 spin_unlock_irq(&head->lock);
7731 }
7732#else
7733 fput(file);
7734#endif
7735}
7736
269bbe5f 7737static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
65e19f54 7738{
269bbe5f
BM
7739 struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
7740 struct io_ring_ctx *ctx = rsrc_data->ctx;
7741 struct io_rsrc_put *prsrc, *tmp;
05589553 7742
269bbe5f
BM
7743 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7744 list_del(&prsrc->list);
50238531 7745 ref_node->rsrc_put(ctx, prsrc);
269bbe5f 7746 kfree(prsrc);
65e19f54 7747 }
05589553 7748
05589553
XW
7749 percpu_ref_exit(&ref_node->refs);
7750 kfree(ref_node);
269bbe5f 7751 percpu_ref_put(&rsrc_data->refs);
2faf852d 7752}
65e19f54 7753
269bbe5f 7754static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
7755{
7756 struct io_ring_ctx *ctx;
7757 struct llist_node *node;
7758
269bbe5f
BM
7759 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7760 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
7761
7762 while (node) {
269bbe5f 7763 struct fixed_rsrc_ref_node *ref_node;
4a38aed2
JA
7764 struct llist_node *next = node->next;
7765
269bbe5f
BM
7766 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7767 __io_rsrc_put_work(ref_node);
4a38aed2
JA
7768 node = next;
7769 }
7770}
7771
ea64ec02
PB
7772static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7773 unsigned i)
2faf852d 7774{
ea64ec02
PB
7775 struct fixed_rsrc_table *table;
7776
7777 table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7778 return &table->files[i & IORING_FILE_TABLE_MASK];
7779}
7780
00835dce 7781static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
2faf852d 7782{
269bbe5f
BM
7783 struct fixed_rsrc_ref_node *ref_node;
7784 struct fixed_rsrc_data *data;
4a38aed2 7785 struct io_ring_ctx *ctx;
e297822b 7786 bool first_add = false;
4a38aed2 7787 int delay = HZ;
65e19f54 7788
269bbe5f
BM
7789 ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7790 data = ref_node->rsrc_data;
e297822b
PB
7791 ctx = data->ctx;
7792
2a63b2d9 7793 io_rsrc_ref_lock(ctx);
e297822b
PB
7794 ref_node->done = true;
7795
d67d2263
BM
7796 while (!list_empty(&ctx->rsrc_ref_list)) {
7797 ref_node = list_first_entry(&ctx->rsrc_ref_list,
269bbe5f 7798 struct fixed_rsrc_ref_node, node);
e297822b
PB
7799 /* recycle ref nodes in order */
7800 if (!ref_node->done)
7801 break;
7802 list_del(&ref_node->node);
269bbe5f 7803 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
e297822b 7804 }
2a63b2d9 7805 io_rsrc_ref_unlock(ctx);
05589553 7806
e297822b 7807 if (percpu_ref_is_dying(&data->refs))
4a38aed2 7808 delay = 0;
05589553 7809
4a38aed2 7810 if (!delay)
269bbe5f 7811 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
4a38aed2 7812 else if (first_add)
269bbe5f 7813 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
05f3fb3c 7814}
65e19f54 7815
6802535d 7816static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
05589553 7817 struct io_ring_ctx *ctx)
05f3fb3c 7818{
269bbe5f 7819 struct fixed_rsrc_ref_node *ref_node;
05f3fb3c 7820
05589553
XW
7821 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7822 if (!ref_node)
3e2224c5 7823 return NULL;
05f3fb3c 7824
00835dce 7825 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
05589553
XW
7826 0, GFP_KERNEL)) {
7827 kfree(ref_node);
3e2224c5 7828 return NULL;
05589553
XW
7829 }
7830 INIT_LIST_HEAD(&ref_node->node);
269bbe5f 7831 INIT_LIST_HEAD(&ref_node->rsrc_list);
e297822b 7832 ref_node->done = false;
05589553 7833 return ref_node;
05589553
XW
7834}
7835
bc9744cd
PB
7836static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
7837 struct fixed_rsrc_ref_node *ref_node)
6802535d 7838{
269bbe5f 7839 ref_node->rsrc_data = ctx->file_data;
50238531 7840 ref_node->rsrc_put = io_ring_file_put;
05589553
XW
7841}
7842
269bbe5f 7843static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
05589553
XW
7844{
7845 percpu_ref_exit(&ref_node->refs);
7846 kfree(ref_node);
65e19f54
JA
7847}
7848
ea64ec02 7849
6b06314c
JA
7850static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7851 unsigned nr_args)
7852{
7853 __s32 __user *fds = (__s32 __user *) arg;
600cf3f8 7854 unsigned nr_tables, i;
05f3fb3c 7855 struct file *file;
600cf3f8 7856 int fd, ret = -ENOMEM;
269bbe5f
BM
7857 struct fixed_rsrc_ref_node *ref_node;
7858 struct fixed_rsrc_data *file_data;
6b06314c 7859
05f3fb3c 7860 if (ctx->file_data)
6b06314c
JA
7861 return -EBUSY;
7862 if (!nr_args)
7863 return -EINVAL;
7864 if (nr_args > IORING_MAX_FIXED_FILES)
7865 return -EMFILE;
7866
1ad555c6 7867 file_data = alloc_fixed_rsrc_data(ctx);
5398ae69 7868 if (!file_data)
05f3fb3c 7869 return -ENOMEM;
13770a71 7870 ctx->file_data = file_data;
05f3fb3c 7871
65e19f54 7872 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
035fbafc 7873 file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
5398ae69 7874 GFP_KERNEL);
600cf3f8
PB
7875 if (!file_data->table)
7876 goto out_free;
05f3fb3c 7877
600cf3f8 7878 if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
1ad555c6 7879 goto out_free;
65e19f54 7880
08a45173 7881 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
600cf3f8
PB
7882 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7883 ret = -EFAULT;
7884 goto out_fput;
7885 }
08a45173 7886 /* allow sparse sets */
600cf3f8 7887 if (fd == -1)
08a45173 7888 continue;
6b06314c 7889
05f3fb3c 7890 file = fget(fd);
6b06314c 7891 ret = -EBADF;
05f3fb3c 7892 if (!file)
600cf3f8 7893 goto out_fput;
05f3fb3c 7894
6b06314c
JA
7895 /*
7896 * Don't allow io_uring instances to be registered. If UNIX
7897 * isn't enabled, then this causes a reference cycle and this
7898 * instance can never get freed. If UNIX is enabled we'll
7899 * handle it just fine, but there's still no point in allowing
7900 * a ring fd as it doesn't support regular read/write anyway.
7901 */
05f3fb3c
JA
7902 if (file->f_op == &io_uring_fops) {
7903 fput(file);
600cf3f8 7904 goto out_fput;
6b06314c 7905 }
ea64ec02 7906 *io_fixed_file_slot(file_data, i) = file;
6b06314c
JA
7907 }
7908
6b06314c 7909 ret = io_sqe_files_scm(ctx);
05589553 7910 if (ret) {
6b06314c 7911 io_sqe_files_unregister(ctx);
05589553
XW
7912 return ret;
7913 }
6b06314c 7914
bc9744cd 7915 ref_node = alloc_fixed_rsrc_ref_node(ctx);
3e2224c5 7916 if (!ref_node) {
05589553 7917 io_sqe_files_unregister(ctx);
3e2224c5 7918 return -ENOMEM;
05589553 7919 }
bc9744cd 7920 init_fixed_file_ref_node(ctx, ref_node);
05589553 7921
d67d2263 7922 io_sqe_rsrc_set_node(ctx, file_data, ref_node);
6b06314c 7923 return ret;
600cf3f8
PB
7924out_fput:
7925 for (i = 0; i < ctx->nr_user_files; i++) {
7926 file = io_file_from_index(ctx, i);
7927 if (file)
7928 fput(file);
7929 }
7930 for (i = 0; i < nr_tables; i++)
7931 kfree(file_data->table[i].files);
7932 ctx->nr_user_files = 0;
600cf3f8 7933out_free:
1ad555c6 7934 free_fixed_rsrc_data(ctx->file_data);
55cbc256 7935 ctx->file_data = NULL;
6b06314c
JA
7936 return ret;
7937}
7938
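io_sqe_files_register() services IORING_REGISTER_FILES from io_uring_register(2); an fd of -1 in the array registers a sparse slot. A hedged userspace sketch (raw syscall; assumes __NR_io_uring_register is defined):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_fixed_files(int ring_fd, const int *fds, unsigned nr)
{
	/* fds[i] == -1 leaves slot i sparse, to be filled by a later update */
	return (int)syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_FILES, fds, nr);
}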
c3a31e60
JA
7939static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
7940 int index)
7941{
7942#if defined(CONFIG_UNIX)
7943 struct sock *sock = ctx->ring_sock->sk;
7944 struct sk_buff_head *head = &sock->sk_receive_queue;
7945 struct sk_buff *skb;
7946
7947 /*
7948 * See if we can merge this file into an existing skb SCM_RIGHTS
7949 * file set. If there's no room, fall back to allocating a new skb
7950 * and filling it in.
7951 */
7952 spin_lock_irq(&head->lock);
7953 skb = skb_peek(head);
7954 if (skb) {
7955 struct scm_fp_list *fpl = UNIXCB(skb).fp;
7956
7957 if (fpl->count < SCM_MAX_FD) {
7958 __skb_unlink(skb, head);
7959 spin_unlock_irq(&head->lock);
7960 fpl->fp[fpl->count] = get_file(file);
7961 unix_inflight(fpl->user, fpl->fp[fpl->count]);
7962 fpl->count++;
7963 spin_lock_irq(&head->lock);
7964 __skb_queue_head(head, skb);
7965 } else {
7966 skb = NULL;
7967 }
7968 }
7969 spin_unlock_irq(&head->lock);
7970
7971 if (skb) {
7972 fput(file);
7973 return 0;
7974 }
7975
7976 return __io_sqe_files_scm(ctx, 1, index);
7977#else
7978 return 0;
7979#endif
7980}
7981
50238531 7982static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
05f3fb3c 7983{
269bbe5f
BM
7984 struct io_rsrc_put *prsrc;
7985 struct fixed_rsrc_ref_node *ref_node = data->node;
05f3fb3c 7986
269bbe5f
BM
7987 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7988 if (!prsrc)
a5318d3c 7989 return -ENOMEM;
05f3fb3c 7990
50238531 7991 prsrc->rsrc = rsrc;
269bbe5f 7992 list_add(&prsrc->list, &ref_node->rsrc_list);
05589553 7993
a5318d3c 7994 return 0;
05f3fb3c
JA
7995}
7996
269bbe5f
BM
7997static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
7998 struct file *file)
7999{
50238531 8000 return io_queue_rsrc_removal(data, (void *)file);
269bbe5f
BM
8001}
8002
05f3fb3c 8003static int __io_sqe_files_update(struct io_ring_ctx *ctx,
269bbe5f 8004 struct io_uring_rsrc_update *up,
05f3fb3c
JA
8005 unsigned nr_args)
8006{
269bbe5f
BM
8007 struct fixed_rsrc_data *data = ctx->file_data;
8008 struct fixed_rsrc_ref_node *ref_node;
ea64ec02 8009 struct file *file, **file_slot;
c3a31e60
JA
8010 __s32 __user *fds;
8011 int fd, i, err;
8012 __u32 done;
05589553 8013 bool needs_switch = false;
c3a31e60 8014
05f3fb3c 8015 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
8016 return -EOVERFLOW;
8017 if (done > ctx->nr_user_files)
8018 return -EINVAL;
8019
bc9744cd 8020 ref_node = alloc_fixed_rsrc_ref_node(ctx);
3e2224c5
MWO
8021 if (!ref_node)
8022 return -ENOMEM;
bc9744cd 8023 init_fixed_file_ref_node(ctx, ref_node);
05589553 8024
269bbe5f 8025 fds = u64_to_user_ptr(up->data);
67973b93 8026 for (done = 0; done < nr_args; done++) {
c3a31e60
JA
8027 err = 0;
8028 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
8029 err = -EFAULT;
8030 break;
8031 }
4e0377a1 8032 if (fd == IORING_REGISTER_FILES_SKIP)
8033 continue;
8034
67973b93 8035 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
ea64ec02
PB
8036 file_slot = io_fixed_file_slot(ctx->file_data, i);
8037
8038 if (*file_slot) {
8039 err = io_queue_file_removal(data, *file_slot);
a5318d3c
HD
8040 if (err)
8041 break;
ea64ec02 8042 *file_slot = NULL;
05589553 8043 needs_switch = true;
c3a31e60
JA
8044 }
8045 if (fd != -1) {
c3a31e60
JA
8046 file = fget(fd);
8047 if (!file) {
8048 err = -EBADF;
8049 break;
8050 }
8051 /*
8052 * Don't allow io_uring instances to be registered. If
8053 * UNIX isn't enabled, then this causes a reference
8054 * cycle and this instance can never get freed. If UNIX
8055 * is enabled we'll handle it just fine, but there's
8056 * still no point in allowing a ring fd as it doesn't
8057 * support regular read/write anyway.
8058 */
8059 if (file->f_op == &io_uring_fops) {
8060 fput(file);
8061 err = -EBADF;
8062 break;
8063 }
e68a3ff8 8064 *file_slot = file;
c3a31e60 8065 err = io_sqe_file_register(ctx, file, i);
f3bd9dae 8066 if (err) {
e68a3ff8 8067 *file_slot = NULL;
f3bd9dae 8068 fput(file);
c3a31e60 8069 break;
f3bd9dae 8070 }
c3a31e60 8071 }
05f3fb3c
JA
8072 }
8073
05589553 8074 if (needs_switch) {
b2e96852 8075 percpu_ref_kill(&data->node->refs);
d67d2263 8076 io_sqe_rsrc_set_node(ctx, data, ref_node);
05589553 8077 } else
269bbe5f 8078 destroy_fixed_rsrc_ref_node(ref_node);
c3a31e60
JA
8079
8080 return done ? done : err;
8081}
05589553 8082
05f3fb3c
JA
8083static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
8084 unsigned nr_args)
8085{
269bbe5f 8086 struct io_uring_rsrc_update up;
05f3fb3c
JA
8087
8088 if (!ctx->file_data)
8089 return -ENXIO;
8090 if (!nr_args)
8091 return -EINVAL;
8092 if (copy_from_user(&up, arg, sizeof(up)))
8093 return -EFAULT;
8094 if (up.resv)
8095 return -EINVAL;
8096
8097 return __io_sqe_files_update(ctx, &up, nr_args);
8098}
c3a31e60 8099
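The update path accepts -1 to clear a slot and IORING_REGISTER_FILES_SKIP to leave one untouched. A hedged sketch of the userspace call, assuming the struct io_uring_files_update layout (offset/resv/fds) from the UAPI header of this era:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static int update_fixed_files(int ring_fd, unsigned off, const int32_t *fds,
			      unsigned nr)
{
	struct io_uring_files_update up = {
		.offset	= off,				/* first slot to update */
		.fds	= (uint64_t)(uintptr_t)fds,	/* -1 clears a slot */
	};

	return (int)syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_FILES_UPDATE, &up, nr);
}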
5280f7e5 8100static struct io_wq_work *io_free_work(struct io_wq_work *work)
7d723065
JA
8101{
8102 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8103
5280f7e5
PB
8104 req = io_put_req_find_next(req);
8105 return req ? &req->work : NULL;
7d723065
JA
8106}
8107
24369c2e
PB
8108static int io_init_wq_offload(struct io_ring_ctx *ctx,
8109 struct io_uring_params *p)
8110{
8111 struct io_wq_data data;
8112 struct fd f;
8113 struct io_ring_ctx *ctx_attach;
8114 unsigned int concurrency;
8115 int ret = 0;
8116
8117 data.user = ctx->user;
e9fd9396 8118 data.free_work = io_free_work;
f5fa38c5 8119 data.do_work = io_wq_submit_work;
24369c2e
PB
8120
8121 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
8122 /* Do QD, or 4 * CPUS, whatever is smallest */
8123 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8124
8125 ctx->io_wq = io_wq_create(concurrency, &data);
8126 if (IS_ERR(ctx->io_wq)) {
8127 ret = PTR_ERR(ctx->io_wq);
8128 ctx->io_wq = NULL;
8129 }
8130 return ret;
8131 }
8132
8133 f = fdget(p->wq_fd);
8134 if (!f.file)
8135 return -EBADF;
8136
8137 if (f.file->f_op != &io_uring_fops) {
8138 ret = -EINVAL;
8139 goto out_fput;
8140 }
8141
8142 ctx_attach = f.file->private_data;
8143 /* @io_wq is protected by holding the fd */
8144 if (!io_wq_get(ctx_attach->io_wq, &data)) {
8145 ret = -EINVAL;
8146 goto out_fput;
8147 }
8148
8149 ctx->io_wq = ctx_attach->io_wq;
8150out_fput:
8151 fdput(f);
8152 return ret;
8153}
8154
0f212204
JA
8155static int io_uring_alloc_task_context(struct task_struct *task)
8156{
8157 struct io_uring_task *tctx;
d8a6df10 8158 int ret;
0f212204
JA
8159
8160 tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
8161 if (unlikely(!tctx))
8162 return -ENOMEM;
8163
d8a6df10
JA
8164 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8165 if (unlikely(ret)) {
8166 kfree(tctx);
8167 return ret;
8168 }
8169
0f212204
JA
8170 xa_init(&tctx->xa);
8171 init_waitqueue_head(&tctx->wait);
8172 tctx->last = NULL;
fdaf083c
JA
8173 atomic_set(&tctx->in_idle, 0);
8174 tctx->sqpoll = false;
500a373d
JA
8175 io_init_identity(&tctx->__identity);
8176 tctx->identity = &tctx->__identity;
0f212204 8177 task->io_uring = tctx;
7cbf1722
JA
8178 spin_lock_init(&tctx->task_lock);
8179 INIT_WQ_LIST(&tctx->task_list);
8180 tctx->task_state = 0;
8181 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
8182 return 0;
8183}
8184
8185void __io_uring_free(struct task_struct *tsk)
8186{
8187 struct io_uring_task *tctx = tsk->io_uring;
8188
8189 WARN_ON_ONCE(!xa_empty(&tctx->xa));
500a373d
JA
8190 WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
8191 if (tctx->identity != &tctx->__identity)
8192 kfree(tctx->identity);
d8a6df10 8193 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
8194 kfree(tctx);
8195 tsk->io_uring = NULL;
8196}
8197
7e84e1c7
SG
8198static int io_sq_offload_create(struct io_ring_ctx *ctx,
8199 struct io_uring_params *p)
2b188cc1
JA
8200{
8201 int ret;
8202
6c271ce2 8203 if (ctx->flags & IORING_SETUP_SQPOLL) {
534ca6d6
JA
8204 struct io_sq_data *sqd;
8205
3ec482d1 8206 ret = -EPERM;
ce59fc69 8207 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
3ec482d1
JA
8208 goto err;
8209
534ca6d6
JA
8210 sqd = io_get_sq_data(p);
8211 if (IS_ERR(sqd)) {
8212 ret = PTR_ERR(sqd);
8213 goto err;
8214 }
69fb2131 8215
534ca6d6 8216 ctx->sq_data = sqd;
69fb2131
JA
8217 io_sq_thread_park(sqd);
8218 mutex_lock(&sqd->ctx_lock);
8219 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
8220 mutex_unlock(&sqd->ctx_lock);
8221 io_sq_thread_unpark(sqd);
534ca6d6 8222
917257da
JA
8223 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8224 if (!ctx->sq_thread_idle)
8225 ctx->sq_thread_idle = HZ;
8226
aa06165d
JA
8227 if (sqd->thread)
8228 goto done;
8229
6c271ce2 8230 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 8231 int cpu = p->sq_thread_cpu;
6c271ce2 8232
917257da 8233 ret = -EINVAL;
44a9bd18
JA
8234 if (cpu >= nr_cpu_ids)
8235 goto err;
7889f44d 8236 if (!cpu_online(cpu))
917257da
JA
8237 goto err;
8238
69fb2131 8239 sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
534ca6d6 8240 cpu, "io_uring-sq");
6c271ce2 8241 } else {
69fb2131 8242 sqd->thread = kthread_create(io_sq_thread, sqd,
6c271ce2
JA
8243 "io_uring-sq");
8244 }
534ca6d6
JA
8245 if (IS_ERR(sqd->thread)) {
8246 ret = PTR_ERR(sqd->thread);
8247 sqd->thread = NULL;
6c271ce2
JA
8248 goto err;
8249 }
534ca6d6 8250 ret = io_uring_alloc_task_context(sqd->thread);
0f212204
JA
8251 if (ret)
8252 goto err;
6c271ce2
JA
8253 } else if (p->flags & IORING_SETUP_SQ_AFF) {
8254 /* Can't have SQ_AFF without SQPOLL */
8255 ret = -EINVAL;
8256 goto err;
8257 }
8258
aa06165d 8259done:
24369c2e
PB
8260 ret = io_init_wq_offload(ctx, p);
8261 if (ret)
2b188cc1 8262 goto err;
2b188cc1
JA
8263
8264 return 0;
8265err:
54a91f3b 8266 io_finish_async(ctx);
2b188cc1
JA
8267 return ret;
8268}
8269
7e84e1c7
SG
8270static void io_sq_offload_start(struct io_ring_ctx *ctx)
8271{
534ca6d6
JA
8272 struct io_sq_data *sqd = ctx->sq_data;
8273
8274 if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
8275 wake_up_process(sqd->thread);
7e84e1c7
SG
8276}
8277
a087e2b5
BM
8278static inline void __io_unaccount_mem(struct user_struct *user,
8279 unsigned long nr_pages)
2b188cc1
JA
8280{
8281 atomic_long_sub(nr_pages, &user->locked_vm);
8282}
8283
a087e2b5
BM
8284static inline int __io_account_mem(struct user_struct *user,
8285 unsigned long nr_pages)
2b188cc1
JA
8286{
8287 unsigned long page_limit, cur_pages, new_pages;
8288
8289 /* Don't allow more pages than we can safely lock */
8290 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8291
8292 do {
8293 cur_pages = atomic_long_read(&user->locked_vm);
8294 new_pages = cur_pages + nr_pages;
8295 if (new_pages > page_limit)
8296 return -ENOMEM;
8297 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8298 new_pages) != cur_pages);
8299
8300 return 0;
8301}
8302
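__io_account_mem() is a lock-free bump-with-limit: read the counter, refuse if the addition would exceed the limit, and retry the compare-exchange until it sticks. The same pattern as a standalone C11 sketch (illustrative, not kernel code):

#include <stdatomic.h>
#include <errno.h>

static int account_pages(_Atomic unsigned long *locked, unsigned long nr_pages,
			 unsigned long limit)
{
	unsigned long cur = atomic_load(locked);

	do {
		if (cur + nr_pages > limit)
			return -ENOMEM;
		/* on failure, 'cur' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(locked, &cur, cur + nr_pages));

	return 0;
}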
26bfa89e 8303static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8304{
aad5d8da 8305 if (ctx->limit_mem)
a087e2b5 8306 __io_unaccount_mem(ctx->user, nr_pages);
30975825 8307
26bfa89e
JA
8308 if (ctx->mm_account)
8309 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8310}
8311
26bfa89e 8312static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8313{
30975825
BM
8314 int ret;
8315
8316 if (ctx->limit_mem) {
8317 ret = __io_account_mem(ctx->user, nr_pages);
8318 if (ret)
8319 return ret;
8320 }
8321
26bfa89e
JA
8322 if (ctx->mm_account)
8323 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8324
8325 return 0;
8326}
8327
2b188cc1
JA
8328static void io_mem_free(void *ptr)
8329{
52e04ef4
MR
8330 struct page *page;
8331
8332 if (!ptr)
8333 return;
2b188cc1 8334
52e04ef4 8335 page = virt_to_head_page(ptr);
2b188cc1
JA
8336 if (put_page_testzero(page))
8337 free_compound_page(page);
8338}
8339
8340static void *io_mem_alloc(size_t size)
8341{
8342 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
26bfa89e 8343 __GFP_NORETRY | __GFP_ACCOUNT;
2b188cc1
JA
8344
8345 return (void *) __get_free_pages(gfp_flags, get_order(size));
8346}
8347
75b28aff
HV
8348static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8349 size_t *sq_offset)
8350{
8351 struct io_rings *rings;
8352 size_t off, sq_array_size;
8353
8354 off = struct_size(rings, cqes, cq_entries);
8355 if (off == SIZE_MAX)
8356 return SIZE_MAX;
8357
8358#ifdef CONFIG_SMP
8359 off = ALIGN(off, SMP_CACHE_BYTES);
8360 if (off == 0)
8361 return SIZE_MAX;
8362#endif
8363
b36200f5
DV
8364 if (sq_offset)
8365 *sq_offset = off;
8366
75b28aff
HV
8367 sq_array_size = array_size(sizeof(u32), sq_entries);
8368 if (sq_array_size == SIZE_MAX)
8369 return SIZE_MAX;
8370
8371 if (check_add_overflow(off, sq_array_size, &off))
8372 return SIZE_MAX;
8373
75b28aff
HV
8374 return off;
8375}
8376
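rings_size() sums the CQE array, cache-line padding, and the SQ index array, treating any overflow as SIZE_MAX. A hedged standalone sketch of the same overflow-checked arithmetic using compiler builtins (assumes 16-byte CQEs, 4-byte SQ indices, and a 64-byte cache line):

#include <stddef.h>
#include <stdint.h>

/* hdr stands in for sizeof(struct io_rings) up to the cqes[] array */
static size_t rings_bytes(size_t hdr, size_t sq_entries, size_t cq_entries)
{
	size_t off, sq_array;

	if (__builtin_mul_overflow(cq_entries, (size_t)16, &off) ||
	    __builtin_add_overflow(off, hdr, &off))
		return SIZE_MAX;

	off = (off + 63) & ~(size_t)63;		/* align to the cache line */

	if (__builtin_mul_overflow(sq_entries, sizeof(uint32_t), &sq_array) ||
	    __builtin_add_overflow(off, sq_array, &off))
		return SIZE_MAX;

	return off;
}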
0a96bbe4 8377static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee
JA
8378{
8379 int i, j;
8380
8381 if (!ctx->user_bufs)
8382 return -ENXIO;
8383
8384 for (i = 0; i < ctx->nr_user_bufs; i++) {
8385 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8386
8387 for (j = 0; j < imu->nr_bvecs; j++)
f1f6a7dd 8388 unpin_user_page(imu->bvec[j].bv_page);
edafccee 8389
de293938 8390 if (imu->acct_pages)
26bfa89e 8391 io_unaccount_mem(ctx, imu->acct_pages);
d4ef6475 8392 kvfree(imu->bvec);
edafccee
JA
8393 imu->nr_bvecs = 0;
8394 }
8395
8396 kfree(ctx->user_bufs);
8397 ctx->user_bufs = NULL;
8398 ctx->nr_user_bufs = 0;
8399 return 0;
8400}
8401
8402static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8403 void __user *arg, unsigned index)
8404{
8405 struct iovec __user *src;
8406
8407#ifdef CONFIG_COMPAT
8408 if (ctx->compat) {
8409 struct compat_iovec __user *ciovs;
8410 struct compat_iovec ciov;
8411
8412 ciovs = (struct compat_iovec __user *) arg;
8413 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8414 return -EFAULT;
8415
d55e5f5b 8416 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
8417 dst->iov_len = ciov.iov_len;
8418 return 0;
8419 }
8420#endif
8421 src = (struct iovec __user *) arg;
8422 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8423 return -EFAULT;
8424 return 0;
8425}
8426
de293938
JA
8427/*
 8428 * Not super efficient, but this only runs at registration time. And we do cache
8429 * the last compound head, so generally we'll only do a full search if we don't
8430 * match that one.
8431 *
8432 * We check if the given compound head page has already been accounted, to
8433 * avoid double accounting it. This allows us to account the full size of the
8434 * page, not just the constituent pages of a huge page.
8435 */
8436static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8437 int nr_pages, struct page *hpage)
8438{
8439 int i, j;
8440
8441 /* check current page array */
8442 for (i = 0; i < nr_pages; i++) {
8443 if (!PageCompound(pages[i]))
8444 continue;
8445 if (compound_head(pages[i]) == hpage)
8446 return true;
8447 }
8448
8449 /* check previously registered pages */
8450 for (i = 0; i < ctx->nr_user_bufs; i++) {
8451 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8452
8453 for (j = 0; j < imu->nr_bvecs; j++) {
8454 if (!PageCompound(imu->bvec[j].bv_page))
8455 continue;
8456 if (compound_head(imu->bvec[j].bv_page) == hpage)
8457 return true;
8458 }
8459 }
8460
8461 return false;
8462}
8463
8464static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8465 int nr_pages, struct io_mapped_ubuf *imu,
8466 struct page **last_hpage)
8467{
8468 int i, ret;
8469
8470 for (i = 0; i < nr_pages; i++) {
8471 if (!PageCompound(pages[i])) {
8472 imu->acct_pages++;
8473 } else {
8474 struct page *hpage;
8475
8476 hpage = compound_head(pages[i]);
8477 if (hpage == *last_hpage)
8478 continue;
8479 *last_hpage = hpage;
8480 if (headpage_already_acct(ctx, pages, i, hpage))
8481 continue;
8482 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8483 }
8484 }
8485
8486 if (!imu->acct_pages)
8487 return 0;
8488
26bfa89e 8489 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
8490 if (ret)
8491 imu->acct_pages = 0;
8492 return ret;
8493}
8494
0a96bbe4
BM
8495static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8496 struct io_mapped_ubuf *imu,
8497 struct page **last_hpage)
edafccee
JA
8498{
8499 struct vm_area_struct **vmas = NULL;
8500 struct page **pages = NULL;
0a96bbe4
BM
8501 unsigned long off, start, end, ubuf;
8502 size_t size;
8503 int ret, pret, nr_pages, i;
8504
8505 ubuf = (unsigned long) iov->iov_base;
8506 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8507 start = ubuf >> PAGE_SHIFT;
8508 nr_pages = end - start;
8509
8510 ret = -ENOMEM;
8511
8512 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8513 if (!pages)
8514 goto done;
8515
8516 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8517 GFP_KERNEL);
8518 if (!vmas)
8519 goto done;
edafccee 8520
0a96bbe4
BM
8521 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8522 GFP_KERNEL);
8523 if (!imu->bvec)
8524 goto done;
8525
8526 ret = 0;
8527 mmap_read_lock(current->mm);
8528 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8529 pages, vmas);
8530 if (pret == nr_pages) {
8531 /* don't support file backed memory */
8532 for (i = 0; i < nr_pages; i++) {
8533 struct vm_area_struct *vma = vmas[i];
8534
8535 if (vma->vm_file &&
8536 !is_file_hugepages(vma->vm_file)) {
8537 ret = -EOPNOTSUPP;
8538 break;
8539 }
8540 }
8541 } else {
8542 ret = pret < 0 ? pret : -EFAULT;
8543 }
8544 mmap_read_unlock(current->mm);
8545 if (ret) {
8546 /*
 8547 * if we did a partial map, or found file-backed vmas,
8548 * release any pages we did get
8549 */
8550 if (pret > 0)
8551 unpin_user_pages(pages, pret);
8552 kvfree(imu->bvec);
8553 goto done;
8554 }
8555
8556 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8557 if (ret) {
8558 unpin_user_pages(pages, pret);
8559 kvfree(imu->bvec);
8560 goto done;
8561 }
8562
8563 off = ubuf & ~PAGE_MASK;
8564 size = iov->iov_len;
8565 for (i = 0; i < nr_pages; i++) {
8566 size_t vec_len;
8567
8568 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8569 imu->bvec[i].bv_page = pages[i];
8570 imu->bvec[i].bv_len = vec_len;
8571 imu->bvec[i].bv_offset = off;
8572 off = 0;
8573 size -= vec_len;
8574 }
8575 /* store original address for later verification */
8576 imu->ubuf = ubuf;
8577 imu->len = iov->iov_len;
8578 imu->nr_bvecs = nr_pages;
8579 ret = 0;
8580done:
8581 kvfree(pages);
8582 kvfree(vmas);
8583 return ret;
8584}
8585
2b358604 8586static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 8587{
edafccee
JA
8588 if (ctx->user_bufs)
8589 return -EBUSY;
8590 if (!nr_args || nr_args > UIO_MAXIOV)
8591 return -EINVAL;
8592
8593 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8594 GFP_KERNEL);
8595 if (!ctx->user_bufs)
8596 return -ENOMEM;
8597
2b358604
BM
8598 return 0;
8599}
edafccee 8600
2b358604
BM
8601static int io_buffer_validate(struct iovec *iov)
8602{
8603 /*
8604 * Don't impose further limits on the size and buffer
 8605 * constraints here; we'll return -EINVAL later when IO is
8606 * submitted if they are wrong.
8607 */
8608 if (!iov->iov_base || !iov->iov_len)
8609 return -EFAULT;
edafccee 8610
2b358604
BM
8611 /* arbitrary limit, but we need something */
8612 if (iov->iov_len > SZ_1G)
8613 return -EFAULT;
edafccee 8614
2b358604
BM
8615 return 0;
8616}
edafccee 8617
2b358604
BM
8618static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8619 unsigned int nr_args)
8620{
8621 int i, ret;
8622 struct iovec iov;
8623 struct page *last_hpage = NULL;
edafccee 8624
2b358604
BM
8625 ret = io_buffers_map_alloc(ctx, nr_args);
8626 if (ret)
8627 return ret;
edafccee 8628
edafccee
JA
8629 for (i = 0; i < nr_args; i++) {
8630 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
edafccee 8631
edafccee
JA
8632 ret = io_copy_iov(ctx, &iov, arg, i);
8633 if (ret)
0a96bbe4 8634 break;
de293938 8635
2b358604
BM
8636 ret = io_buffer_validate(&iov);
8637 if (ret)
0a96bbe4 8638 break;
edafccee 8639
0a96bbe4
BM
8640 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8641 if (ret)
8642 break;
edafccee
JA
8643
8644 ctx->nr_user_bufs++;
8645 }
0a96bbe4
BM
8646
8647 if (ret)
8648 io_sqe_buffers_unregister(ctx);
8649
edafccee
JA
8650 return ret;
8651}
8652
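io_sqe_buffers_register() backs IORING_REGISTER_BUFFERS: userspace hands in an iovec array and each entry is pinned and turned into a bvec table. A hedged sketch of the registration call (raw syscall):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

static int register_buffers(int ring_fd, const struct iovec *iovs, unsigned nr)
{
	/* each iov_base/iov_len pair becomes one fixed buffer (1 GiB max) */
	return (int)syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_BUFFERS, iovs, nr);
}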
9b402849
JA
8653static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8654{
8655 __s32 __user *fds = arg;
8656 int fd;
8657
8658 if (ctx->cq_ev_fd)
8659 return -EBUSY;
8660
8661 if (copy_from_user(&fd, fds, sizeof(*fds)))
8662 return -EFAULT;
8663
8664 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8665 if (IS_ERR(ctx->cq_ev_fd)) {
8666 int ret = PTR_ERR(ctx->cq_ev_fd);
8667 ctx->cq_ev_fd = NULL;
8668 return ret;
8669 }
8670
8671 return 0;
8672}
8673
8674static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8675{
8676 if (ctx->cq_ev_fd) {
8677 eventfd_ctx_put(ctx->cq_ev_fd);
8678 ctx->cq_ev_fd = NULL;
8679 return 0;
8680 }
8681
8682 return -ENXIO;
8683}
8684
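io_eventfd_register() and io_eventfd_unregister() implement the IORING_REGISTER_EVENTFD and IORING_UNREGISTER_EVENTFD opcodes. A hedged sketch wiring an eventfd up for completion notifications:

#include <linux/io_uring.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* the kernel signals this eventfd as completions are posted */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;
}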
5a2e745d
JA
8685static int __io_destroy_buffers(int id, void *p, void *data)
8686{
8687 struct io_ring_ctx *ctx = data;
8688 struct io_buffer *buf = p;
8689
067524e9 8690 __io_remove_buffers(ctx, buf, id, -1U);
5a2e745d
JA
8691 return 0;
8692}
8693
8694static void io_destroy_buffers(struct io_ring_ctx *ctx)
8695{
8696 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8697 idr_destroy(&ctx->io_buffer_idr);
8698}
8699
68e68ee6 8700static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
1b4c351f 8701{
68e68ee6 8702 struct io_kiocb *req, *nxt;
1b4c351f 8703
68e68ee6
JA
8704 list_for_each_entry_safe(req, nxt, list, compl.list) {
8705 if (tsk && req->task != tsk)
8706 continue;
1b4c351f
JA
8707 list_del(&req->compl.list);
8708 kmem_cache_free(req_cachep, req);
8709 }
8710}
8711
9a4fdbd8 8712static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
2b188cc1 8713{
bf019da7
PB
8714 struct io_submit_state *submit_state = &ctx->submit_state;
8715
9a4fdbd8
JA
8716 mutex_lock(&ctx->uring_lock);
8717
8718 if (submit_state->free_reqs)
8719 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8720 submit_state->reqs);
8721
8722 io_req_cache_free(&submit_state->comp.free_list, NULL);
8723
8724 spin_lock_irq(&ctx->completion_lock);
8725 io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
8726 spin_unlock_irq(&ctx->completion_lock);
8727
8728 mutex_unlock(&ctx->uring_lock);
8729}
8730
2b188cc1
JA
8731static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8732{
04fc6c80
PB
8733 /*
 8734 * Some may still use the context even when all refs and requests have
 8735 * been put, and they are free to do so while holding uring_lock; see
8736 * __io_req_task_submit(). Wait for them to finish.
8737 */
8738 mutex_lock(&ctx->uring_lock);
8739 mutex_unlock(&ctx->uring_lock);
8740
6b06314c 8741 io_finish_async(ctx);
0a96bbe4 8742 io_sqe_buffers_unregister(ctx);
2aede0e4
JA
8743
8744 if (ctx->sqo_task) {
8745 put_task_struct(ctx->sqo_task);
8746 ctx->sqo_task = NULL;
8747 mmdrop(ctx->mm_account);
8748 ctx->mm_account = NULL;
30975825 8749 }
def596e9 8750
91d8f519
DZ
8751#ifdef CONFIG_BLK_CGROUP
8752 if (ctx->sqo_blkcg_css)
8753 css_put(ctx->sqo_blkcg_css);
8754#endif
8755
8bad28d8 8756 mutex_lock(&ctx->uring_lock);
6b06314c 8757 io_sqe_files_unregister(ctx);
8bad28d8 8758 mutex_unlock(&ctx->uring_lock);
9b402849 8759 io_eventfd_unregister(ctx);
5a2e745d 8760 io_destroy_buffers(ctx);
41726c9a 8761 idr_destroy(&ctx->personality_idr);
def596e9 8762
2b188cc1 8763#if defined(CONFIG_UNIX)
355e8d26
EB
8764 if (ctx->ring_sock) {
8765 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 8766 sock_release(ctx->ring_sock);
355e8d26 8767 }
2b188cc1
JA
8768#endif
8769
75b28aff 8770 io_mem_free(ctx->rings);
2b188cc1 8771 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
8772
8773 percpu_ref_exit(&ctx->refs);
2b188cc1 8774 free_uid(ctx->user);
181e448d 8775 put_cred(ctx->creds);
9a4fdbd8 8776 io_req_caches_free(ctx, NULL);
78076bb6 8777 kfree(ctx->cancel_hash);
2b188cc1
JA
8778 kfree(ctx);
8779}
8780
8781static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8782{
8783 struct io_ring_ctx *ctx = file->private_data;
8784 __poll_t mask = 0;
8785
8786 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
8787 /*
8788 * synchronizes with barrier from wq_has_sleeper call in
8789 * io_commit_cqring
8790 */
2b188cc1 8791 smp_rmb();
90554200 8792 if (!io_sqring_full(ctx))
2b188cc1 8793 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
8794
8795 /*
8796 * Don't flush cqring overflow list here, just do a simple check.
8797 * Otherwise there could possibly be an ABBA deadlock:
8798 * CPU0 CPU1
8799 * ---- ----
8800 * lock(&ctx->uring_lock);
8801 * lock(&ep->mtx);
8802 * lock(&ctx->uring_lock);
8803 * lock(&ep->mtx);
8804 *
8805 * Users may get EPOLLIN while seeing nothing in the cqring; this
8806 * pushes them to do the flush.
8807 */
8808 if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8809 mask |= EPOLLIN | EPOLLRDNORM;
8810
8811 return mask;
8812}
8813
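/*
 * Userspace view (an illustrative sketch, not kernel code): the ring fd can
 * be handed to poll/epoll like any other fd, which is what the handler
 * above implements.  EPOLLOUT means the SQ ring has room, EPOLLIN means
 * completions may be pending, subject to the overflow caveat documented in
 * io_uring_poll() above.  Assuming "ring_fd" holds an io_uring instance fd
 * and <sys/epoll.h> is included:
 *
 *	struct epoll_event ev = { .events = EPOLLIN };
 *	int epfd = epoll_create1(0);
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ring_fd, &ev);
 *	epoll_wait(epfd, &ev, 1, -1);
 *
 * epoll_wait() returning with EPOLLIN set only says CQEs may be available;
 * the application still reads the CQ ring (or enters the kernel) to reap.
 */
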
8814static int io_uring_fasync(int fd, struct file *file, int on)
8815{
8816 struct io_ring_ctx *ctx = file->private_data;
8817
8818 return fasync_helper(fd, file, on, &ctx->cq_fasync);
8819}
8820
0bead8cd 8821static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 8822{
1e6fa521 8823 struct io_identity *iod;
071698e1 8824
8825 iod = idr_remove(&ctx->personality_idr, id);
8826 if (iod) {
8827 put_cred(iod->creds);
8828 if (refcount_dec_and_test(&iod->count))
8829 kfree(iod);
0bead8cd 8830 return 0;
1e6fa521 8831 }
8832
8833 return -EINVAL;
8834}
8835
8836static int io_remove_personalities(int id, void *p, void *data)
8837{
8838 struct io_ring_ctx *ctx = data;
8839
8840 io_unregister_personality(ctx, id);
8841 return 0;
8842}
8843
8844static void io_run_ctx_fallback(struct io_ring_ctx *ctx)
8845{
8846 struct callback_head *work, *head, *next;
8847
8848 do {
8849 do {
8850 head = NULL;
8851 work = READ_ONCE(ctx->exit_task_work);
8852 } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
8853
8854 if (!work)
8855 break;
8856
8857 do {
8858 next = work->next;
8859 work->func(work);
8860 work = next;
8861 cond_resched();
8862 } while (work);
8863 } while (1);
8864}
8865
8866static void io_ring_exit_work(struct work_struct *work)
8867{
8868 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8869 exit_work);
85faa7b8 8870
8871 /*
8872 * If we're doing polled IO and end up having requests being
8873 * submitted async (out-of-line), then completions can come in while
8874 * we're waiting for refs to drop. We need to reap these manually,
8875 * as nobody else will be looking for them.
8876 */
b2edc0a7 8877 do {
9936c7c2 8878 io_uring_try_cancel_requests(ctx, NULL, NULL);
7c25c0d1 8879 io_run_ctx_fallback(ctx);
b2edc0a7 8880 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8881 io_ring_ctx_free(ctx);
8882}
8883
8884static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8885{
8886 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8887
8888 return req->ctx == data;
8889}
8890
8891static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8892{
8893 mutex_lock(&ctx->uring_lock);
8894 percpu_ref_kill(&ctx->refs);
8895
8896 if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
8897 ctx->sqo_dead = 1;
8898
8899 /* if force is set, the ring is going away. always drop after that */
8900 ctx->cq_overflow_flushed = 1;
634578f8 8901 if (ctx->rings)
6c503150 8902 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
5c766a90 8903 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
8904 mutex_unlock(&ctx->uring_lock);
8905
8906 io_kill_timeouts(ctx, NULL, NULL);
8907 io_poll_remove_all(ctx, NULL, NULL);
8908
8909 if (ctx->io_wq)
00c18640 8910 io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
561fb04a 8911
15dff286 8912 /* if we failed setting up the ctx, we might not have any rings */
b2edc0a7 8913 io_iopoll_try_reap_events(ctx);
309fc03a 8914
85faa7b8 8915 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8916 /*
8917 * Use system_unbound_wq to avoid spawning tons of event kworkers
8918 * if we're exiting a ton of rings at the same time. It just adds
8919 * noise and overhead; there's no discernible change in runtime
8920 * over using system_wq.
8921 */
8922 queue_work(system_unbound_wq, &ctx->exit_work);
8923}
8924
8925static int io_uring_release(struct inode *inode, struct file *file)
8926{
8927 struct io_ring_ctx *ctx = file->private_data;
8928
8929 file->private_data = NULL;
8930 io_ring_ctx_wait_and_kill(ctx);
8931 return 0;
8932}
8933
8934struct io_task_cancel {
8935 struct task_struct *task;
8936 struct files_struct *files;
8937};
f254ac04 8938
f6edbabb 8939static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 8940{
9a472ef7 8941 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 8942 struct io_task_cancel *cancel = data;
8943 bool ret;
8944
f6edbabb 8945 if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8946 unsigned long flags;
8947 struct io_ring_ctx *ctx = req->ctx;
8948
8949 /* protect against races with linked timeouts */
8950 spin_lock_irqsave(&ctx->completion_lock, flags);
f6edbabb 8951 ret = io_match_task(req, cancel->task, cancel->files);
8952 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8953 } else {
f6edbabb 8954 ret = io_match_task(req, cancel->task, cancel->files);
8955 }
8956 return ret;
8957}
8958
b7ddce3c 8959static void io_cancel_defer_files(struct io_ring_ctx *ctx,
ef9865a4 8960 struct task_struct *task,
8961 struct files_struct *files)
8962{
8963 struct io_defer_entry *de = NULL;
8964 LIST_HEAD(list);
8965
8966 spin_lock_irq(&ctx->completion_lock);
8967 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
08d23634 8968 if (io_match_task(de->req, task, files)) {
8969 list_cut_position(&list, &ctx->defer_list, &de->list);
8970 break;
8971 }
8972 }
8973 spin_unlock_irq(&ctx->completion_lock);
8974
8975 while (!list_empty(&list)) {
8976 de = list_first_entry(&list, struct io_defer_entry, list);
8977 list_del_init(&de->list);
8978 req_set_fail_links(de->req);
8979 io_put_req(de->req);
8980 io_req_complete(de->req, -ECANCELED);
8981 kfree(de);
8982 }
8983}
8984
8985static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8986 struct task_struct *task,
8987 struct files_struct *files)
8988{
8989 struct io_task_cancel cancel = { .task = task, .files = files, };
8990
8991 while (1) {
8992 enum io_wq_cancel cret;
8993 bool ret = false;
8994
8995 if (ctx->io_wq) {
8996 cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
8997 &cancel, true);
8998 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8999 }
9000
9001 /* SQPOLL thread does its own polling */
9002 if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
9003 while (!list_empty_careful(&ctx->iopoll_list)) {
9004 io_iopoll_try_reap_events(ctx);
9005 ret = true;
9006 }
9007 }
9008
9009 ret |= io_poll_remove_all(ctx, task, files);
9010 ret |= io_kill_timeouts(ctx, task, files);
9011 ret |= io_run_task_work();
9012 io_cqring_overflow_flush(ctx, true, task, files);
9013 if (!ret)
9014 break;
9015 cond_resched();
9016 }
9017}
9018
9019static int io_uring_count_inflight(struct io_ring_ctx *ctx,
9020 struct task_struct *task,
9021 struct files_struct *files)
9022{
9023 struct io_kiocb *req;
9024 int cnt = 0;
9025
9026 spin_lock_irq(&ctx->inflight_lock);
9027 list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
9028 cnt += io_match_task(req, task, files);
9029 spin_unlock_irq(&ctx->inflight_lock);
9030 return cnt;
9031}
9032
b52fda00 9033static void io_uring_cancel_files(struct io_ring_ctx *ctx,
df9923f9 9034 struct task_struct *task,
9035 struct files_struct *files)
9036{
fcb323cc 9037 while (!list_empty_careful(&ctx->inflight_list)) {
d8f1b971 9038 DEFINE_WAIT(wait);
ca70f00b 9039 int inflight;
fcb323cc 9040
9041 inflight = io_uring_count_inflight(ctx, task, files);
9042 if (!inflight)
fcb323cc 9043 break;
f6edbabb 9044
9936c7c2 9045 io_uring_try_cancel_requests(ctx, task, files);
ca70f00b 9046
9047 if (ctx->sq_data)
9048 io_sq_thread_unpark(ctx->sq_data);
9049 prepare_to_wait(&task->io_uring->wait, &wait,
9050 TASK_UNINTERRUPTIBLE);
9051 if (inflight == io_uring_count_inflight(ctx, task, files))
9052 schedule();
c98de08c 9053 finish_wait(&task->io_uring->wait, &wait);
9054 if (ctx->sq_data)
9055 io_sq_thread_park(ctx->sq_data);
0f212204 9056 }
9057}
9058
9059static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
9060{
9061 mutex_lock(&ctx->uring_lock);
9062 ctx->sqo_dead = 1;
9063 mutex_unlock(&ctx->uring_lock);
9064
9065 /* make sure callers enter the ring to get error */
9066 if (ctx->rings)
9067 io_ring_set_wakeup_flag(ctx);
9068}
9069
9070/*
9071 * We need to iteratively cancel requests, in case a request has dependent
9072 * hard links. These persist even when cancelation fails, hence keep
9073 * looping until none are found.
9074 */
9075static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
9076 struct files_struct *files)
9077{
9078 struct task_struct *task = current;
9079
fdaf083c 9080 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
d9d05217 9081 io_disable_sqo_submit(ctx);
534ca6d6 9082 task = ctx->sq_data->thread;
9083 atomic_inc(&task->io_uring->in_idle);
9084 io_sq_thread_park(ctx->sq_data);
9085 }
0f212204 9086
df9923f9 9087 io_cancel_defer_files(ctx, task, files);
0f212204 9088
3a7efd1a 9089 io_uring_cancel_files(ctx, task, files);
b52fda00 9090 if (!files)
9936c7c2 9091 io_uring_try_cancel_requests(ctx, task, NULL);
9092
9093 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
9094 atomic_dec(&task->io_uring->in_idle);
9095 io_sq_thread_unpark(ctx->sq_data);
9096 }
9097}
9098
9099/*
9100 * Note that this task has used io_uring. We use it for cancelation purposes.
9101 */
fdaf083c 9102static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
0f212204 9103{
236434c3 9104 struct io_uring_task *tctx = current->io_uring;
a528b04e 9105 int ret;
9106
9107 if (unlikely(!tctx)) {
9108 ret = io_uring_alloc_task_context(current);
9109 if (unlikely(ret))
9110 return ret;
236434c3 9111 tctx = current->io_uring;
0f212204 9112 }
9113 if (tctx->last != file) {
9114 void *old = xa_load(&tctx->xa, (unsigned long)file);
0f212204 9115
236434c3 9116 if (!old) {
0f212204 9117 get_file(file);
9118 ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
9119 file, GFP_KERNEL));
9120 if (ret) {
9121 fput(file);
9122 return ret;
9123 }
9124
9125 /* one and only SQPOLL file note, held by sqo_task */
9126 WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
9127 current != ctx->sqo_task);
0f212204 9128 }
236434c3 9129 tctx->last = file;
9130 }
9131
9132 /*
9133 * This is race safe in that the task itself is doing this, hence it
9134 * cannot be going through the exit/cancel paths at the same time.
9135 * This cannot be modified while exit/cancel is running.
9136 */
9137 if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
9138 tctx->sqpoll = true;
9139
9140 return 0;
9141}
9142
9143/*
9144 * Remove this io_uring_file -> task mapping.
9145 */
9146static void io_uring_del_task_file(struct file *file)
9147{
9148 struct io_uring_task *tctx = current->io_uring;
9149
9150 if (tctx->last == file)
9151 tctx->last = NULL;
5e2ed8c4 9152 file = xa_erase(&tctx->xa, (unsigned long)file);
9153 if (file)
9154 fput(file);
9155}
9156
9157static void io_uring_remove_task_files(struct io_uring_task *tctx)
9158{
9159 struct file *file;
9160 unsigned long index;
9161
9162 xa_for_each(&tctx->xa, index, file)
9163 io_uring_del_task_file(file);
9164}
9165
9166void __io_uring_files_cancel(struct files_struct *files)
9167{
9168 struct io_uring_task *tctx = current->io_uring;
9169 struct file *file;
9170 unsigned long index;
9171
9172 /* make sure overflow events are dropped */
fdaf083c 9173 atomic_inc(&tctx->in_idle);
9174 xa_for_each(&tctx->xa, index, file)
9175 io_uring_cancel_task_requests(file->private_data, files);
fdaf083c 9176 atomic_dec(&tctx->in_idle);
9177
9178 if (files)
9179 io_uring_remove_task_files(tctx);
9180}
9181
9182static s64 tctx_inflight(struct io_uring_task *tctx)
9183{
9184 return percpu_counter_sum(&tctx->inflight);
9185}
fdaf083c 9186
9187static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
9188{
9189 struct io_uring_task *tctx;
9190 s64 inflight;
9191 DEFINE_WAIT(wait);
fdaf083c 9192
9193 if (!ctx->sq_data)
9194 return;
9195 tctx = ctx->sq_data->thread->io_uring;
9196 io_disable_sqo_submit(ctx);
fdaf083c 9197
9198 atomic_inc(&tctx->in_idle);
9199 do {
9200 /* read completions before cancelations */
9201 inflight = tctx_inflight(tctx);
9202 if (!inflight)
9203 break;
9204 io_uring_cancel_task_requests(ctx, NULL);
fdaf083c 9205
9206 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9207 /*
9208 * If we've seen completions, retry without waiting. This
9209 * avoids a race where a completion comes in before we did
9210 * prepare_to_wait().
9211 */
9212 if (inflight == tctx_inflight(tctx))
9213 schedule();
9214 finish_wait(&tctx->wait, &wait);
9215 } while (1);
9216 atomic_dec(&tctx->in_idle);
9217}
9218
9219/*
9220 * Find any io_uring fd that this task has registered or done IO on, and cancel
9221 * requests.
9222 */
9223void __io_uring_task_cancel(void)
9224{
9225 struct io_uring_task *tctx = current->io_uring;
9226 DEFINE_WAIT(wait);
d8a6df10 9227 s64 inflight;
9228
9229 /* make sure overflow events are dropped */
fdaf083c 9230 atomic_inc(&tctx->in_idle);
0f212204 9231
0b5cd6c3 9232 /* trigger io_disable_sqo_submit() */
9233 if (tctx->sqpoll) {
9234 struct file *file;
9235 unsigned long index;
9236
9237 xa_for_each(&tctx->xa, index, file)
9238 io_uring_cancel_sqpoll(file->private_data);
9239 }
0b5cd6c3 9240
d8a6df10 9241 do {
0f212204 9242 /* read completions before cancelations */
fdaf083c 9243 inflight = tctx_inflight(tctx);
9244 if (!inflight)
9245 break;
9246 __io_uring_files_cancel(NULL);
9247
9248 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9249
9250 /*
9251 * If we've seen completions, retry without waiting. This
9252 * avoids a race where a completion comes in before we did
9253 * prepare_to_wait().
0f212204 9254 */
9255 if (inflight == tctx_inflight(tctx))
9256 schedule();
f57555ed 9257 finish_wait(&tctx->wait, &wait);
d8a6df10 9258 } while (1);
0f212204 9259
fdaf083c 9260 atomic_dec(&tctx->in_idle);
9261
9262 io_uring_remove_task_files(tctx);
9263}
9264
9265static int io_uring_flush(struct file *file, void *data)
9266{
6b5733eb 9267 struct io_uring_task *tctx = current->io_uring;
d9d05217 9268 struct io_ring_ctx *ctx = file->private_data;
6b5733eb 9269
41be53e9 9270 if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
84965ff8 9271 io_uring_cancel_task_requests(ctx, NULL);
9272 io_req_caches_free(ctx, current);
9273 }
84965ff8 9274
9275 io_run_ctx_fallback(ctx);
9276
6b5733eb 9277 if (!tctx)
9278 return 0;
9279
9280 /* we should have cancelled and erased it before PF_EXITING */
9281 WARN_ON_ONCE((current->flags & PF_EXITING) &&
9282 xa_load(&tctx->xa, (unsigned long)file));
9283
9284 /*
9285 * fput() is pending; the count will be 2 if the only other ref is our potential
9286 * task file note. If the task is exiting, drop regardless of count.
9287 */
9288 if (atomic_long_read(&file->f_count) != 2)
9289 return 0;
4f793dc4 9290
9291 if (ctx->flags & IORING_SETUP_SQPOLL) {
9292 /* there is only one file note, which is owned by sqo_task */
9293 WARN_ON_ONCE(ctx->sqo_task != current &&
9294 xa_load(&tctx->xa, (unsigned long)file));
9295 /* sqo_dead check is for when this happens after cancellation */
9296 WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
9297 !xa_load(&tctx->xa, (unsigned long)file));
9298
9299 io_disable_sqo_submit(ctx);
9300 }
9301
9302 if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
9303 io_uring_del_task_file(file);
9304 return 0;
9305}
9306
9307static void *io_uring_validate_mmap_request(struct file *file,
9308 loff_t pgoff, size_t sz)
2b188cc1 9309{
2b188cc1 9310 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 9311 loff_t offset = pgoff << PAGE_SHIFT;
9312 struct page *page;
9313 void *ptr;
9314
9315 switch (offset) {
9316 case IORING_OFF_SQ_RING:
9317 case IORING_OFF_CQ_RING:
9318 ptr = ctx->rings;
9319 break;
9320 case IORING_OFF_SQES:
9321 ptr = ctx->sq_sqes;
9322 break;
2b188cc1 9323 default:
6c5c240e 9324 return ERR_PTR(-EINVAL);
9325 }
9326
9327 page = virt_to_head_page(ptr);
a50b854e 9328 if (sz > page_size(page))
9329 return ERR_PTR(-EINVAL);
9330
9331 return ptr;
9332}
9333
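/*
 * Userspace mapping sketch (illustrative, not kernel code): the offsets
 * validated above are how an application maps the rings once
 * io_uring_setup() has filled in struct io_uring_params "p" for the ring
 * fd "ring_fd" (both assumed here, <sys/mman.h> and <linux/io_uring.h>
 * included):
 *
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd,
 *			     IORING_OFF_SQ_RING);
 *	void *sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *			  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			  ring_fd, IORING_OFF_SQES);
 *
 * With IORING_FEAT_SINGLE_MMAP the CQ ring lives in the same mapping as
 * the SQ ring; otherwise it is mapped separately at IORING_OFF_CQ_RING,
 * sized as p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe).
 */
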
9334#ifdef CONFIG_MMU
9335
9336static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9337{
9338 size_t sz = vma->vm_end - vma->vm_start;
9339 unsigned long pfn;
9340 void *ptr;
9341
9342 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9343 if (IS_ERR(ptr))
9344 return PTR_ERR(ptr);
9345
9346 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9347 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9348}
9349
9350#else /* !CONFIG_MMU */
9351
9352static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9353{
9354 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9355}
9356
9357static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9358{
9359 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9360}
9361
9362static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9363 unsigned long addr, unsigned long len,
9364 unsigned long pgoff, unsigned long flags)
9365{
9366 void *ptr;
9367
9368 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9369 if (IS_ERR(ptr))
9370 return PTR_ERR(ptr);
9371
9372 return (unsigned long) ptr;
9373}
9374
9375#endif /* !CONFIG_MMU */
9376
d9d05217 9377static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200 9378{
d9d05217 9379 int ret = 0;
9380 DEFINE_WAIT(wait);
9381
9382 do {
9383 if (!io_sqring_full(ctx))
9384 break;
9385
9386 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9387
9388 if (unlikely(ctx->sqo_dead)) {
9389 ret = -EOWNERDEAD;
9390 goto out;
9391 }
9392
90554200
JA
9393 if (!io_sqring_full(ctx))
9394 break;
9395
9396 schedule();
9397 } while (!signal_pending(current));
9398
9399 finish_wait(&ctx->sqo_sq_wait, &wait);
9400out:
9401 return ret;
9402}
9403
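/*
 * SQPOLL interaction from userspace (a sketch, not kernel code): when the
 * SQ thread goes idle it sets IORING_SQ_NEED_WAKEUP in the flags word that
 * the application mapped at p.sq_off.flags, and the application must then
 * call io_uring_enter() with IORING_ENTER_SQ_WAKEUP, as described in the
 * barrier comment at the top of this file.  Assuming "sq_flags" points at
 * that word and "ring_fd" is the ring fd, roughly:
 *
 *	if (*(volatile unsigned *)sq_flags & IORING_SQ_NEED_WAKEUP)
 *		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
 *			IORING_ENTER_SQ_WAKEUP, NULL, 0);
 */
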
9404static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9405 struct __kernel_timespec __user **ts,
9406 const sigset_t __user **sig)
9407{
9408 struct io_uring_getevents_arg arg;
9409
9410 /*
9411 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9412 * is just a pointer to the sigset_t.
9413 */
9414 if (!(flags & IORING_ENTER_EXT_ARG)) {
9415 *sig = (const sigset_t __user *) argp;
9416 *ts = NULL;
9417 return 0;
9418 }
9419
9420 /*
9421 * EXT_ARG is set - ensure we agree on the size of it and copy in our
9422 * timespec and sigset_t pointers if good.
9423 */
9424 if (*argsz != sizeof(arg))
9425 return -EINVAL;
9426 if (copy_from_user(&arg, argp, sizeof(arg)))
9427 return -EFAULT;
9428 *sig = u64_to_user_ptr(arg.sigmask);
9429 *argsz = arg.sigmask_sz;
9430 *ts = u64_to_user_ptr(arg.ts);
9431 return 0;
9432}
9433
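/*
 * Userspace counterpart (an illustrative sketch, not kernel code, assuming
 * <linux/io_uring.h> and <sys/syscall.h> are included and "ring_fd" is a
 * ring fd): with IORING_ENTER_EXT_ARG the final two io_uring_enter()
 * arguments describe a struct io_uring_getevents_arg rather than a bare
 * sigset_t, which is how a wait timeout is passed:
 *
 *	struct __kernel_timespec ts = { .tv_sec = 1 };
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= 0,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (__u64)(unsigned long)&ts,
 *	};
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
 *		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		&arg, sizeof(arg));
 */
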
2b188cc1 9434SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9435 u32, min_complete, u32, flags, const void __user *, argp,
9436 size_t, argsz)
9437{
9438 struct io_ring_ctx *ctx;
9439 long ret = -EBADF;
9440 int submitted = 0;
9441 struct fd f;
9442
4c6e277c 9443 io_run_task_work();
b41e9852 9444
90554200 9445 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
c73ebb68 9446 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9447 return -EINVAL;
9448
9449 f = fdget(fd);
9450 if (!f.file)
9451 return -EBADF;
9452
9453 ret = -EOPNOTSUPP;
9454 if (f.file->f_op != &io_uring_fops)
9455 goto out_fput;
9456
9457 ret = -ENXIO;
9458 ctx = f.file->private_data;
9459 if (!percpu_ref_tryget(&ctx->refs))
9460 goto out_fput;
9461
9462 ret = -EBADFD;
9463 if (ctx->flags & IORING_SETUP_R_DISABLED)
9464 goto out;
9465
9466 /*
9467 * For SQ polling, the thread will do all submissions and completions.
9468 * Just return the requested submit count, and wake the thread if
9469 * we were asked to.
9470 */
b2a9eada 9471 ret = 0;
6c271ce2 9472 if (ctx->flags & IORING_SETUP_SQPOLL) {
6c503150 9473 io_cqring_overflow_flush(ctx, false, NULL, NULL);
89448c47 9474
9475 ret = -EOWNERDEAD;
9476 if (unlikely(ctx->sqo_dead))
9477 goto out;
6c271ce2 9478 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 9479 wake_up(&ctx->sq_data->wait);
9480 if (flags & IORING_ENTER_SQ_WAIT) {
9481 ret = io_sqpoll_wait_sq(ctx);
9482 if (ret)
9483 goto out;
9484 }
6c271ce2 9485 submitted = to_submit;
b2a9eada 9486 } else if (to_submit) {
fdaf083c 9487 ret = io_uring_add_task_file(ctx, f.file);
9488 if (unlikely(ret))
9489 goto out;
2b188cc1 9490 mutex_lock(&ctx->uring_lock);
0f212204 9491 submitted = io_submit_sqes(ctx, to_submit);
2b188cc1 9492 mutex_unlock(&ctx->uring_lock);
9493
9494 if (submitted != to_submit)
9495 goto out;
9496 }
9497 if (flags & IORING_ENTER_GETEVENTS) {
9498 const sigset_t __user *sig;
9499 struct __kernel_timespec __user *ts;
9500
9501 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9502 if (unlikely(ret))
9503 goto out;
9504
9505 min_complete = min(min_complete, ctx->cq_entries);
9506
9507 /*
9508 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9509 * space applications don't need to do io completion events
9510 * polling again; they can rely on io_sq_thread to do the polling
9511 * work, which can reduce cpu usage and uring_lock contention.
9512 */
9513 if (ctx->flags & IORING_SETUP_IOPOLL &&
9514 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7668b92a 9515 ret = io_iopoll_check(ctx, min_complete);
def596e9 9516 } else {
c73ebb68 9517 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 9518 }
9519 }
9520
7c504e65 9521out:
6805b32e 9522 percpu_ref_put(&ctx->refs);
9523out_fput:
9524 fdput(f);
9525 return submitted ? submitted : ret;
9526}
9527
bebdb65e 9528#ifdef CONFIG_PROC_FS
9529static int io_uring_show_cred(int id, void *p, void *data)
9530{
9531 struct io_identity *iod = p;
9532 const struct cred *cred = iod->creds;
9533 struct seq_file *m = data;
9534 struct user_namespace *uns = seq_user_ns(m);
9535 struct group_info *gi;
9536 kernel_cap_t cap;
9537 unsigned __capi;
9538 int g;
9539
9540 seq_printf(m, "%5d\n", id);
9541 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9542 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9543 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9544 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9545 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9546 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9547 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9548 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9549 seq_puts(m, "\n\tGroups:\t");
9550 gi = cred->group_info;
9551 for (g = 0; g < gi->ngroups; g++) {
9552 seq_put_decimal_ull(m, g ? " " : "",
9553 from_kgid_munged(uns, gi->gid[g]));
9554 }
9555 seq_puts(m, "\n\tCapEff:\t");
9556 cap = cred->cap_effective;
9557 CAP_FOR_EACH_U32(__capi)
9558 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9559 seq_putc(m, '\n');
9560 return 0;
9561}
9562
9563static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9564{
dbbe9c64 9565 struct io_sq_data *sq = NULL;
fad8e0de 9566 bool has_lock;
9567 int i;
9568
9569 /*
9570 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9571 * since the fdinfo case grabs it in the opposite direction of normal use
9572 * cases. If we fail to get the lock, we just don't iterate any
9573 * structures that could be going away outside the io_uring mutex.
9574 */
9575 has_lock = mutex_trylock(&ctx->uring_lock);
9576
9577 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
9578 sq = ctx->sq_data;
9579
9580 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9581 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 9582 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 9583 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
ea64ec02 9584 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
87ce955b 9585
9586 if (f)
9587 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9588 else
9589 seq_printf(m, "%5u: <none>\n", i);
9590 }
9591 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 9592 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9593 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9594
9595 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9596 (unsigned int) buf->len);
9597 }
fad8e0de 9598 if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
9599 seq_printf(m, "Personalities:\n");
9600 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9601 }
9602 seq_printf(m, "PollList:\n");
9603 spin_lock_irq(&ctx->completion_lock);
9604 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9605 struct hlist_head *list = &ctx->cancel_hash[i];
9606 struct io_kiocb *req;
9607
9608 hlist_for_each_entry(req, list, hash_node)
9609 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
9610 req->task->task_works != NULL);
9611 }
9612 spin_unlock_irq(&ctx->completion_lock);
9613 if (has_lock)
9614 mutex_unlock(&ctx->uring_lock);
9615}
9616
9617static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9618{
9619 struct io_ring_ctx *ctx = f->private_data;
9620
9621 if (percpu_ref_tryget(&ctx->refs)) {
9622 __io_uring_show_fdinfo(ctx, m);
9623 percpu_ref_put(&ctx->refs);
9624 }
9625}
bebdb65e 9626#endif
87ce955b 9627
9628static const struct file_operations io_uring_fops = {
9629 .release = io_uring_release,
fcb323cc 9630 .flush = io_uring_flush,
2b188cc1 9631 .mmap = io_uring_mmap,
9632#ifndef CONFIG_MMU
9633 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9634 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9635#endif
9636 .poll = io_uring_poll,
9637 .fasync = io_uring_fasync,
bebdb65e 9638#ifdef CONFIG_PROC_FS
87ce955b 9639 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 9640#endif
9641};
9642
9643static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9644 struct io_uring_params *p)
9645{
9646 struct io_rings *rings;
9647 size_t size, sq_array_offset;
2b188cc1 9648
9649 /* make sure these are sane, as we already accounted them */
9650 ctx->sq_entries = p->sq_entries;
9651 ctx->cq_entries = p->cq_entries;
9652
9653 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9654 if (size == SIZE_MAX)
9655 return -EOVERFLOW;
9656
9657 rings = io_mem_alloc(size);
9658 if (!rings)
9659 return -ENOMEM;
9660
9661 ctx->rings = rings;
9662 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9663 rings->sq_ring_mask = p->sq_entries - 1;
9664 rings->cq_ring_mask = p->cq_entries - 1;
9665 rings->sq_ring_entries = p->sq_entries;
9666 rings->cq_ring_entries = p->cq_entries;
9667 ctx->sq_mask = rings->sq_ring_mask;
9668 ctx->cq_mask = rings->cq_ring_mask;
9669
9670 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9671 if (size == SIZE_MAX) {
9672 io_mem_free(ctx->rings);
9673 ctx->rings = NULL;
2b188cc1 9674 return -EOVERFLOW;
eb065d30 9675 }
9676
9677 ctx->sq_sqes = io_mem_alloc(size);
9678 if (!ctx->sq_sqes) {
9679 io_mem_free(ctx->rings);
9680 ctx->rings = NULL;
2b188cc1 9681 return -ENOMEM;
eb065d30 9682 }
2b188cc1 9683
9684 return 0;
9685}
9686
9687static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9688{
9689 int ret, fd;
9690
9691 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9692 if (fd < 0)
9693 return fd;
9694
9695 ret = io_uring_add_task_file(ctx, file);
9696 if (ret) {
9697 put_unused_fd(fd);
9698 return ret;
9699 }
9700 fd_install(fd, file);
9701 return fd;
9702}
9703
9704/*
9705 * Allocate an anonymous fd, this is what constitutes the application
9706 * visible backing of an io_uring instance. The application mmaps this
9707 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9708 * we have to tie this fd to a socket for file garbage collection purposes.
9709 */
9faadcc8 9710static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9711{
9712 struct file *file;
9faadcc8 9713#if defined(CONFIG_UNIX)
9714 int ret;
9715
9716 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9717 &ctx->ring_sock);
9718 if (ret)
9faadcc8 9719 return ERR_PTR(ret);
9720#endif
9721
9722 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9723 O_RDWR | O_CLOEXEC);
2b188cc1 9724#if defined(CONFIG_UNIX)
9725 if (IS_ERR(file)) {
9726 sock_release(ctx->ring_sock);
9727 ctx->ring_sock = NULL;
9728 } else {
9729 ctx->ring_sock->file = file;
0f212204 9730 }
2b188cc1 9731#endif
9faadcc8 9732 return file;
9733}
9734
9735static int io_uring_create(unsigned entries, struct io_uring_params *p,
9736 struct io_uring_params __user *params)
9737{
9738 struct user_struct *user = NULL;
9739 struct io_ring_ctx *ctx;
9faadcc8 9740 struct file *file;
2b188cc1
JA
9741 int ret;
9742
8110c1a6 9743 if (!entries)
2b188cc1 9744 return -EINVAL;
9745 if (entries > IORING_MAX_ENTRIES) {
9746 if (!(p->flags & IORING_SETUP_CLAMP))
9747 return -EINVAL;
9748 entries = IORING_MAX_ENTRIES;
9749 }
9750
9751 /*
9752 * Use twice as many entries for the CQ ring. It's possible for the
9753 * application to drive a higher depth than the size of the SQ ring,
9754 * since the sqes are only used at submission time. This allows for
9755 * some flexibility in overcommitting a bit. If the application has
9756 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9757 * of CQ ring entries manually.
9758 */
9759 p->sq_entries = roundup_pow_of_two(entries);
9760 if (p->flags & IORING_SETUP_CQSIZE) {
9761 /*
9762 * If IORING_SETUP_CQSIZE is set, we do the same roundup
9763 * to a power-of-two, if it isn't already. We do NOT impose
9764 * any cq vs sq ring sizing.
9765 */
eb2667b3 9766 if (!p->cq_entries)
33a107f0 9767 return -EINVAL;
9768 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9769 if (!(p->flags & IORING_SETUP_CLAMP))
9770 return -EINVAL;
9771 p->cq_entries = IORING_MAX_CQ_ENTRIES;
9772 }
9773 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9774 if (p->cq_entries < p->sq_entries)
9775 return -EINVAL;
9776 } else {
9777 p->cq_entries = 2 * p->sq_entries;
9778 }
9779
9780 user = get_uid(current_user());
9781
9782 ctx = io_ring_ctx_alloc(p);
9783 if (!ctx) {
9784 free_uid(user);
9785 return -ENOMEM;
9786 }
9787 ctx->compat = in_compat_syscall();
26bfa89e 9788 ctx->limit_mem = !capable(CAP_IPC_LOCK);
2b188cc1 9789 ctx->user = user;
0b8c0ec7 9790 ctx->creds = get_current_cred();
9791#ifdef CONFIG_AUDIT
9792 ctx->loginuid = current->loginuid;
9793 ctx->sessionid = current->sessionid;
9794#endif
9795 ctx->sqo_task = get_task_struct(current);
9796
9797 /*
9798 * This is just grabbed for accounting purposes. When a process exits,
9799 * the mm is exited and dropped before the files, hence we need to hang
9800 * on to this mm purely for the purposes of being able to unaccount
9801 * memory (locked/pinned vm). It's not used for anything else.
9802 */
6b7898eb 9803 mmgrab(current->mm);
2aede0e4 9804 ctx->mm_account = current->mm;
6b7898eb 9805
9806#ifdef CONFIG_BLK_CGROUP
9807 /*
9808 * The sq thread will belong to the original cgroup it was inited in.
9809 * If the cgroup goes offline (e.g. disabling the io controller), then
9810 * issued bios will be associated with the closest cgroup later in the
9811 * block layer.
9812 */
9813 rcu_read_lock();
9814 ctx->sqo_blkcg_css = blkcg_css();
9815 ret = css_tryget_online(ctx->sqo_blkcg_css);
9816 rcu_read_unlock();
9817 if (!ret) {
9818 /* don't init against a dying cgroup, have the user try again */
9819 ctx->sqo_blkcg_css = NULL;
9820 ret = -ENODEV;
9821 goto err;
9822 }
9823#endif
9824 ret = io_allocate_scq_urings(ctx, p);
9825 if (ret)
9826 goto err;
9827
7e84e1c7 9828 ret = io_sq_offload_create(ctx, p);
9829 if (ret)
9830 goto err;
9831
9832 if (!(p->flags & IORING_SETUP_R_DISABLED))
9833 io_sq_offload_start(ctx);
9834
2b188cc1 9835 memset(&p->sq_off, 0, sizeof(p->sq_off));
9836 p->sq_off.head = offsetof(struct io_rings, sq.head);
9837 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9838 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9839 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9840 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9841 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9842 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
9843
9844 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
9845 p->cq_off.head = offsetof(struct io_rings, cq.head);
9846 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9847 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9848 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9849 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9850 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 9851 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 9852
9853 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9854 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 9855 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9856 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9857 IORING_FEAT_EXT_ARG;
9858
9859 if (copy_to_user(params, p, sizeof(*p))) {
9860 ret = -EFAULT;
9861 goto err;
9862 }
d1719f70 9863
9864 file = io_uring_get_file(ctx);
9865 if (IS_ERR(file)) {
9866 ret = PTR_ERR(file);
9867 goto err;
9868 }
9869
9870 /*
9871 * Install ring fd as the very last thing, so we don't risk someone
9872 * having closed it before we finish setup
9873 */
9874 ret = io_uring_install_fd(ctx, file);
9875 if (ret < 0) {
06585c49 9876 io_disable_sqo_submit(ctx);
9877 /* fput will clean it up */
9878 fput(file);
9879 return ret;
9880 }
044c1ab3 9881
c826bd7a 9882 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9883 return ret;
9884err:
d9d05217 9885 io_disable_sqo_submit(ctx);
9886 io_ring_ctx_wait_and_kill(ctx);
9887 return ret;
9888}
9889
9890/*
9891 * Sets up an io_uring context, and returns the fd. Applications ask for a
9892 * ring size; we return the actual sq/cq ring sizes (among other things) in the
9893 * params structure passed in.
9894 */
9895static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9896{
9897 struct io_uring_params p;
9898 int i;
9899
9900 if (copy_from_user(&p, params, sizeof(p)))
9901 return -EFAULT;
9902 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9903 if (p.resv[i])
9904 return -EINVAL;
9905 }
9906
6c271ce2 9907 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 9908 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9909 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9910 IORING_SETUP_R_DISABLED))
9911 return -EINVAL;
9912
7f13657d 9913 return io_uring_create(entries, &p, params);
9914}
9915
9916SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9917 struct io_uring_params __user *, params)
9918{
9919 return io_uring_setup(entries, params);
9920}
9921
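/*
 * Setup sketch from the application side (illustrative, not kernel code):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 100, &p);
 *
 * Per the sizing rules in io_uring_create() above, entries == 100 is
 * rounded up to p.sq_entries == 128 and, without IORING_SETUP_CQSIZE, the
 * CQ ring gets twice that, p.cq_entries == 256.  The returned fd is then
 * mmap()ed as sketched near io_uring_validate_mmap_request().
 */
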
9922static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9923{
9924 struct io_uring_probe *p;
9925 size_t size;
9926 int i, ret;
9927
9928 size = struct_size(p, ops, nr_args);
9929 if (size == SIZE_MAX)
9930 return -EOVERFLOW;
9931 p = kzalloc(size, GFP_KERNEL);
9932 if (!p)
9933 return -ENOMEM;
9934
9935 ret = -EFAULT;
9936 if (copy_from_user(p, arg, size))
9937 goto out;
9938 ret = -EINVAL;
9939 if (memchr_inv(p, 0, size))
9940 goto out;
9941
9942 p->last_op = IORING_OP_LAST - 1;
9943 if (nr_args > IORING_OP_LAST)
9944 nr_args = IORING_OP_LAST;
9945
9946 for (i = 0; i < nr_args; i++) {
9947 p->ops[i].op = i;
9948 if (!io_op_defs[i].not_supported)
9949 p->ops[i].flags = IO_URING_OP_SUPPORTED;
9950 }
9951 p->ops_len = i;
9952
9953 ret = 0;
9954 if (copy_to_user(arg, p, size))
9955 ret = -EFAULT;
9956out:
9957 kfree(p);
9958 return ret;
9959}
9960
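/*
 * Probe usage sketch (illustrative, not kernel code): an application asks
 * which opcodes the running kernel supports by registering a probe sized
 * for every opcode and then checking the per-op flags; "ring_fd" is an
 * assumed ring fd:
 *
 *	struct io_uring_probe *probe;
 *
 *	probe = calloc(1, sizeof(*probe) +
 *			  IORING_OP_LAST * sizeof(struct io_uring_probe_op));
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		probe, IORING_OP_LAST);
 *
 * Afterwards probe->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED
 * (for example) tells the application whether IORING_OP_READV is usable.
 */
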
9961static int io_register_personality(struct io_ring_ctx *ctx)
9962{
9963 struct io_identity *id;
9964 int ret;
071698e1 9965
9966 id = kmalloc(sizeof(*id), GFP_KERNEL);
9967 if (unlikely(!id))
9968 return -ENOMEM;
9969
9970 io_init_identity(id);
9971 id->creds = get_current_cred();
9972
9973 ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
9974 if (ret < 0) {
9975 put_cred(id->creds);
9976 kfree(id);
9977 }
9978 return ret;
9979}
9980
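/*
 * Personality sketch (illustrative, not kernel code): the id returned by
 * IORING_REGISTER_PERSONALITY is what the application later stores in
 * sqe->personality so that a particular request is issued with the
 * credentials captured here; "ring_fd" and "sqe" are assumed:
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;
 */
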
9981static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9982 unsigned int nr_args)
9983{
9984 struct io_uring_restriction *res;
9985 size_t size;
9986 int i, ret;
9987
9988 /* Restrictions allowed only if rings started disabled */
9989 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9990 return -EBADFD;
9991
21b55dbc 9992 /* We allow only a single restrictions registration */
7e84e1c7 9993 if (ctx->restrictions.registered)
9994 return -EBUSY;
9995
9996 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9997 return -EINVAL;
9998
9999 size = array_size(nr_args, sizeof(*res));
10000 if (size == SIZE_MAX)
10001 return -EOVERFLOW;
10002
10003 res = memdup_user(arg, size);
10004 if (IS_ERR(res))
10005 return PTR_ERR(res);
10006
10007 ret = 0;
10008
10009 for (i = 0; i < nr_args; i++) {
10010 switch (res[i].opcode) {
10011 case IORING_RESTRICTION_REGISTER_OP:
10012 if (res[i].register_op >= IORING_REGISTER_LAST) {
10013 ret = -EINVAL;
10014 goto out;
10015 }
10016
10017 __set_bit(res[i].register_op,
10018 ctx->restrictions.register_op);
10019 break;
10020 case IORING_RESTRICTION_SQE_OP:
10021 if (res[i].sqe_op >= IORING_OP_LAST) {
10022 ret = -EINVAL;
10023 goto out;
10024 }
10025
10026 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10027 break;
10028 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10029 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10030 break;
10031 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10032 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10033 break;
10034 default:
10035 ret = -EINVAL;
10036 goto out;
10037 }
10038 }
10039
10040out:
10041 /* Reset all restrictions if an error happened */
10042 if (ret != 0)
10043 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10044 else
7e84e1c7 10045 ctx->restrictions.registered = true;
10046
10047 kfree(res);
10048 return ret;
10049}
10050
10051static int io_register_enable_rings(struct io_ring_ctx *ctx)
10052{
10053 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10054 return -EBADFD;
10055
10056 if (ctx->restrictions.registered)
10057 ctx->restricted = 1;
10058
10059 ctx->flags &= ~IORING_SETUP_R_DISABLED;
10060
10061 io_sq_offload_start(ctx);
10062
10063 return 0;
10064}
10065
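/*
 * Restriction flow sketch (illustrative, not kernel code): a ring created
 * with IORING_SETUP_R_DISABLED starts out unable to submit.  The creator
 * can register an allow-list and only then enable submission, e.g. before
 * handing the fd to a less trusted task ("ring_fd" assumed):
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_WRITEV },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RESTRICTIONS,
 *		res, 2);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_ENABLE_RINGS,
 *		NULL, 0);
 */
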
10066static bool io_register_op_must_quiesce(int op)
10067{
10068 switch (op) {
10069 case IORING_UNREGISTER_FILES:
10070 case IORING_REGISTER_FILES_UPDATE:
10071 case IORING_REGISTER_PROBE:
10072 case IORING_REGISTER_PERSONALITY:
10073 case IORING_UNREGISTER_PERSONALITY:
10074 return false;
10075 default:
10076 return true;
10077 }
10078}
10079
10080static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10081 void __user *arg, unsigned nr_args)
10082 __releases(ctx->uring_lock)
10083 __acquires(ctx->uring_lock)
10084{
10085 int ret;
10086
10087 /*
10088 * We're inside the ring mutex, if the ref is already dying, then
10089 * someone else killed the ctx or is already going through
10090 * io_uring_register().
10091 */
10092 if (percpu_ref_is_dying(&ctx->refs))
10093 return -ENXIO;
10094
071698e1 10095 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 10096 percpu_ref_kill(&ctx->refs);
b19062a5 10097
10098 /*
10099 * Drop uring mutex before waiting for references to exit. If
10100 * another thread is currently inside io_uring_enter() it might
10101 * need to grab the uring_lock to make progress. If we hold it
10102 * here across the drain wait, then we can deadlock. It's safe
10103 * to drop the mutex here, since no new references will come in
10104 * after we've killed the percpu ref.
10105 */
10106 mutex_unlock(&ctx->uring_lock);
10107 do {
10108 ret = wait_for_completion_interruptible(&ctx->ref_comp);
10109 if (!ret)
10110 break;
10111 ret = io_run_task_work_sig();
10112 if (ret < 0)
10113 break;
10114 } while (1);
10115
05f3fb3c 10116 mutex_lock(&ctx->uring_lock);
af9c1a44 10117
10118 if (ret && io_refs_resurrect(&ctx->refs, &ctx->ref_comp))
10119 return ret;
10120 }
10121
10122 if (ctx->restricted) {
10123 if (opcode >= IORING_REGISTER_LAST) {
10124 ret = -EINVAL;
10125 goto out;
10126 }
10127
10128 if (!test_bit(opcode, ctx->restrictions.register_op)) {
10129 ret = -EACCES;
10130 goto out;
10131 }
05f3fb3c 10132 }
10133
10134 switch (opcode) {
10135 case IORING_REGISTER_BUFFERS:
0a96bbe4 10136 ret = io_sqe_buffers_register(ctx, arg, nr_args);
10137 break;
10138 case IORING_UNREGISTER_BUFFERS:
10139 ret = -EINVAL;
10140 if (arg || nr_args)
10141 break;
0a96bbe4 10142 ret = io_sqe_buffers_unregister(ctx);
edafccee 10143 break;
10144 case IORING_REGISTER_FILES:
10145 ret = io_sqe_files_register(ctx, arg, nr_args);
10146 break;
10147 case IORING_UNREGISTER_FILES:
10148 ret = -EINVAL;
10149 if (arg || nr_args)
10150 break;
10151 ret = io_sqe_files_unregister(ctx);
10152 break;
10153 case IORING_REGISTER_FILES_UPDATE:
10154 ret = io_sqe_files_update(ctx, arg, nr_args);
10155 break;
9b402849 10156 case IORING_REGISTER_EVENTFD:
f2842ab5 10157 case IORING_REGISTER_EVENTFD_ASYNC:
10158 ret = -EINVAL;
10159 if (nr_args != 1)
10160 break;
10161 ret = io_eventfd_register(ctx, arg);
10162 if (ret)
10163 break;
10164 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10165 ctx->eventfd_async = 1;
10166 else
10167 ctx->eventfd_async = 0;
10168 break;
10169 case IORING_UNREGISTER_EVENTFD:
10170 ret = -EINVAL;
10171 if (arg || nr_args)
10172 break;
10173 ret = io_eventfd_unregister(ctx);
10174 break;
10175 case IORING_REGISTER_PROBE:
10176 ret = -EINVAL;
10177 if (!arg || nr_args > 256)
10178 break;
10179 ret = io_probe(ctx, arg, nr_args);
10180 break;
10181 case IORING_REGISTER_PERSONALITY:
10182 ret = -EINVAL;
10183 if (arg || nr_args)
10184 break;
10185 ret = io_register_personality(ctx);
10186 break;
10187 case IORING_UNREGISTER_PERSONALITY:
10188 ret = -EINVAL;
10189 if (arg)
10190 break;
10191 ret = io_unregister_personality(ctx, nr_args);
10192 break;
10193 case IORING_REGISTER_ENABLE_RINGS:
10194 ret = -EINVAL;
10195 if (arg || nr_args)
10196 break;
10197 ret = io_register_enable_rings(ctx);
10198 break;
10199 case IORING_REGISTER_RESTRICTIONS:
10200 ret = io_register_restrictions(ctx, arg, nr_args);
10201 break;
10202 default:
10203 ret = -EINVAL;
10204 break;
10205 }
10206
21b55dbc 10207out:
071698e1 10208 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 10209 /* bring the ctx back to life */
05f3fb3c 10210 percpu_ref_reinit(&ctx->refs);
0f158b4c 10211 reinit_completion(&ctx->ref_comp);
05f3fb3c 10212 }
10213 return ret;
10214}
10215
10216SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10217 void __user *, arg, unsigned int, nr_args)
10218{
10219 struct io_ring_ctx *ctx;
10220 long ret = -EBADF;
10221 struct fd f;
10222
10223 f = fdget(fd);
10224 if (!f.file)
10225 return -EBADF;
10226
10227 ret = -EOPNOTSUPP;
10228 if (f.file->f_op != &io_uring_fops)
10229 goto out_fput;
10230
10231 ctx = f.file->private_data;
10232
10233 io_run_task_work();
10234
10235 mutex_lock(&ctx->uring_lock);
10236 ret = __io_uring_register(ctx, opcode, arg, nr_args);
10237 mutex_unlock(&ctx->uring_lock);
10238 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10239 ctx->cq_ev_fd != NULL, ret);
10240out_fput:
10241 fdput(f);
10242 return ret;
10243}
10244
10245static int __init io_uring_init(void)
10246{
10247#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10248 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10249 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10250} while (0)
10251
10252#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10253 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10254 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10255 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
10256 BUILD_BUG_SQE_ELEM(1, __u8, flags);
10257 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
10258 BUILD_BUG_SQE_ELEM(4, __s32, fd);
10259 BUILD_BUG_SQE_ELEM(8, __u64, off);
10260 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
10261 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 10262 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
10263 BUILD_BUG_SQE_ELEM(24, __u32, len);
10264 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
10265 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
10266 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10267 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
10268 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
10269 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
10270 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
10271 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
10272 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
10273 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
10274 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
10275 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
10276 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
10277 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 10278 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
10279 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
10280 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
10281 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 10282 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
d7f62e82 10283
d3656344 10284 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
84557871 10285 BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
10286 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10287 SLAB_ACCOUNT);
10288 return 0;
10289};
10290__initcall(io_uring_init);