// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
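
/*
 * Illustrative userspace sketch of the ordering rules above (not part of
 * this file; cq_head, cq_tail, cq_mask, cqes and consume() are placeholder
 * names for whatever the application derived from io_uring_setup()/mmap()):
 *
 *	unsigned head = *cq_head;			// only userspace writes this
 *	unsigned tail = smp_load_acquire(cq_tail);	// pairs with the kernel's release store
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *		consume(cqe);				// read res/user_data/flags
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);		// orders entry loads before the head store
 *
 * liburing wraps the same pattern in io_uring_peek_cqe()/io_uring_cq_advance().
 */
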
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/* 512 entries per page on 64-bit archs, 64 pages max */
#define IORING_MAX_FIXED_FILES	(1U << 15)
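
/*
 * Arithmetic behind the limit above (assuming a 4K page): struct
 * io_fixed_file is a single unsigned long, i.e. 8 bytes on 64-bit, so one
 * page holds 512 entries and 64 pages give 64 * 512 = 32768 = 1U << 15.
 */
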
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	9
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
				IOSQE_BUFFER_SELECT)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
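
/*
 * Example of the masking described above (values picked for illustration):
 * with cq_ring_entries == 4096, cq_ring_mask == 4095, so a monotonically
 * increasing tail of 5000 maps to slot 5000 & 4095 == 904. Head and tail
 * are never wrapped explicitly; only the masked value indexes cqes[].
 */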

enum io_uring_cmd_flags {
	IO_URING_F_NONBLOCK		= 1,
	IO_URING_F_COMPLETE_DEFER	= 2,
};

struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	struct bio_vec	bvec[];
};

struct io_ring_ctx;

struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_file_table {
	struct io_fixed_file *files;
};

struct io_rsrc_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	struct io_rsrc_data		*rsrc_data;
	struct llist_node		llist;
	bool				done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx		*ctx;

	u64				**tags;
	unsigned int			nr;
	rsrc_put_fn			*do_put;
	atomic_t			refs;
	struct completion		done;
	bool				quiesce;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	unsigned long		state;
	struct completion	exited;
};

#define IO_IOPOLL_BATCH			8
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

struct io_comp_state {
	struct io_kiocb		*reqs[IO_COMPL_BATCH];
	unsigned int		nr;
	/* inline/task_work completion list, under ->uring_lock */
	struct list_head	free_list;
};

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	struct blk_plug		plug;
	struct io_submit_link	link;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_REQ_CACHE_SIZE];
	unsigned int		free_reqs;

	bool			plug_started;

	/*
	 * Batch completion logic
	 */
	struct io_comp_state	comp;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		file_refs;
	unsigned int		ios_left;
};

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		struct percpu_ref	refs;

		struct io_rings		*rings;
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex		uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		struct io_uring_sqe	*sq_sqes;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		struct list_head	defer_list;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node	*rsrc_node;
		struct io_file_table	file_table;
		unsigned		nr_user_files;
		unsigned		nr_user_bufs;
		struct io_mapped_ubuf	**user_bufs;

		struct io_submit_state	submit_state;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;
		struct xarray		io_buffers;
		struct xarray		personalities;
		u32			pers_next;
		unsigned		sq_thread_idle;
	} ____cacheline_aligned_in_smp;
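
	/*
	 * Illustrative sketch of the sq_array indirection described above
	 * (not a verbatim copy of the submission path; names mirror the
	 * members it uses): to pull the next SQE the kernel conceptually does
	 *
	 *	sq_idx = ctx->cached_sq_head++ & (ctx->sq_entries - 1);
	 *	head   = READ_ONCE(ctx->sq_array[sq_idx]);
	 *	if (head < ctx->sq_entries)
	 *		sqe = &ctx->sq_sqes[head];
	 *
	 * i.e. the application-owned sq_array holds indices into sq_sqes[],
	 * so SQE slots can be filled and submitted in any order by userspace.
	 */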

	/* IRQ completion list, under ->completion_lock */
	struct list_head	locked_free_list;
	unsigned int		locked_free_nr;

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	unsigned long		check_cq_overflow;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		struct eventfd_ctx	*cq_ev_fd;
		struct wait_queue_head	poll_wait;
		struct wait_queue_head	cq_wait;
		unsigned		cq_extra;
		atomic_t		cq_timeouts;
		struct fasync_struct	*cq_fasync;
		unsigned		cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_queue;
	} ____cacheline_aligned_in_smp;

	struct io_restriction		restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct {
		struct io_rsrc_node		*rsrc_backup_node;
		struct io_mapped_ubuf		*dummy_ubuf;
		struct io_rsrc_data		*file_data;
		struct io_rsrc_data		*buf_data;

		struct delayed_work		rsrc_put_work;
		struct llist_head		rsrc_put_llist;
		struct list_head		rsrc_ref_list;
		spinlock_t			rsrc_ref_lock;
	};

	/* Keep this last, we don't need it for the fast path */
	struct {
		#if defined(CONFIG_UNIX)
			struct socket		*ring_sock;
		#endif
		/* hashed buffered write serialization */
		struct io_wq_hash		*hash_map;

		/* Only used for accounting purposes */
		struct user_struct		*user;
		struct mm_struct		*mm_account;

		/* ctx exit and cancelation */
		struct llist_head		fallback_llist;
		struct delayed_work		fallback_work;
		struct work_struct		exit_work;
		struct list_head		tctx_list;
		struct completion		ref_comp;
	};
};

struct io_uring_task {
	/* submission side */
	int			cached_refs;
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	unsigned long		task_state;
	struct callback_head	task_work;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_close {
	struct file			*file;
	int				fd;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	unsigned long			nofile;
};

struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
};

struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
};

struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};

struct io_open {
	struct file			*file;
	int				dfd;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};

struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};

struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};

struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	const char __user		*filename;
	struct statx __user		*buffer;
};

struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};

struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};

struct io_completion {
	struct file			*file;
	struct list_head		list;
	u32				cflags;
};

struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec			*free_iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	const struct iovec		*free_iovec;
	struct iov_iter			iter;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};

enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT		= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_LTIMEOUT_ACTIVE_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_DONT_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_NOWAIT_READ_BIT,
	REQ_F_NOWAIT_WRITE_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};
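
/*
 * Layout note on the enum above (values fixed by the uapi): the low eight
 * bits mirror the IOSQE_* flags an application sets in the SQE, e.g.
 * REQ_F_FIXED_FILE == IOSQE_FIXED_FILE == 1 << 0, while the first
 * kernel-internal flag, REQ_F_FAIL, lands at bit 8 (BIT(8) == 0x100) and
 * therefore can never collide with the user-visible flags.
 */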

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),

	/* fail rest of links */
	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* linked timeout is active, i.e. prepared by link's head */
	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	/* don't attempt request reissue, see io_rw_reissue() */
	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
	/* supports async reads */
	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),
	/* supports async writes */
	REQ_F_NOWAIT_WRITE	= BIT(REQ_F_NOWAIT_WRITE_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
};

struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req);

struct io_task_work {
	union {
		struct io_wq_work_node	node;
		struct llist_node	fallback_node;
	};
	io_req_tw_func_t		func;
};

enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		/* use only after cleaning per-op data, see io_clean_op() */
		struct io_completion	compl;
	};

	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;
	u32				result;

	struct io_ring_ctx		*ctx;
	unsigned int			flags;
	atomic_t			refs;
	struct task_struct		*task;
	u64				user_data;

	struct io_kiocb			*link;
	struct percpu_ref		*fixed_rsrc_refs;

	/* used with ctx->iopoll_list with reads/writes */
	struct list_head		inflight_entry;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	struct async_poll		*apoll;
	struct io_wq_work		work;
	const struct cred		*creds;

	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;
};

struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if it is going to be punted */
	unsigned		needs_async_setup : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {},
	[IORING_OP_STATX] = {},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
};

static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
				 long res, unsigned int cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_dismantle_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
				struct io_submit_state *state,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);

static void io_fallback_req_func(struct work_struct *unused);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->fixed_rsrc_refs) {
		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
		percpu_ref_get(req->fixed_rsrc_refs);
	}
}

static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
	bool got = percpu_ref_tryget(ref);

	/* already at zero, wait for ->release() */
	if (!got)
		wait_for_completion(compl);
	percpu_ref_resurrect(ref);
	if (got)
		percpu_ref_put(ref);
}

static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
			  bool cancel_all)
{
	struct io_kiocb *req;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
}

static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	return !req->timeout.off;
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
				   GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);
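	/*
	 * Worked example of the sizing above: for p->cq_entries == 4096,
	 * ilog2(4096) == 12, so hash_bits == 7 and the table has 128
	 * buckets; a completely full CQ then averages 4096 / 128 == 32
	 * entries per hash list, matching the comment's target.
	 */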

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	init_waitqueue_head(&ctx->poll_wait);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	INIT_LIST_HEAD(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
	INIT_LIST_HEAD(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

#define FFS_ASYNC_READ		0x1UL
#define FFS_ASYNC_WRITE		0x2UL
#ifdef CONFIG_64BIT
#define FFS_ISREG		0x4UL
#else
#define FFS_ISREG		0x0UL
#endif
#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
}
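
/*
 * Illustrative sketch of the FFS_* packing used by struct io_fixed_file
 * (it mirrors what the fixed-file table helpers elsewhere in this file do):
 * struct file pointers are at least word aligned, so the low bits are free
 * to carry per-file capability flags.
 *
 *	file_ptr  = (unsigned long)file;
 *	file_ptr |= FFS_ASYNC_READ | FFS_ISREG;		// tag the table slot
 *	...
 *	file      = (struct file *)(file_ptr & FFS_MASK);
 *	nowait_rd = file_ptr & FFS_ASYNC_READ;		// cheap per-op check
 */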

static void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&current->io_uring->inflight_tracked);
	}
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}

	switch (req->opcode) {
	case IORING_OP_SPLICE:
	case IORING_OP_TEE:
		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
			req->work.flags |= IO_WQ_WORK_UNBOUND;
		break;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->completion_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->completion_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

static void io_queue_async_work(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
					&req->work, req->flags);
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static void io_kill_timeout(struct io_kiocb *req, int status)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_timeout_data *io = req->async_data;

	if (hrtimer_try_to_cancel(&io->timer) != -1) {
		atomic_set(&req->ctx->cq_timeouts,
			atomic_read(&req->ctx->cq_timeouts) + 1);
		list_del_init(&req->timeout.list);
		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
		io_put_req_deferred(req, 1);
	}
}

static void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

	while (!list_empty(&ctx->timeout_list)) {
		u32 events_needed, events_got;
		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
						struct io_kiocb, timeout.list);

		if (io_is_timeout_noseq(req))
			break;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
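		/*
		 * Concrete instance of the wraparound-safe check below
		 * (numbers picked for illustration): with cq_last_tm_flush
		 * == 0xfffffff0, target_seq == 0x10 and seq == 0x20,
		 * events_needed == 0x20 and events_got == 0x30, so the
		 * timeout fires even though the raw values wrapped past 0.
		 */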
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		list_del_init(&req->timeout.list);
		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
}

static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active)
		io_queue_deferred(ctx);
}

static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
		__io_commit_cqring_flush(ctx);
	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail, mask = ctx->cq_entries - 1;

	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return NULL;

	tail = ctx->cached_cq_tail++;
	return &rings->cqes[tail & mask];
}
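
/*
 * Note on the pairing with the header comment: io_get_cqe() only advances
 * the private cached_cq_tail; the application-visible rings->cq.tail is
 * published later by io_commit_cqring() via smp_store_release(), after the
 * CQE fields have been written with WRITE_ONCE() by the caller.
 */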

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
	if (likely(!ctx->cq_ev_fd))
		return false;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return false;
	return !ctx->eventfd_async || io_wq_current_is_worker();
}

static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	/*
	 * wake_up_all() may seem excessive, but io_wake_function() and
	 * io_should_wake() handle the termination of the loop and only
	 * wake as many waiters as we need to.
	 */
	if (wq_has_sleeper(&ctx->cq_wait))
		wake_up_all(&ctx->cq_wait);
	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
	if (waitqueue_active(&ctx->poll_wait)) {
		wake_up_interruptible(&ctx->poll_wait);
		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
	}
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (wq_has_sleeper(&ctx->cq_wait))
			wake_up_all(&ctx->cq_wait);
	}
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
	if (waitqueue_active(&ctx->poll_wait)) {
		wake_up_interruptible(&ctx->poll_wait);
		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
	}
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	unsigned long flags;
	bool all_flushed, posted;

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	posted = false;
	spin_lock_irqsave(&ctx->completion_lock, flags);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool ret = true;

	if (test_bit(0, &ctx->check_cq_overflow)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, force);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
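
/*
 * How the check above works (worked values): a healthy refcount of 1 gives
 * 1 + 127 == 128 > 127, so the macro is false; a refcount of 0 gives
 * exactly 127, and an underflowed count of -1 wraps to 126, so both the
 * already-zero and underflowed cases trip the WARN_ON_ONCE()s in the
 * req_ref_* helpers below.
 */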

static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
	return atomic_inc_not_zero(&req->refs);
}

static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
{
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_sub_and_test(refs, &req->refs);
}

static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_dec_and_test(&req->refs);
}

static inline void req_ref_put(struct io_kiocb *req)
{
	WARN_ON_ONCE(req_ref_put_and_test(req));
}

static inline void req_ref_get(struct io_kiocb *req)
{
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	atomic_inc(&req->refs);
}

/* must be called somewhat shortly after putting a request */
1573static inline void io_put_task(struct task_struct *task, int nr)
1574{
1575 struct io_uring_task *tctx = task->io_uring;
1576
1577 percpu_counter_sub(&tctx->inflight, nr);
1578 if (unlikely(atomic_read(&tctx->in_idle)))
1579 wake_up(&tctx->wait);
1580 put_task_struct_many(task, nr);
1581}
1582
d4d19c19
PB
1583static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1584 long res, unsigned int cflags)
2b188cc1 1585{
cce4b8b0 1586 struct io_overflow_cqe *ocqe;
2b188cc1 1587
cce4b8b0
PB
1588 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1589 if (!ocqe) {
1590 /*
1591 * If we're in ring overflow flush mode, or in task cancel mode,
1592 * or cannot allocate an overflow entry, then we need to drop it
1593 * on the floor.
1594 */
8f6ed49a 1595 io_account_cq_overflow(ctx);
cce4b8b0 1596 return false;
2b188cc1 1597 }
cce4b8b0 1598 if (list_empty(&ctx->cq_overflow_list)) {
5ed7a37d 1599 set_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1600 WRITE_ONCE(ctx->rings->sq_flags,
1601 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1602
cce4b8b0 1603 }
d4d19c19 1604 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
1605 ocqe->cqe.res = res;
1606 ocqe->cqe.flags = cflags;
1607 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1608 return true;
2b188cc1
JA
1609}
1610
d4d19c19
PB
1611static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1612 long res, unsigned int cflags)
2b188cc1
JA
1613{
1614 struct io_uring_cqe *cqe;
1615
d4d19c19 1616 trace_io_uring_complete(ctx, user_data, res, cflags);
51c3ff62 1617
2b188cc1
JA
1618 /*
1619 * If we can't get a cq entry, userspace overflowed the
1620 * submission (by quite a lot). Increment the overflow count in
1621 * the ring.
1622 */
d068b506 1623 cqe = io_get_cqe(ctx);
1d7bb1d5 1624 if (likely(cqe)) {
d4d19c19 1625 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 1626 WRITE_ONCE(cqe->res, res);
bcda7baa 1627 WRITE_ONCE(cqe->flags, cflags);
8d13326e 1628 return true;
2b188cc1 1629 }
d4d19c19 1630 return io_cqring_event_overflow(ctx, user_data, res, cflags);
2b188cc1
JA
1631}
1632
8d13326e 1633/* not as hot to bloat with inlining */
d4d19c19
PB
1634static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1635 long res, unsigned int cflags)
bcda7baa 1636{
d4d19c19 1637 return __io_cqring_fill_event(ctx, user_data, res, cflags);
bcda7baa
JA
1638}
1639
7a612350
PB
1640static void io_req_complete_post(struct io_kiocb *req, long res,
1641 unsigned int cflags)
2b188cc1 1642{
78e19bbe 1643 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1644 unsigned long flags;
1645
1646 spin_lock_irqsave(&ctx->completion_lock, flags);
d4d19c19 1647 __io_cqring_fill_event(ctx, req->user_data, res, cflags);
c7dae4ba
JA
1648 /*
1649 * If we're the last reference to this request, add to our locked
1650 * free_list cache.
1651 */
de9b4cca 1652 if (req_ref_put_and_test(req)) {
7a612350 1653 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
93d2bcd2 1654 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
7a612350
PB
1655 io_disarm_next(req);
1656 if (req->link) {
1657 io_req_task_queue(req->link);
1658 req->link = NULL;
1659 }
1660 }
c7dae4ba
JA
1661 io_dismantle_req(req);
1662 io_put_task(req->task, 1);
d0acdee2
PB
1663 list_add(&req->compl.list, &ctx->locked_free_list);
1664 ctx->locked_free_nr++;
180f829f
PB
1665 } else {
1666 if (!percpu_ref_tryget(&ctx->refs))
1667 req = NULL;
1668 }
7a612350 1669 io_commit_cqring(ctx);
2b188cc1 1670 spin_unlock_irqrestore(&ctx->completion_lock, flags);
7a612350 1671
180f829f
PB
1672 if (req) {
1673 io_cqring_ev_posted(ctx);
c7dae4ba 1674 percpu_ref_put(&ctx->refs);
180f829f 1675 }
229a7b63
JA
1676}
1677
4e3d9ff9
JA
1678static inline bool io_req_needs_clean(struct io_kiocb *req)
1679{
c854357b 1680 return req->flags & IO_REQ_CLEAN_FLAGS;
4e3d9ff9
JA
1681}
1682
a38d68db 1683static void io_req_complete_state(struct io_kiocb *req, long res,
889fca73 1684 unsigned int cflags)
229a7b63 1685{
4e3d9ff9 1686 if (io_req_needs_clean(req))
68fb8979 1687 io_clean_op(req);
a38d68db
PB
1688 req->result = res;
1689 req->compl.cflags = cflags;
e342c807 1690 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
1691}
1692
889fca73
PB
1693static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1694 long res, unsigned cflags)
bcda7baa 1695{
889fca73
PB
1696 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1697 io_req_complete_state(req, res, cflags);
a38d68db 1698 else
c7dae4ba 1699 io_req_complete_post(req, res, cflags);
bcda7baa
JA
1700}
1701
a38d68db 1702static inline void io_req_complete(struct io_kiocb *req, long res)
0ddf92e8 1703{
889fca73 1704 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
1705}
1706
f41db273
PB
1707static void io_req_complete_failed(struct io_kiocb *req, long res)
1708{
93d2bcd2 1709 req_set_fail(req);
f41db273
PB
1710 io_put_req(req);
1711 io_req_complete_post(req, res, 0);
1712}
1713
864ea921
PB
1714/*
1715 * Don't initialise the fields below on every allocation, but do that in
1716 * advance and keep them valid across allocations.
1717 */
1718static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1719{
1720 req->ctx = ctx;
1721 req->link = NULL;
1722 req->async_data = NULL;
1723 /* not necessary, but safer to zero */
1724 req->result = 0;
1725}
1726
dac7a098
PB
1727static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1728 struct io_comp_state *cs)
1729{
1730 spin_lock_irq(&ctx->completion_lock);
d0acdee2
PB
1731 list_splice_init(&ctx->locked_free_list, &cs->free_list);
1732 ctx->locked_free_nr = 0;
dac7a098
PB
1733 spin_unlock_irq(&ctx->completion_lock);
1734}
1735
dd78f492 1736/* Returns true IFF there are requests in the cache */
c7dae4ba 1737static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
0ddf92e8 1738{
c7dae4ba
JA
1739 struct io_submit_state *state = &ctx->submit_state;
1740 struct io_comp_state *cs = &state->comp;
dd78f492 1741 int nr;
0ddf92e8 1742
c7dae4ba
JA
1743 /*
1744 * If we have more than a batch's worth of requests in our IRQ side
1745 * locked cache, grab the lock and move them over to our submission
1746 * side cache.
1747 */
d0acdee2 1748 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
dac7a098 1749 io_flush_cached_locked_reqs(ctx, cs);
0ddf92e8 1750
dd78f492 1751 nr = state->free_reqs;
c7dae4ba 1752 while (!list_empty(&cs->free_list)) {
dd78f492
PB
1753 struct io_kiocb *req = list_first_entry(&cs->free_list,
1754 struct io_kiocb, compl.list);
1755
1b4c351f 1756 list_del(&req->compl.list);
dd78f492
PB
1757 state->reqs[nr++] = req;
1758 if (nr == ARRAY_SIZE(state->reqs))
e5d1bc0a 1759 break;
1b4c351f
JA
1760 }
1761
dd78f492
PB
1762 state->free_reqs = nr;
1763 return nr != 0;
0ddf92e8
JA
1764}
1765
e5d1bc0a 1766static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2b188cc1 1767{
e5d1bc0a 1768 struct io_submit_state *state = &ctx->submit_state;
864ea921
PB
1769 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1770 int ret, i;
e5d1bc0a 1771
fe7e3257 1772 BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
e5d1bc0a 1773
864ea921
PB
1774 if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
1775 goto got_req;
e5d1bc0a 1776
864ea921
PB
1777 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1778 state->reqs);
fd6fab2c 1779
864ea921
PB
1780 /*
1781 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1782 * retry single alloc to be on the safe side.
1783 */
1784 if (unlikely(ret <= 0)) {
1785 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1786 if (!state->reqs[0])
1787 return NULL;
1788 ret = 1;
2b188cc1 1789 }
864ea921
PB
1790
1791 for (i = 0; i < ret; i++)
1792 io_preinit_req(state->reqs[i], ctx);
1793 state->free_reqs = ret;
e5d1bc0a 1794got_req:
291b2821
PB
1795 state->free_reqs--;
1796 return state->reqs[state->free_reqs];
2b188cc1
JA
1797}
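/*
 * Allocation order, roughly: reuse state->reqs[] if the submission-side
 * cache has entries (possibly refilled from the locked free list via
 * io_flush_cached_reqs()), else bulk-allocate IO_REQ_ALLOC_BATCH requests
 * in one kmem_cache call, falling back to a single allocation if the bulk
 * attempt fails. Freshly allocated requests get io_preinit_req() so the
 * per-issue init path can stay minimal.
 */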
1798
e1d767f0 1799static inline void io_put_file(struct file *file)
8da11c19 1800{
e1d767f0 1801 if (file)
8da11c19
PB
1802 fput(file);
1803}
1804
4edf20f9 1805static void io_dismantle_req(struct io_kiocb *req)
2b188cc1 1806{
094bae49 1807 unsigned int flags = req->flags;
929a3af9 1808
3a0a6902
PB
1809 if (io_req_needs_clean(req))
1810 io_clean_op(req);
e1d767f0
PB
1811 if (!(flags & REQ_F_FIXED_FILE))
1812 io_put_file(req->file);
269bbe5f
BM
1813 if (req->fixed_rsrc_refs)
1814 percpu_ref_put(req->fixed_rsrc_refs);
99ebe4ef 1815 if (req->async_data) {
094bae49 1816 kfree(req->async_data);
99ebe4ef
PB
1817 req->async_data = NULL;
1818 }
e65ef56d
JA
1819}
1820
216578e5 1821static void __io_free_req(struct io_kiocb *req)
c6ca97b3 1822{
51a4cc11 1823 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 1824
216578e5 1825 io_dismantle_req(req);
7c660731 1826 io_put_task(req->task, 1);
c6ca97b3 1827
3893f39f 1828 kmem_cache_free(req_cachep, req);
ecfc5177 1829 percpu_ref_put(&ctx->refs);
e65ef56d
JA
1830}
1831
f2f87370
PB
1832static inline void io_remove_next_linked(struct io_kiocb *req)
1833{
1834 struct io_kiocb *nxt = req->link;
1835
1836 req->link = nxt->link;
1837 nxt->link = NULL;
1838}
1839
33cc89a9
PB
1840static bool io_kill_linked_timeout(struct io_kiocb *req)
1841 __must_hold(&req->ctx->completion_lock)
2665abfd 1842{
33cc89a9 1843 struct io_kiocb *link = req->link;
f2f87370 1844
900fad45
PB
1845 /*
 1846 * Can happen if a linked timeout fired and the link chain looked like
1847 * req -> link t-out -> link t-out [-> ...]
1848 */
c9abd7ad
PB
1849 if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1850 struct io_timeout_data *io = link->async_data;
7c86ffee 1851
f2f87370 1852 io_remove_next_linked(req);
90cd7e42 1853 link->timeout.head = NULL;
fd9c7bc5 1854 if (hrtimer_try_to_cancel(&io->timer) != -1) {
d4d19c19
PB
1855 io_cqring_fill_event(link->ctx, link->user_data,
1856 -ECANCELED, 0);
33cc89a9 1857 io_put_req_deferred(link, 1);
d4729fbd 1858 return true;
c9abd7ad
PB
1859 }
1860 }
d4729fbd 1861 return false;
7c86ffee
PB
1862}
1863
d148ca4b 1864static void io_fail_links(struct io_kiocb *req)
33cc89a9 1865 __must_hold(&req->ctx->completion_lock)
9e645e11 1866{
33cc89a9 1867 struct io_kiocb *nxt, *link = req->link;
9e645e11 1868
f2f87370 1869 req->link = NULL;
f2f87370
PB
1870 while (link) {
1871 nxt = link->link;
1872 link->link = NULL;
2665abfd 1873
f2f87370 1874 trace_io_uring_fail_link(req, link);
d4d19c19 1875 io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
1575f21a 1876 io_put_req_deferred(link, 2);
f2f87370 1877 link = nxt;
9e645e11 1878 }
33cc89a9 1879}
9e645e11 1880
33cc89a9
PB
1881static bool io_disarm_next(struct io_kiocb *req)
1882 __must_hold(&req->ctx->completion_lock)
1883{
1884 bool posted = false;
1885
1886 if (likely(req->flags & REQ_F_LINK_TIMEOUT))
1887 posted = io_kill_linked_timeout(req);
93d2bcd2 1888 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 1889 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
1890 posted |= (req->link != NULL);
1891 io_fail_links(req);
1892 }
1893 return posted;
9e645e11
JA
1894}
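/*
 * Called with completion_lock held: cancels a still-armed linked timeout
 * and, if the request failed and isn't a hardlink, fails the remainder of
 * the link chain. The return value tells the caller whether any CQEs were
 * filled, i.e. whether it needs to commit the ring and wake waiters.
 */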
1895
3fa5e0f3 1896static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
c69f8dbe 1897{
33cc89a9 1898 struct io_kiocb *nxt;
944e58bf 1899
9e645e11
JA
1900 /*
1901 * If LINK is set, we have dependent requests in this chain. If we
1902 * didn't fail this request, queue the first one up, moving any other
1903 * dependencies to the next request. In case of failure, fail the rest
1904 * of the chain.
1905 */
93d2bcd2 1906 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
33cc89a9
PB
1907 struct io_ring_ctx *ctx = req->ctx;
1908 unsigned long flags;
1909 bool posted;
1910
1911 spin_lock_irqsave(&ctx->completion_lock, flags);
1912 posted = io_disarm_next(req);
1913 if (posted)
1914 io_commit_cqring(req->ctx);
1915 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1916 if (posted)
1917 io_cqring_ev_posted(ctx);
f2f87370 1918 }
33cc89a9
PB
1919 nxt = req->link;
1920 req->link = NULL;
1921 return nxt;
4d7dd462 1922}
9e645e11 1923
f2f87370 1924static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
3fa5e0f3 1925{
cdbff982 1926 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
3fa5e0f3
PB
1927 return NULL;
1928 return __io_req_find_next(req);
1929}
1930
2c32395d
PB
1931static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1932{
1933 if (!ctx)
1934 return;
1935 if (ctx->submit_state.comp.nr) {
1936 mutex_lock(&ctx->uring_lock);
2a2758f2 1937 io_submit_flush_completions(ctx);
2c32395d
PB
1938 mutex_unlock(&ctx->uring_lock);
1939 }
1940 percpu_ref_put(&ctx->refs);
1941}
1942
7cbf1722 1943static void tctx_task_work(struct callback_head *cb)
c40f6379 1944{
ebd0df2e 1945 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
1946 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
1947 task_work);
c40f6379 1948
16f72070 1949 while (1) {
3f18407d
PB
1950 struct io_wq_work_node *node;
1951
1952 spin_lock_irq(&tctx->task_lock);
c6538be9 1953 node = tctx->task_list.first;
3f18407d
PB
1954 INIT_WQ_LIST(&tctx->task_list);
1955 spin_unlock_irq(&tctx->task_lock);
1956
3f18407d
PB
1957 while (node) {
1958 struct io_wq_work_node *next = node->next;
1959 struct io_kiocb *req = container_of(node, struct io_kiocb,
1960 io_task_work.node);
1961
1962 if (req->ctx != ctx) {
1963 ctx_flush_and_put(ctx);
1964 ctx = req->ctx;
1965 percpu_ref_get(&ctx->refs);
1966 }
5b0a6acc 1967 req->io_task_work.func(req);
3f18407d
PB
1968 node = next;
1969 }
7a778f9d 1970 if (wq_list_empty(&tctx->task_list)) {
110aa25c 1971 spin_lock_irq(&tctx->task_lock);
7a778f9d 1972 clear_bit(0, &tctx->task_state);
110aa25c
JA
1973 if (wq_list_empty(&tctx->task_list)) {
1974 spin_unlock_irq(&tctx->task_lock);
7a778f9d 1975 break;
110aa25c
JA
1976 }
1977 spin_unlock_irq(&tctx->task_lock);
7a778f9d
PB
1978 /* another tctx_task_work() is enqueued, yield */
1979 if (test_and_set_bit(0, &tctx->task_state))
1980 break;
1981 }
7cbf1722 1982 cond_resched();
3f18407d 1983 }
ebd0df2e
PB
1984
1985 ctx_flush_and_put(ctx);
7cbf1722
JA
1986}
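/*
 * The handler repeatedly splices the whole tctx->task_list under task_lock
 * and then runs the callbacks lock-free. Completions are batched per ring:
 * ctx_flush_and_put() flushes ->submit_state.comp and drops the ctx
 * reference whenever the next request belongs to a different ctx, or once
 * the list is drained.
 */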
1987
e09ee510 1988static void io_req_task_work_add(struct io_kiocb *req)
7cbf1722 1989{
c15b79de 1990 struct task_struct *tsk = req->task;
7cbf1722 1991 struct io_uring_task *tctx = tsk->io_uring;
c15b79de 1992 enum task_work_notify_mode notify;
e09ee510 1993 struct io_wq_work_node *node;
0b81e80c 1994 unsigned long flags;
7cbf1722
JA
1995
1996 WARN_ON_ONCE(!tctx);
1997
0b81e80c 1998 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722 1999 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
0b81e80c 2000 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2001
2002 /* task_work already pending, we're done */
2003 if (test_bit(0, &tctx->task_state) ||
2004 test_and_set_bit(0, &tctx->task_state))
e09ee510 2005 return;
7cbf1722 2006
c15b79de
PB
2007 /*
2008 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2009 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2010 * processing task_work. There's no reliable way to tell if TWA_RESUME
2011 * will do the job.
2012 */
2013 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
c15b79de
PB
2014 if (!task_work_add(tsk, &tctx->task_work, notify)) {
2015 wake_up_process(tsk);
e09ee510 2016 return;
c15b79de 2017 }
2215bed9 2018
e09ee510 2019 clear_bit(0, &tctx->task_state);
0b81e80c 2020 spin_lock_irqsave(&tctx->task_lock, flags);
e09ee510
PB
2021 node = tctx->task_list.first;
2022 INIT_WQ_LIST(&tctx->task_list);
0b81e80c 2023 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2024
e09ee510
PB
2025 while (node) {
2026 req = container_of(node, struct io_kiocb, io_task_work.node);
2027 node = node->next;
2028 if (llist_add(&req->io_task_work.fallback_node,
2029 &req->ctx->fallback_llist))
2030 schedule_delayed_work(&req->ctx->fallback_work, 1);
2031 }
eab30c4d
PB
2032}
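/*
 * If task_work_add() fails (e.g. the task is exiting), the entries queued
 * above are pulled back off tctx->task_list and pushed onto the ring's
 * fallback_llist instead, to be run from io_fallback_req_func() via
 * delayed work rather than being lost.
 */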
2033
5b0a6acc 2034static void io_req_task_cancel(struct io_kiocb *req)
c40f6379 2035{
87ceb6a6 2036 struct io_ring_ctx *ctx = req->ctx;
c40f6379 2037
e83acd7d 2038 /* ctx is guaranteed to stay alive while we hold uring_lock */
792bb6eb 2039 mutex_lock(&ctx->uring_lock);
2593553a 2040 io_req_complete_failed(req, req->result);
792bb6eb 2041 mutex_unlock(&ctx->uring_lock);
c40f6379
JA
2042}
2043
5b0a6acc 2044static void io_req_task_submit(struct io_kiocb *req)
c40f6379
JA
2045{
2046 struct io_ring_ctx *ctx = req->ctx;
2047
04fc6c80 2048 /* ctx stays valid until unlock, even if we drop all our ctx->refs */
81b6d05c 2049 mutex_lock(&ctx->uring_lock);
9c688260 2050 if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
c5eef2b9 2051 __io_queue_sqe(req);
81b6d05c 2052 else
2593553a 2053 io_req_complete_failed(req, -EFAULT);
81b6d05c 2054 mutex_unlock(&ctx->uring_lock);
c40f6379
JA
2055}
2056
2c4b8eb6 2057static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 2058{
2c4b8eb6 2059 req->result = ret;
5b0a6acc 2060 req->io_task_work.func = io_req_task_cancel;
e09ee510 2061 io_req_task_work_add(req);
c40f6379
JA
2062}
2063
2c4b8eb6 2064static void io_req_task_queue(struct io_kiocb *req)
a3df7698 2065{
5b0a6acc 2066 req->io_task_work.func = io_req_task_submit;
e09ee510 2067 io_req_task_work_add(req);
a3df7698
PB
2068}
2069
773af691
JA
2070static void io_req_task_queue_reissue(struct io_kiocb *req)
2071{
2072 req->io_task_work.func = io_queue_async_work;
2073 io_req_task_work_add(req);
2074}
2075
f2f87370 2076static inline void io_queue_next(struct io_kiocb *req)
c69f8dbe 2077{
9b5f7bd9 2078 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
2079
2080 if (nxt)
906a8c3f 2081 io_req_task_queue(nxt);
c69f8dbe
JL
2082}
2083
c3524383 2084static void io_free_req(struct io_kiocb *req)
7a743e22 2085{
c3524383
PB
2086 io_queue_next(req);
2087 __io_free_req(req);
2088}
8766dd51 2089
2d6500d4 2090struct req_batch {
5af1d13e
PB
2091 struct task_struct *task;
2092 int task_refs;
1b4c351f 2093 int ctx_refs;
2d6500d4
PB
2094};
2095
5af1d13e
PB
2096static inline void io_init_req_batch(struct req_batch *rb)
2097{
5af1d13e 2098 rb->task_refs = 0;
9ae72463 2099 rb->ctx_refs = 0;
5af1d13e
PB
2100 rb->task = NULL;
2101}
2102
2d6500d4
PB
2103static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2104 struct req_batch *rb)
2105{
6e833d53 2106 if (rb->task)
7c660731 2107 io_put_task(rb->task, rb->task_refs);
9ae72463
PB
2108 if (rb->ctx_refs)
2109 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2d6500d4
PB
2110}
2111
6ff119a6
PB
2112static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2113 struct io_submit_state *state)
2d6500d4 2114{
f2f87370 2115 io_queue_next(req);
96670657 2116 io_dismantle_req(req);
2d6500d4 2117
e3bc8e9d 2118 if (req->task != rb->task) {
7c660731
PB
2119 if (rb->task)
2120 io_put_task(rb->task, rb->task_refs);
e3bc8e9d
JA
2121 rb->task = req->task;
2122 rb->task_refs = 0;
5af1d13e 2123 }
e3bc8e9d 2124 rb->task_refs++;
9ae72463 2125 rb->ctx_refs++;
5af1d13e 2126
bd759045 2127 if (state->free_reqs != ARRAY_SIZE(state->reqs))
6ff119a6 2128 state->reqs[state->free_reqs++] = req;
bd759045
PB
2129 else
2130 list_add(&req->compl.list, &state->comp.free_list);
7a743e22
PB
2131}
2132
2a2758f2 2133static void io_submit_flush_completions(struct io_ring_ctx *ctx)
282cdc86 2134 __must_hold(&req->ctx->uring_lock)
905c172f 2135{
2a2758f2 2136 struct io_comp_state *cs = &ctx->submit_state.comp;
905c172f 2137 int i, nr = cs->nr;
905c172f
PB
2138 struct req_batch rb;
2139
905c172f
PB
2140 spin_lock_irq(&ctx->completion_lock);
2141 for (i = 0; i < nr; i++) {
5182ed2e
PB
2142 struct io_kiocb *req = cs->reqs[i];
2143
d4d19c19
PB
2144 __io_cqring_fill_event(ctx, req->user_data, req->result,
2145 req->compl.cflags);
905c172f
PB
2146 }
2147 io_commit_cqring(ctx);
2148 spin_unlock_irq(&ctx->completion_lock);
905c172f 2149 io_cqring_ev_posted(ctx);
5182ed2e
PB
2150
2151 io_init_req_batch(&rb);
905c172f 2152 for (i = 0; i < nr; i++) {
5182ed2e 2153 struct io_kiocb *req = cs->reqs[i];
905c172f
PB
2154
2155 /* submission and completion refs */
de9b4cca 2156 if (req_ref_sub_and_test(req, 2))
6ff119a6 2157 io_req_free_batch(&rb, req, &ctx->submit_state);
905c172f
PB
2158 }
2159
2160 io_req_free_batch_finish(ctx, &rb);
2161 cs->nr = 0;
7a743e22
PB
2162}
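/*
 * Two passes over the deferred completions: the first fills all CQEs under
 * completion_lock and publishes them with a single io_commit_cqring(), the
 * second drops the submission + completion references and recycles any
 * freed requests through the req_batch helpers.
 */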
2163
ba816ad6
JA
2164/*
 2165 * Drop a reference to the request; if it was the last reference, return the
 2166 * next request in the chain (if there is one).
2167 */
0d85035a 2168static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2169{
9b5f7bd9
PB
2170 struct io_kiocb *nxt = NULL;
2171
de9b4cca 2172 if (req_ref_put_and_test(req)) {
9b5f7bd9 2173 nxt = io_req_find_next(req);
4d7dd462 2174 __io_free_req(req);
2a44f467 2175 }
9b5f7bd9 2176 return nxt;
2b188cc1
JA
2177}
2178
0d85035a 2179static inline void io_put_req(struct io_kiocb *req)
e65ef56d 2180{
de9b4cca 2181 if (req_ref_put_and_test(req))
e65ef56d 2182 io_free_req(req);
2b188cc1
JA
2183}
2184
216578e5
PB
2185static void io_free_req_deferred(struct io_kiocb *req)
2186{
5b0a6acc 2187 req->io_task_work.func = io_free_req;
e09ee510 2188 io_req_task_work_add(req);
216578e5
PB
2189}
2190
2191static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2192{
de9b4cca 2193 if (req_ref_sub_and_test(req, refs))
216578e5
PB
2194 io_free_req_deferred(req);
2195}
2196
6c503150 2197static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2198{
2199 /* See comment at the top of this file */
2200 smp_rmb();
e23de15f 2201 return __io_cqring_events(ctx);
a3a0e43f
JA
2202}
2203
fb5ccc98
PB
2204static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2205{
2206 struct io_rings *rings = ctx->rings;
2207
2208 /* make sure SQ entry isn't read before tail */
2209 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2210}
2211
8ff069bf 2212static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
e94f141b 2213{
8ff069bf 2214 unsigned int cflags;
e94f141b 2215
bcda7baa
JA
2216 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2217 cflags |= IORING_CQE_F_BUFFER;
0e1b6fe3 2218 req->flags &= ~REQ_F_BUFFER_SELECTED;
bcda7baa
JA
2219 kfree(kbuf);
2220 return cflags;
e94f141b
JA
2221}
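/*
 * The returned cflags carry the provided-buffer id in the upper bits
 * (bid << IORING_CQE_BUFFER_SHIFT) together with IORING_CQE_F_BUFFER, so
 * userspace can tell from the CQE which buffer of the group holds the data.
 */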
2222
8ff069bf 2223static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
bcda7baa 2224{
4d954c25 2225 struct io_buffer *kbuf;
bcda7baa 2226
4d954c25 2227 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
8ff069bf
PB
2228 return io_put_kbuf(req, kbuf);
2229}
2230
4c6e277c
JA
2231static inline bool io_run_task_work(void)
2232{
ef98eb04 2233 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
4c6e277c 2234 __set_current_state(TASK_RUNNING);
ef98eb04 2235 tracehook_notify_signal();
4c6e277c
JA
2236 return true;
2237 }
2238
2239 return false;
bcda7baa
JA
2240}
2241
def596e9
JA
2242/*
2243 * Find and free completed poll iocbs
2244 */
2245static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
3c30ef0f 2246 struct list_head *done, bool resubmit)
def596e9 2247{
8237e045 2248 struct req_batch rb;
def596e9 2249 struct io_kiocb *req;
bbde017a
XW
2250
2251 /* order with ->result store in io_complete_rw_iopoll() */
2252 smp_rmb();
def596e9 2253
5af1d13e 2254 io_init_req_batch(&rb);
def596e9 2255 while (!list_empty(done)) {
bcda7baa
JA
2256 int cflags = 0;
2257
d21ffe7e 2258 req = list_first_entry(done, struct io_kiocb, inflight_entry);
f161340d
PB
2259 list_del(&req->inflight_entry);
2260
3c30ef0f 2261 if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
8c130827 2262 !(req->flags & REQ_F_DONT_REISSUE)) {
bbde017a 2263 req->iopoll_completed = 0;
8c130827 2264 req_ref_get(req);
773af691 2265 io_req_task_queue_reissue(req);
8c130827 2266 continue;
bbde017a 2267 }
def596e9 2268
bcda7baa 2269 if (req->flags & REQ_F_BUFFER_SELECTED)
8ff069bf 2270 cflags = io_put_rw_kbuf(req);
bcda7baa 2271
d4d19c19 2272 __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
def596e9
JA
2273 (*nr_events)++;
2274
de9b4cca 2275 if (req_ref_put_and_test(req))
6ff119a6 2276 io_req_free_batch(&rb, req, &ctx->submit_state);
def596e9 2277 }
def596e9 2278
09bb8394 2279 io_commit_cqring(ctx);
80c18e4a 2280 io_cqring_ev_posted_iopoll(ctx);
2d6500d4 2281 io_req_free_batch_finish(ctx, &rb);
581f9810
BM
2282}
2283
def596e9 2284static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
3c30ef0f 2285 long min, bool resubmit)
def596e9
JA
2286{
2287 struct io_kiocb *req, *tmp;
2288 LIST_HEAD(done);
2289 bool spin;
def596e9
JA
2290
2291 /*
2292 * Only spin for completions if we don't have multiple devices hanging
2293 * off our complete list, and we're under the requested amount.
2294 */
915b3dde 2295 spin = !ctx->poll_multi_queue && *nr_events < min;
def596e9 2296
d21ffe7e 2297 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
9adbd45d 2298 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 2299 int ret;
def596e9
JA
2300
2301 /*
581f9810
BM
2302 * Move completed and retryable entries to our local lists.
2303 * If we find a request that requires polling, break out
2304 * and complete those lists first, if we have entries there.
def596e9 2305 */
65a6543d 2306 if (READ_ONCE(req->iopoll_completed)) {
d21ffe7e 2307 list_move_tail(&req->inflight_entry, &done);
def596e9
JA
2308 continue;
2309 }
2310 if (!list_empty(&done))
2311 break;
2312
2313 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
a2416e1e
PB
2314 if (unlikely(ret < 0))
2315 return ret;
2316 else if (ret)
2317 spin = false;
def596e9 2318
3aadc23e
PB
2319 /* iopoll may have completed current req */
2320 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2321 list_move_tail(&req->inflight_entry, &done);
def596e9
JA
2322 }
2323
2324 if (!list_empty(&done))
3c30ef0f 2325 io_iopoll_complete(ctx, nr_events, &done, resubmit);
def596e9 2326
a2416e1e 2327 return 0;
def596e9
JA
2328}
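/*
 * Note that spinning in ->iopoll() is only allowed while all requests on
 * ctx->iopoll_list target the same queue and we're still short of 'min'
 * events; completed requests are collected on a local list and finished in
 * one go by io_iopoll_complete().
 */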
2329
def596e9
JA
2330/*
2331 * We can't just wait for polled events to come to us, we have to actively
2332 * find and complete them.
2333 */
b2edc0a7 2334static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2335{
2336 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2337 return;
2338
2339 mutex_lock(&ctx->uring_lock);
540e32a0 2340 while (!list_empty(&ctx->iopoll_list)) {
def596e9
JA
2341 unsigned int nr_events = 0;
2342
3c30ef0f 2343 io_do_iopoll(ctx, &nr_events, 0, false);
08f5439f 2344
b2edc0a7
PB
2345 /* let it sleep and repeat later if can't complete a request */
2346 if (nr_events == 0)
2347 break;
08f5439f
JA
2348 /*
 2349 * Ensure we allow local-to-the-cpu processing to take place;
 2350 * in this case we need to ensure that we reap all events.
3fcee5a6 2351 * Also let task_work, etc. progress by releasing the mutex
08f5439f 2352 */
3fcee5a6
PB
2353 if (need_resched()) {
2354 mutex_unlock(&ctx->uring_lock);
2355 cond_resched();
2356 mutex_lock(&ctx->uring_lock);
2357 }
def596e9
JA
2358 }
2359 mutex_unlock(&ctx->uring_lock);
2360}
2361
7668b92a 2362static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2363{
7668b92a 2364 unsigned int nr_events = 0;
e9979b36 2365 int ret = 0;
500f9fba 2366
c7849be9
XW
2367 /*
2368 * We disallow the app entering submit/complete with polling, but we
2369 * still need to lock the ring to prevent racing with polled issue
2370 * that got punted to a workqueue.
2371 */
2372 mutex_lock(&ctx->uring_lock);
f39c8a5b
PB
2373 /*
2374 * Don't enter poll loop if we already have events pending.
2375 * If we do, we can potentially be spinning for commands that
2376 * already triggered a CQE (eg in error).
2377 */
5ed7a37d 2378 if (test_bit(0, &ctx->check_cq_overflow))
f39c8a5b
PB
2379 __io_cqring_overflow_flush(ctx, false);
2380 if (io_cqring_events(ctx))
2381 goto out;
def596e9 2382 do {
500f9fba
JA
2383 /*
2384 * If a submit got punted to a workqueue, we can have the
2385 * application entering polling for a command before it gets
2386 * issued. That app will hold the uring_lock for the duration
2387 * of the poll right here, so we need to take a breather every
2388 * now and then to ensure that the issue has a chance to add
2389 * the poll to the issued list. Otherwise we can spin here
2390 * forever, while the workqueue is stuck trying to acquire the
2391 * very same mutex.
2392 */
e9979b36 2393 if (list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
2394 u32 tail = ctx->cached_cq_tail;
2395
500f9fba 2396 mutex_unlock(&ctx->uring_lock);
4c6e277c 2397 io_run_task_work();
500f9fba 2398 mutex_lock(&ctx->uring_lock);
def596e9 2399
8f487ef2
PB
2400 /* some requests don't go through iopoll_list */
2401 if (tail != ctx->cached_cq_tail ||
2402 list_empty(&ctx->iopoll_list))
e9979b36 2403 break;
500f9fba 2404 }
3c30ef0f 2405 ret = io_do_iopoll(ctx, &nr_events, min, true);
f39c8a5b
PB
2406 } while (!ret && nr_events < min && !need_resched());
2407out:
500f9fba 2408 mutex_unlock(&ctx->uring_lock);
def596e9
JA
2409 return ret;
2410}
2411
491381ce 2412static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2413{
491381ce
JA
2414 /*
2415 * Tell lockdep we inherited freeze protection from submission
2416 * thread.
2417 */
2418 if (req->flags & REQ_F_ISREG) {
1c98679d 2419 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 2420
1c98679d
PB
2421 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2422 sb_end_write(sb);
2b188cc1
JA
2423 }
2424}
2425
b63534c4 2426#ifdef CONFIG_BLOCK
dc2a6e9a 2427static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 2428{
ab454438 2429 struct io_async_rw *rw = req->async_data;
b63534c4 2430
ab454438
PB
2431 if (!rw)
2432 return !io_req_prep_async(req);
2433 /* may have left rw->iter inconsistent on -EIOCBQUEUED */
2434 iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
2435 return true;
b63534c4 2436}
b63534c4 2437
3e6a0d3c 2438static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2439{
355afaeb 2440 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2441 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2442
355afaeb
JA
2443 if (!S_ISBLK(mode) && !S_ISREG(mode))
2444 return false;
3e6a0d3c
JA
2445 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2446 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2447 return false;
7c977a58
JA
2448 /*
2449 * If ref is dying, we might be running poll reap from the exit work.
2450 * Don't attempt to reissue from that path, just let it fail with
2451 * -EAGAIN.
2452 */
3e6a0d3c
JA
2453 if (percpu_ref_is_dying(&ctx->refs))
2454 return false;
ef046888
JA
2455 /*
 2456 * Play it safe and assume it's not safe to re-import and reissue if we're
2457 * not in the original thread group (or in task context).
2458 */
2459 if (!same_thread_group(req->task, current) || !in_task())
2460 return false;
3e6a0d3c
JA
2461 return true;
2462}
e82ad485 2463#else
a1ff1e3f 2464static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
2465{
2466 return false;
2467}
e82ad485 2468static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 2469{
b63534c4
JA
2470 return false;
2471}
3e6a0d3c 2472#endif
b63534c4 2473
9011bf9a
PB
2474static void io_fallback_req_func(struct work_struct *work)
2475{
2476 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
2477 fallback_work.work);
2478 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
2479 struct io_kiocb *req, *tmp;
2480
9cb0073b 2481 percpu_ref_get(&ctx->refs);
5b0a6acc
PB
2482 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
2483 req->io_task_work.func(req);
9cb0073b 2484 percpu_ref_put(&ctx->refs);
9011bf9a
PB
2485}
2486
a1d7c393 2487static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
889fca73 2488 unsigned int issue_flags)
a1d7c393 2489{
2f8e45f1
PB
2490 int cflags = 0;
2491
b65c128f
PB
2492 if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2493 kiocb_end_write(req);
9532b99b
PB
2494 if (res != req->result) {
2495 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2496 io_rw_should_reissue(req)) {
2497 req->flags |= REQ_F_REISSUE;
2498 return;
2499 }
93d2bcd2 2500 req_set_fail(req);
9532b99b 2501 }
2f8e45f1
PB
2502 if (req->flags & REQ_F_BUFFER_SELECTED)
2503 cflags = io_put_rw_kbuf(req);
2504 __io_req_complete(req, issue_flags, res, cflags);
ba816ad6
JA
2505}
2506
2507static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2508{
9adbd45d 2509 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 2510
889fca73 2511 __io_complete_rw(req, res, res2, 0);
2b188cc1
JA
2512}
2513
def596e9
JA
2514static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2515{
9adbd45d 2516 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 2517
491381ce
JA
2518 if (kiocb->ki_flags & IOCB_WRITE)
2519 kiocb_end_write(req);
9532b99b 2520 if (unlikely(res != req->result)) {
a1ff1e3f
JA
2521 if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
2522 io_resubmit_prep(req))) {
93d2bcd2 2523 req_set_fail(req);
9532b99b
PB
2524 req->flags |= REQ_F_DONT_REISSUE;
2525 }
8c130827 2526 }
bbde017a
XW
2527
2528 WRITE_ONCE(req->result, res);
b9b0e0d3 2529 /* order with io_iopoll_complete() checking ->result */
cd664b0e
PB
2530 smp_wmb();
2531 WRITE_ONCE(req->iopoll_completed, 1);
def596e9
JA
2532}
2533
2534/*
2535 * After the iocb has been issued, it's safe to be found on the poll list.
2536 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 2537 * find it from an io_do_iopoll() thread before the issuer is done
def596e9
JA
2538 * accessing the kiocb cookie.
2539 */
cb3d8972 2540static void io_iopoll_req_issued(struct io_kiocb *req)
def596e9
JA
2541{
2542 struct io_ring_ctx *ctx = req->ctx;
cb3d8972
PB
2543 const bool in_async = io_wq_current_is_worker();
2544
2545 /* workqueue context doesn't hold uring_lock, grab it now */
2546 if (unlikely(in_async))
2547 mutex_lock(&ctx->uring_lock);
def596e9
JA
2548
2549 /*
2550 * Track whether we have multiple files in our lists. This will impact
2551 * how we do polling eventually, not spinning if we're on potentially
2552 * different devices.
2553 */
540e32a0 2554 if (list_empty(&ctx->iopoll_list)) {
915b3dde
HX
2555 ctx->poll_multi_queue = false;
2556 } else if (!ctx->poll_multi_queue) {
def596e9 2557 struct io_kiocb *list_req;
915b3dde 2558 unsigned int queue_num0, queue_num1;
def596e9 2559
540e32a0 2560 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
d21ffe7e 2561 inflight_entry);
915b3dde
HX
2562
2563 if (list_req->file != req->file) {
2564 ctx->poll_multi_queue = true;
2565 } else {
2566 queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
2567 queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
2568 if (queue_num0 != queue_num1)
2569 ctx->poll_multi_queue = true;
2570 }
def596e9
JA
2571 }
2572
2573 /*
2574 * For fast devices, IO may have already completed. If it has, add
2575 * it to the front so we find it first.
2576 */
65a6543d 2577 if (READ_ONCE(req->iopoll_completed))
d21ffe7e 2578 list_add(&req->inflight_entry, &ctx->iopoll_list);
def596e9 2579 else
d21ffe7e 2580 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
bdcd3eab 2581
cb3d8972
PB
2582 if (unlikely(in_async)) {
2583 /*
 2584 * If IORING_SETUP_SQPOLL is enabled, sqes are handled either
 2585 * in sq thread task context or in io worker task context. If
 2586 * the current task context is the sq thread, we don't need to check
 2587 * whether we should wake up the sq thread.
2588 */
2589 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2590 wq_has_sleeper(&ctx->sq_data->wait))
2591 wake_up(&ctx->sq_data->wait);
2592
2593 mutex_unlock(&ctx->uring_lock);
2594 }
def596e9
JA
2595}
2596
9f13c35b
PB
2597static inline void io_state_file_put(struct io_submit_state *state)
2598{
02b23a9a
PB
2599 if (state->file_refs) {
2600 fput_many(state->file, state->file_refs);
2601 state->file_refs = 0;
2602 }
9a56a232
JA
2603}
2604
2605/*
2606 * Get as many references to a file as we have IOs left in this submission,
2607 * assuming most submissions are for one file, or at least that each file
2608 * has more than one submission.
2609 */
8da11c19 2610static struct file *__io_file_get(struct io_submit_state *state, int fd)
9a56a232
JA
2611{
2612 if (!state)
2613 return fget(fd);
2614
6e1271e6 2615 if (state->file_refs) {
9a56a232 2616 if (state->fd == fd) {
6e1271e6 2617 state->file_refs--;
9a56a232
JA
2618 return state->file;
2619 }
02b23a9a 2620 io_state_file_put(state);
9a56a232
JA
2621 }
2622 state->file = fget_many(fd, state->ios_left);
6e1271e6 2623 if (unlikely(!state->file))
9a56a232
JA
2624 return NULL;
2625
2626 state->fd = fd;
6e1271e6 2627 state->file_refs = state->ios_left - 1;
9a56a232
JA
2628 return state->file;
2629}
2630
4503b767
JA
2631static bool io_bdev_nowait(struct block_device *bdev)
2632{
9ba0d0c8 2633 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
2634}
2635
2b188cc1
JA
2636/*
2637 * If we tracked the file through the SCM inflight mechanism, we could support
2638 * any file. For now, just ensure that anything potentially problematic is done
2639 * inline.
2640 */
b191e2df 2641static bool __io_file_supports_nowait(struct file *file, int rw)
2b188cc1
JA
2642{
2643 umode_t mode = file_inode(file)->i_mode;
2644
4503b767 2645 if (S_ISBLK(mode)) {
4e7b5671
CH
2646 if (IS_ENABLED(CONFIG_BLOCK) &&
2647 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
2648 return true;
2649 return false;
2650 }
976517f1 2651 if (S_ISSOCK(mode))
2b188cc1 2652 return true;
4503b767 2653 if (S_ISREG(mode)) {
4e7b5671
CH
2654 if (IS_ENABLED(CONFIG_BLOCK) &&
2655 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
2656 file->f_op != &io_uring_fops)
2657 return true;
2658 return false;
2659 }
2b188cc1 2660
c5b85625
JA
2661 /* any ->read/write should understand O_NONBLOCK */
2662 if (file->f_flags & O_NONBLOCK)
2663 return true;
2664
af197f50
JA
2665 if (!(file->f_mode & FMODE_NOWAIT))
2666 return false;
2667
2668 if (rw == READ)
2669 return file->f_op->read_iter != NULL;
2670
2671 return file->f_op->write_iter != NULL;
2b188cc1
JA
2672}
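/*
 * Summary of the rules above: block devices qualify if the underlying queue
 * supports nowait, sockets always do, regular files need a nowait-capable
 * bdev and must not be io_uring files themselves, and anything opened
 * O_NONBLOCK is accepted. Everything else must advertise FMODE_NOWAIT and
 * provide ->read_iter()/->write_iter() for the given direction.
 */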
2673
b191e2df 2674static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
7b29f92d 2675{
b191e2df 2676 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
7b29f92d 2677 return true;
b191e2df 2678 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
7b29f92d
JA
2679 return true;
2680
b191e2df 2681 return __io_file_supports_nowait(req->file, rw);
7b29f92d
JA
2682}
2683
a88fc400 2684static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 2685{
def596e9 2686 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 2687 struct kiocb *kiocb = &req->rw.kiocb;
75c668cd 2688 struct file *file = req->file;
09bb8394
JA
2689 unsigned ioprio;
2690 int ret;
2b188cc1 2691
c97d8a0f 2692 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
491381ce
JA
2693 req->flags |= REQ_F_ISREG;
2694
2b188cc1 2695 kiocb->ki_pos = READ_ONCE(sqe->off);
75c668cd 2696 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
ba04291e 2697 req->flags |= REQ_F_CUR_POS;
75c668cd 2698 kiocb->ki_pos = file->f_pos;
ba04291e 2699 }
2b188cc1 2700 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
2701 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2702 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2703 if (unlikely(ret))
2704 return ret;
2b188cc1 2705
75c668cd
PB
2706 /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2707 if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2708 req->flags |= REQ_F_NOWAIT;
2709
2b188cc1
JA
2710 ioprio = READ_ONCE(sqe->ioprio);
2711 if (ioprio) {
2712 ret = ioprio_check_cap(ioprio);
2713 if (ret)
09bb8394 2714 return ret;
2b188cc1
JA
2715
2716 kiocb->ki_ioprio = ioprio;
2717 } else
2718 kiocb->ki_ioprio = get_current_ioprio();
2719
def596e9 2720 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2721 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2722 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2723 return -EOPNOTSUPP;
2b188cc1 2724
def596e9
JA
2725 kiocb->ki_flags |= IOCB_HIPRI;
2726 kiocb->ki_complete = io_complete_rw_iopoll;
65a6543d 2727 req->iopoll_completed = 0;
def596e9 2728 } else {
09bb8394
JA
2729 if (kiocb->ki_flags & IOCB_HIPRI)
2730 return -EINVAL;
def596e9
JA
2731 kiocb->ki_complete = io_complete_rw;
2732 }
9adbd45d 2733
eae071c9
PB
2734 if (req->opcode == IORING_OP_READ_FIXED ||
2735 req->opcode == IORING_OP_WRITE_FIXED) {
2736 req->imu = NULL;
2737 io_req_set_rsrc_node(req);
2738 }
2739
3529d8c2
JA
2740 req->rw.addr = READ_ONCE(sqe->addr);
2741 req->rw.len = READ_ONCE(sqe->len);
4f4eeba8 2742 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 2743 return 0;
2b188cc1
JA
2744}
2745
2746static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2747{
2748 switch (ret) {
2749 case -EIOCBQUEUED:
2750 break;
2751 case -ERESTARTSYS:
2752 case -ERESTARTNOINTR:
2753 case -ERESTARTNOHAND:
2754 case -ERESTART_RESTARTBLOCK:
2755 /*
2756 * We can't just restart the syscall, since previously
2757 * submitted sqes may already be in progress. Just fail this
2758 * IO with EINTR.
2759 */
2760 ret = -EINTR;
df561f66 2761 fallthrough;
2b188cc1
JA
2762 default:
2763 kiocb->ki_complete(kiocb, ret, 0);
2764 }
2765}
2766
a1d7c393 2767static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
889fca73 2768 unsigned int issue_flags)
ba816ad6 2769{
ba04291e 2770 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
e8c2bc1f 2771 struct io_async_rw *io = req->async_data;
97284637 2772 bool check_reissue = kiocb->ki_complete == io_complete_rw;
ba04291e 2773
227c0c96 2774 /* add previously done IO, if any */
e8c2bc1f 2775 if (io && io->bytes_done > 0) {
227c0c96 2776 if (ret < 0)
e8c2bc1f 2777 ret = io->bytes_done;
227c0c96 2778 else
e8c2bc1f 2779 ret += io->bytes_done;
227c0c96
JA
2780 }
2781
ba04291e
JA
2782 if (req->flags & REQ_F_CUR_POS)
2783 req->file->f_pos = kiocb->ki_pos;
e149bd74 2784 if (ret >= 0 && check_reissue)
889fca73 2785 __io_complete_rw(req, ret, 0, issue_flags);
ba816ad6
JA
2786 else
2787 io_rw_done(kiocb, ret);
97284637 2788
fe7e3257 2789 if (check_reissue && (req->flags & REQ_F_REISSUE)) {
97284637 2790 req->flags &= ~REQ_F_REISSUE;
a7be7c23 2791 if (io_resubmit_prep(req)) {
8c130827 2792 req_ref_get(req);
773af691 2793 io_req_task_queue_reissue(req);
8c130827 2794 } else {
97284637
PB
2795 int cflags = 0;
2796
93d2bcd2 2797 req_set_fail(req);
97284637
PB
2798 if (req->flags & REQ_F_BUFFER_SELECTED)
2799 cflags = io_put_rw_kbuf(req);
2800 __io_req_complete(req, issue_flags, ret, cflags);
2801 }
2802 }
ba816ad6
JA
2803}
2804
eae071c9
PB
2805static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2806 struct io_mapped_ubuf *imu)
edafccee 2807{
9adbd45d 2808 size_t len = req->rw.len;
75769e3f 2809 u64 buf_end, buf_addr = req->rw.addr;
edafccee 2810 size_t offset;
edafccee 2811
75769e3f 2812 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
2813 return -EFAULT;
2814 /* not inside the mapped region */
4751f53d 2815 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
2816 return -EFAULT;
2817
2818 /*
2819 * May not be a start of buffer, set size appropriately
2820 * and advance us to the beginning.
2821 */
2822 offset = buf_addr - imu->ubuf;
2823 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2824
2825 if (offset) {
2826 /*
2827 * Don't use iov_iter_advance() here, as it's really slow for
2828 * using the latter parts of a big fixed buffer - it iterates
2829 * over each segment manually. We can cheat a bit here, because
2830 * we know that:
2831 *
2832 * 1) it's a BVEC iter, we set it up
2833 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2834 * first and last bvec
2835 *
2836 * So just find our index, and adjust the iterator afterwards.
2837 * If the offset is within the first bvec (or the whole first
 2838 * bvec), just use iov_iter_advance(). This makes it easier
2839 * since we can just skip the first segment, which may not
2840 * be PAGE_SIZE aligned.
2841 */
2842 const struct bio_vec *bvec = imu->bvec;
2843
2844 if (offset <= bvec->bv_len) {
2845 iov_iter_advance(iter, offset);
2846 } else {
2847 unsigned long seg_skip;
2848
2849 /* skip first vec */
2850 offset -= bvec->bv_len;
2851 seg_skip = 1 + (offset >> PAGE_SHIFT);
2852
2853 iter->bvec = bvec + seg_skip;
2854 iter->nr_segs -= seg_skip;
99c79f66 2855 iter->count -= bvec->bv_len + offset;
bd11b3a3 2856 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2857 }
2858 }
2859
847595de 2860 return 0;
edafccee
JA
2861}
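/*
 * Worked example of the seg_skip math, assuming a page-aligned registered
 * buffer and 4K pages: with offset == 10000 and bvec->bv_len == 4096 we
 * take the 'else' branch, offset becomes 5904, seg_skip = 1 + (5904 >> 12)
 * = 2, and iov_offset = 5904 & ~PAGE_MASK = 1808. Skipping two full
 * 4096-byte segments plus 1808 bytes lands exactly 10000 bytes into the
 * mapping, as required.
 */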
2862
eae071c9
PB
2863static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2864{
2865 struct io_ring_ctx *ctx = req->ctx;
2866 struct io_mapped_ubuf *imu = req->imu;
2867 u16 index, buf_index = req->buf_index;
2868
2869 if (likely(!imu)) {
2870 if (unlikely(buf_index >= ctx->nr_user_bufs))
2871 return -EFAULT;
2872 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2873 imu = READ_ONCE(ctx->user_bufs[index]);
2874 req->imu = imu;
2875 }
2876 return __io_import_fixed(req, rw, iter, imu);
2877}
2878
bcda7baa
JA
2879static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2880{
2881 if (needs_lock)
2882 mutex_unlock(&ctx->uring_lock);
2883}
2884
2885static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2886{
2887 /*
2888 * "Normal" inline submissions always hold the uring_lock, since we
2889 * grab it from the system call. Same is true for the SQPOLL offload.
2890 * The only exception is when we've detached the request and issue it
2891 * from an async worker thread, grab the lock for that case.
2892 */
2893 if (needs_lock)
2894 mutex_lock(&ctx->uring_lock);
2895}
2896
2897static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2898 int bgid, struct io_buffer *kbuf,
2899 bool needs_lock)
2900{
2901 struct io_buffer *head;
2902
2903 if (req->flags & REQ_F_BUFFER_SELECTED)
2904 return kbuf;
2905
2906 io_ring_submit_lock(req->ctx, needs_lock);
2907
2908 lockdep_assert_held(&req->ctx->uring_lock);
2909
9e15c3a0 2910 head = xa_load(&req->ctx->io_buffers, bgid);
bcda7baa
JA
2911 if (head) {
2912 if (!list_empty(&head->list)) {
2913 kbuf = list_last_entry(&head->list, struct io_buffer,
2914 list);
2915 list_del(&kbuf->list);
2916 } else {
2917 kbuf = head;
9e15c3a0 2918 xa_erase(&req->ctx->io_buffers, bgid);
bcda7baa
JA
2919 }
2920 if (*len > kbuf->len)
2921 *len = kbuf->len;
2922 } else {
2923 kbuf = ERR_PTR(-ENOBUFS);
2924 }
2925
2926 io_ring_submit_unlock(req->ctx, needs_lock);
2927
2928 return kbuf;
2929}
2930
4d954c25
JA
2931static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2932 bool needs_lock)
2933{
2934 struct io_buffer *kbuf;
4f4eeba8 2935 u16 bgid;
4d954c25
JA
2936
2937 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
4f4eeba8 2938 bgid = req->buf_index;
4d954c25
JA
2939 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2940 if (IS_ERR(kbuf))
2941 return kbuf;
2942 req->rw.addr = (u64) (unsigned long) kbuf;
2943 req->flags |= REQ_F_BUFFER_SELECTED;
2944 return u64_to_user_ptr(kbuf->addr);
2945}
2946
2947#ifdef CONFIG_COMPAT
2948static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2949 bool needs_lock)
2950{
2951 struct compat_iovec __user *uiov;
2952 compat_ssize_t clen;
2953 void __user *buf;
2954 ssize_t len;
2955
2956 uiov = u64_to_user_ptr(req->rw.addr);
2957 if (!access_ok(uiov, sizeof(*uiov)))
2958 return -EFAULT;
2959 if (__get_user(clen, &uiov->iov_len))
2960 return -EFAULT;
2961 if (clen < 0)
2962 return -EINVAL;
2963
2964 len = clen;
2965 buf = io_rw_buffer_select(req, &len, needs_lock);
2966 if (IS_ERR(buf))
2967 return PTR_ERR(buf);
2968 iov[0].iov_base = buf;
2969 iov[0].iov_len = (compat_size_t) len;
2970 return 0;
2971}
2972#endif
2973
2974static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2975 bool needs_lock)
2976{
2977 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2978 void __user *buf;
2979 ssize_t len;
2980
2981 if (copy_from_user(iov, uiov, sizeof(*uiov)))
2982 return -EFAULT;
2983
2984 len = iov[0].iov_len;
2985 if (len < 0)
2986 return -EINVAL;
2987 buf = io_rw_buffer_select(req, &len, needs_lock);
2988 if (IS_ERR(buf))
2989 return PTR_ERR(buf);
2990 iov[0].iov_base = buf;
2991 iov[0].iov_len = len;
2992 return 0;
2993}
2994
2995static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2996 bool needs_lock)
2997{
dddb3e26
JA
2998 if (req->flags & REQ_F_BUFFER_SELECTED) {
2999 struct io_buffer *kbuf;
3000
3001 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3002 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3003 iov[0].iov_len = kbuf->len;
4d954c25 3004 return 0;
dddb3e26 3005 }
dd201662 3006 if (req->rw.len != 1)
4d954c25
JA
3007 return -EINVAL;
3008
3009#ifdef CONFIG_COMPAT
3010 if (req->ctx->compat)
3011 return io_compat_import(req, iov, needs_lock);
3012#endif
3013
3014 return __io_iov_buffer_select(req, iov, needs_lock);
3015}
3016
847595de
PB
3017static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3018 struct iov_iter *iter, bool needs_lock)
2b188cc1 3019{
9adbd45d
JA
3020 void __user *buf = u64_to_user_ptr(req->rw.addr);
3021 size_t sqe_len = req->rw.len;
847595de 3022 u8 opcode = req->opcode;
4d954c25 3023 ssize_t ret;
edafccee 3024
7d009165 3025 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 3026 *iovec = NULL;
9adbd45d 3027 return io_import_fixed(req, rw, iter);
edafccee 3028 }
2b188cc1 3029
bcda7baa 3030 /* buffer index only valid with fixed read/write, or buffer select */
4f4eeba8 3031 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
3032 return -EINVAL;
3033
3a6820f2 3034 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3035 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25 3036 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
867a23ea 3037 if (IS_ERR(buf))
4d954c25 3038 return PTR_ERR(buf);
3f9d6441 3039 req->rw.len = sqe_len;
bcda7baa
JA
3040 }
3041
3a6820f2
JA
3042 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3043 *iovec = NULL;
10fc72e4 3044 return ret;
3a6820f2
JA
3045 }
3046
4d954c25
JA
3047 if (req->flags & REQ_F_BUFFER_SELECT) {
3048 ret = io_iov_buffer_select(req, *iovec, needs_lock);
847595de
PB
3049 if (!ret)
3050 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
4d954c25
JA
3051 *iovec = NULL;
3052 return ret;
3053 }
3054
89cd35c5
CH
3055 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3056 req->ctx->compat);
2b188cc1
JA
3057}
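/*
 * In short, the data pointer can come from three places: a registered
 * buffer for READ_FIXED/WRITE_FIXED (io_import_fixed()), a single
 * user-supplied or group-provided range for the non-vectored READ/WRITE
 * opcodes, or a full iovec array copied in via __import_iovec() for the
 * readv/writev style opcodes.
 */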
3058
0fef9483
JA
3059static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3060{
5b09e37e 3061 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3062}
3063
31b51510 3064/*
32960613
JA
3065 * For files that don't have ->read_iter() and ->write_iter(), handle them
3066 * by looping over ->read() or ->write() manually.
31b51510 3067 */
4017eb91 3068static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3069{
4017eb91
JA
3070 struct kiocb *kiocb = &req->rw.kiocb;
3071 struct file *file = req->file;
32960613
JA
3072 ssize_t ret = 0;
3073
3074 /*
3075 * Don't support polled IO through this interface, and we can't
3076 * support non-blocking either. For the latter, this just causes
3077 * the kiocb to be handled from an async context.
3078 */
3079 if (kiocb->ki_flags & IOCB_HIPRI)
3080 return -EOPNOTSUPP;
3081 if (kiocb->ki_flags & IOCB_NOWAIT)
3082 return -EAGAIN;
3083
3084 while (iov_iter_count(iter)) {
311ae9e1 3085 struct iovec iovec;
32960613
JA
3086 ssize_t nr;
3087
311ae9e1
PB
3088 if (!iov_iter_is_bvec(iter)) {
3089 iovec = iov_iter_iovec(iter);
3090 } else {
4017eb91
JA
3091 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3092 iovec.iov_len = req->rw.len;
311ae9e1
PB
3093 }
3094
32960613
JA
3095 if (rw == READ) {
3096 nr = file->f_op->read(file, iovec.iov_base,
0fef9483 3097 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3098 } else {
3099 nr = file->f_op->write(file, iovec.iov_base,
0fef9483 3100 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3101 }
3102
3103 if (nr < 0) {
3104 if (!ret)
3105 ret = nr;
3106 break;
3107 }
3108 ret += nr;
3109 if (nr != iovec.iov_len)
3110 break;
4017eb91
JA
3111 req->rw.len -= nr;
3112 req->rw.addr += nr;
32960613
JA
3113 iov_iter_advance(iter, nr);
3114 }
3115
3116 return ret;
3117}
3118
ff6165b2
JA
3119static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3120 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3121{
e8c2bc1f 3122 struct io_async_rw *rw = req->async_data;
b64e3444 3123
ff6165b2 3124 memcpy(&rw->iter, iter, sizeof(*iter));
afb87658 3125 rw->free_iovec = iovec;
227c0c96 3126 rw->bytes_done = 0;
ff6165b2 3127 /* can only be fixed buffers, no need to do anything */
9c3a205c 3128 if (iov_iter_is_bvec(iter))
ff6165b2 3129 return;
b64e3444 3130 if (!iovec) {
ff6165b2
JA
3131 unsigned iov_off = 0;
3132
3133 rw->iter.iov = rw->fast_iov;
3134 if (iter->iov != fast_iov) {
3135 iov_off = iter->iov - fast_iov;
3136 rw->iter.iov += iov_off;
3137 }
3138 if (rw->fast_iov != fast_iov)
3139 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
45097dae 3140 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3141 } else {
3142 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3143 }
3144}
3145
6cb78689 3146static inline int io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3147{
e8c2bc1f
JA
3148 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3149 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3150 return req->async_data == NULL;
3d9932a8
XW
3151}
3152
ff6165b2
JA
3153static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3154 const struct iovec *fast_iov,
227c0c96 3155 struct iov_iter *iter, bool force)
b7bb4f7d 3156{
26f0505a 3157 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 3158 return 0;
e8c2bc1f 3159 if (!req->async_data) {
6cb78689 3160 if (io_alloc_async_data(req)) {
6bf985dc 3161 kfree(iovec);
5d204bcf 3162 return -ENOMEM;
6bf985dc 3163 }
b7bb4f7d 3164
ff6165b2 3165 io_req_map_rw(req, iovec, fast_iov, iter);
5d204bcf 3166 }
b7bb4f7d 3167 return 0;
f67676d1
JA
3168}
3169
73debe68 3170static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3171{
e8c2bc1f 3172 struct io_async_rw *iorw = req->async_data;
f4bff104 3173 struct iovec *iov = iorw->fast_iov;
847595de 3174 int ret;
c3e330a4 3175
2846c481 3176 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
c3e330a4
PB
3177 if (unlikely(ret < 0))
3178 return ret;
3179
ab0b196c
PB
3180 iorw->bytes_done = 0;
3181 iorw->free_iovec = iov;
3182 if (iov)
3183 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
3184 return 0;
3185}
3186
73debe68 3187static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3188{
3529d8c2
JA
3189 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3190 return -EBADF;
93642ef8 3191 return io_prep_rw(req, sqe);
f67676d1
JA
3192}
3193
c1dd91d1
JA
3194/*
3195 * This is our waitqueue callback handler, registered through lock_page_async()
3196 * when we initially tried to do the IO with the iocb armed our waitqueue.
3197 * This gets called when the page is unlocked, and we generally expect that to
3198 * happen when the page IO is completed and the page is now uptodate. This will
3199 * queue a task_work based retry of the operation, attempting to copy the data
3200 * again. If the latter fails because the page was NOT uptodate, then we will
3201 * do a thread based blocking retry of the operation. That's the unexpected
3202 * slow path.
3203 */
bcf5a063
JA
3204static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3205 int sync, void *arg)
3206{
3207 struct wait_page_queue *wpq;
3208 struct io_kiocb *req = wait->private;
bcf5a063 3209 struct wait_page_key *key = arg;
bcf5a063
JA
3210
3211 wpq = container_of(wait, struct wait_page_queue, wait);
3212
cdc8fcb4
LT
3213 if (!wake_page_match(wpq, key))
3214 return 0;
3215
c8d317aa 3216 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063
JA
3217 list_del_init(&wait->entry);
3218
bcf5a063 3219 /* submit ref gets dropped, acquire a new one */
de9b4cca 3220 req_ref_get(req);
921b9054 3221 io_req_task_queue(req);
bcf5a063
JA
3222 return 1;
3223}
3224
c1dd91d1
JA
3225/*
3226 * This controls whether a given IO request should be armed for async page
3227 * based retry. If we return false here, the request is handed to the async
3228 * worker threads for retry. If we're doing buffered reads on a regular file,
3229 * we prepare a private wait_page_queue entry and retry the operation. This
3230 * will either succeed because the page is now uptodate and unlocked, or it
3231 * will register a callback when the page is unlocked at IO completion. Through
3232 * that callback, io_uring uses task_work to setup a retry of the operation.
3233 * That retry will attempt the buffered read again. The retry will generally
3234 * succeed, or in rare cases where it fails, we then fall back to using the
3235 * async worker threads for a blocking retry.
3236 */
227c0c96 3237static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3238{
e8c2bc1f
JA
3239 struct io_async_rw *rw = req->async_data;
3240 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3241 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3242
bcf5a063
JA
3243 /* never retry for NOWAIT, we just complete with -EAGAIN */
3244 if (req->flags & REQ_F_NOWAIT)
3245 return false;
f67676d1 3246
227c0c96 3247 /* Only for buffered IO */
3b2a4439 3248 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3249 return false;
3b2a4439 3250
bcf5a063
JA
3251 /*
3252 * just use poll if we can, and don't attempt if the fs doesn't
3253 * support callback based unlocks
3254 */
3255 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3256 return false;
f67676d1 3257
3b2a4439
JA
3258 wait->wait.func = io_async_buf_func;
3259 wait->wait.private = req;
3260 wait->wait.flags = 0;
3261 INIT_LIST_HEAD(&wait->wait.entry);
3262 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3263 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3264 kiocb->ki_waitq = wait;
3b2a4439 3265 return true;
bcf5a063
JA
3266}
3267
aeab9506 3268static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063
JA
3269{
3270 if (req->file->f_op->read_iter)
3271 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3272 else if (req->file->f_op->read)
4017eb91 3273 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3274 else
3275 return -EINVAL;
f67676d1
JA
3276}
3277
889fca73 3278static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3279{
3280 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3281 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3282 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3283 struct io_async_rw *rw = req->async_data;
227c0c96 3284 ssize_t io_size, ret, ret2;
45d189c6 3285 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ff6165b2 3286
2846c481 3287 if (rw) {
e8c2bc1f 3288 iter = &rw->iter;
2846c481
PB
3289 iovec = NULL;
3290 } else {
3291 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3292 if (ret < 0)
3293 return ret;
3294 }
632546c4 3295 io_size = iov_iter_count(iter);
fa15bafb 3296 req->result = io_size;
2b188cc1 3297
fd6c2e4c
JA
3298 /* Ensure we clear previously set non-block flag */
3299 if (!force_nonblock)
29de5f6a 3300 kiocb->ki_flags &= ~IOCB_NOWAIT;
a88fc400
PB
3301 else
3302 kiocb->ki_flags |= IOCB_NOWAIT;
3303
24c74678 3304 /* If the file doesn't support async, just async punt */
b191e2df 3305 if (force_nonblock && !io_file_supports_nowait(req, READ)) {
6713e7a6 3306 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc 3307 return ret ?: -EAGAIN;
6713e7a6 3308 }
9e645e11 3309
632546c4 3310 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
5ea5dd45
PB
3311 if (unlikely(ret)) {
3312 kfree(iovec);
3313 return ret;
3314 }
2b188cc1 3315
227c0c96 3316 ret = io_iter_do_read(req, iter);
32960613 3317
230d50d4 3318 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 3319 req->flags &= ~REQ_F_REISSUE;
eefdf30f
JA
3320 /* IOPOLL retry should happen for io-wq threads */
3321 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3322 goto done;
75c668cd
PB
3323 /* no retry on NONBLOCK nor RWF_NOWAIT */
3324 if (req->flags & REQ_F_NOWAIT)
355afaeb 3325 goto done;
84216315 3326 /* some cases will consume bytes even on error returns */
632546c4 3327 iov_iter_revert(iter, io_size - iov_iter_count(iter));
f38c7e3a 3328 ret = 0;
230d50d4
JA
3329 } else if (ret == -EIOCBQUEUED) {
3330 goto out_free;
7335e3bf 3331 } else if (ret <= 0 || ret == io_size || !force_nonblock ||
75c668cd 3332 (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
7335e3bf 3333 /* read all, failed, already did sync or don't want to retry */
00d23d51 3334 goto done;
227c0c96
JA
3335 }
3336
227c0c96 3337 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc
PB
3338 if (ret2)
3339 return ret2;
3340
fe1cdd55 3341 iovec = NULL;
e8c2bc1f 3342 rw = req->async_data;
227c0c96 3343 /* now use our persistent iterator, if we aren't already */
e8c2bc1f 3344 iter = &rw->iter;
227c0c96 3345
b23df91b
PB
3346 do {
3347 io_size -= ret;
3348 rw->bytes_done += ret;
3349 /* if we can retry, do so with the callbacks armed */
3350 if (!io_rw_should_retry(req)) {
3351 kiocb->ki_flags &= ~IOCB_WAITQ;
3352 return -EAGAIN;
3353 }
3354
3355 /*
3356 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3357 * we get -EIOCBQUEUED, then we'll get a notification when the
3358 * desired page gets unlocked. We can also get a partial read
3359 * here, and if we do, then just retry at the new offset.
3360 */
3361 ret = io_iter_do_read(req, iter);
3362 if (ret == -EIOCBQUEUED)
3363 return 0;
227c0c96 3364 /* we got some bytes, but not all. retry. */
b5b0ecb7 3365 kiocb->ki_flags &= ~IOCB_WAITQ;
b23df91b 3366 } while (ret > 0 && ret < io_size);
227c0c96 3367done:
889fca73 3368 kiocb_done(kiocb, ret, issue_flags);
fe1cdd55
PB
3369out_free:
3370	/* it's faster to check here than to delegate to kfree() */
3371 if (iovec)
3372 kfree(iovec);
5ea5dd45 3373 return 0;
2b188cc1
JA
3374}
3375
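/*
 * The write path below mirrors io_read() above: on a nonblocking
 * (IO_URING_F_NONBLOCK) issue that cannot complete without blocking,
 * the iov_iter state is preserved in the persistent io_async_rw
 * (->async_data) via io_setup_async_rw() and -EAGAIN is returned so the
 * request can be punted to io-wq.  Buffered reads may additionally retry
 * inline with IOCB_WAITQ armed, looping on partial reads until io_size
 * bytes have been transferred.
 */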
73debe68 3376static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3377{
3529d8c2
JA
3378 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3379 return -EBADF;
93642ef8 3380 return io_prep_rw(req, sqe);
f67676d1
JA
3381}
3382
889fca73 3383static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3384{
3385 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3386 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3387 struct iov_iter __iter, *iter = &__iter;
e8c2bc1f 3388 struct io_async_rw *rw = req->async_data;
fa15bafb 3389 ssize_t ret, ret2, io_size;
45d189c6 3390 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
2b188cc1 3391
2846c481 3392 if (rw) {
e8c2bc1f 3393 iter = &rw->iter;
2846c481
PB
3394 iovec = NULL;
3395 } else {
3396 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3397 if (ret < 0)
3398 return ret;
3399 }
632546c4 3400 io_size = iov_iter_count(iter);
fa15bafb 3401 req->result = io_size;
2b188cc1 3402
fd6c2e4c
JA
3403 /* Ensure we clear previously set non-block flag */
3404 if (!force_nonblock)
a88fc400
PB
3405 kiocb->ki_flags &= ~IOCB_NOWAIT;
3406 else
3407 kiocb->ki_flags |= IOCB_NOWAIT;
fd6c2e4c 3408
24c74678 3409 /* If the file doesn't support async, just async punt */
b191e2df 3410 if (force_nonblock && !io_file_supports_nowait(req, WRITE))
f67676d1 3411 goto copy_iov;
31b51510 3412
10d59345
JA
3413	/* file path doesn't support NOWAIT for non-direct IO */
3414 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3415 (req->flags & REQ_F_ISREG))
f67676d1 3416 goto copy_iov;
31b51510 3417
632546c4 3418 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
fa15bafb
PB
3419 if (unlikely(ret))
3420 goto out_free;
4ed734b0 3421
fa15bafb
PB
3422 /*
3423 * Open-code file_start_write here to grab freeze protection,
3424 * which will be released by another thread in
3425 * io_complete_rw(). Fool lockdep by telling it the lock got
3426 * released so that it doesn't complain about the held lock when
3427 * we return to userspace.
3428 */
3429 if (req->flags & REQ_F_ISREG) {
8a3c84b6 3430 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
3431 __sb_writers_release(file_inode(req->file)->i_sb,
3432 SB_FREEZE_WRITE);
3433 }
3434 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 3435
fa15bafb 3436 if (req->file->f_op->write_iter)
ff6165b2 3437 ret2 = call_write_iter(req->file, kiocb, iter);
2dd2111d 3438 else if (req->file->f_op->write)
4017eb91 3439 ret2 = loop_rw_iter(WRITE, req, iter);
2dd2111d
GH
3440 else
3441 ret2 = -EINVAL;
4ed734b0 3442
6ad7f233
PB
3443 if (req->flags & REQ_F_REISSUE) {
3444 req->flags &= ~REQ_F_REISSUE;
230d50d4 3445 ret2 = -EAGAIN;
6ad7f233 3446 }
230d50d4 3447
fa15bafb
PB
3448 /*
3449 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3450 * retry them without IOCB_NOWAIT.
3451 */
3452 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3453 ret2 = -EAGAIN;
75c668cd
PB
3454 /* no retry on NONBLOCK nor RWF_NOWAIT */
3455 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 3456 goto done;
fa15bafb 3457 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f
JA
3458 /* IOPOLL retry should happen for io-wq threads */
3459 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3460 goto copy_iov;
355afaeb 3461done:
889fca73 3462 kiocb_done(kiocb, ret2, issue_flags);
fa15bafb 3463 } else {
f67676d1 3464copy_iov:
84216315 3465 /* some cases will consume bytes even on error returns */
632546c4 3466 iov_iter_revert(iter, io_size - iov_iter_count(iter));
227c0c96 3467 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
6bf985dc 3468 return ret ?: -EAGAIN;
2b188cc1 3469 }
31b51510 3470out_free:
f261c168 3471 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 3472 if (iovec)
6f2cc166 3473 kfree(iovec);
2b188cc1
JA
3474 return ret;
3475}
3476
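/*
 * IORING_OP_RENAMEAT: the prep below maps the SQE as old dirfd in ->fd,
 * old path in ->addr, new path in ->addr2, new dirfd in ->len and
 * renameat2() flags in ->rename_flags.  Execution always requires a
 * blocking context, so a nonblocking issue returns -EAGAIN.
 *
 * Illustrative SQE setup (a sketch, not taken from a real application):
 *
 *	sqe->opcode	  = IORING_OP_RENAMEAT;
 *	sqe->fd		  = AT_FDCWD;
 *	sqe->addr	  = (u64)(unsigned long) "old.txt";
 *	sqe->len	  = AT_FDCWD;
 *	sqe->addr2	  = (u64)(unsigned long) "new.txt";
 *	sqe->rename_flags = 0;
 */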
80a261fd
JA
3477static int io_renameat_prep(struct io_kiocb *req,
3478 const struct io_uring_sqe *sqe)
3479{
3480 struct io_rename *ren = &req->rename;
3481 const char __user *oldf, *newf;
3482
ed7eb259
JA
3483 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3484 return -EINVAL;
3485 if (sqe->ioprio || sqe->buf_index)
3486 return -EINVAL;
80a261fd
JA
3487 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3488 return -EBADF;
3489
3490 ren->old_dfd = READ_ONCE(sqe->fd);
3491 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3492 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3493 ren->new_dfd = READ_ONCE(sqe->len);
3494 ren->flags = READ_ONCE(sqe->rename_flags);
3495
3496 ren->oldpath = getname(oldf);
3497 if (IS_ERR(ren->oldpath))
3498 return PTR_ERR(ren->oldpath);
3499
3500 ren->newpath = getname(newf);
3501 if (IS_ERR(ren->newpath)) {
3502 putname(ren->oldpath);
3503 return PTR_ERR(ren->newpath);
3504 }
3505
3506 req->flags |= REQ_F_NEED_CLEANUP;
3507 return 0;
3508}
3509
45d189c6 3510static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
3511{
3512 struct io_rename *ren = &req->rename;
3513 int ret;
3514
45d189c6 3515 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
3516 return -EAGAIN;
3517
3518 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3519 ren->newpath, ren->flags);
3520
3521 req->flags &= ~REQ_F_NEED_CLEANUP;
3522 if (ret < 0)
93d2bcd2 3523 req_set_fail(req);
80a261fd
JA
3524 io_req_complete(req, ret);
3525 return 0;
3526}
3527
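/*
 * IORING_OP_UNLINKAT: ->fd carries the dirfd, ->addr the pathname and
 * ->unlink_flags the flags (only AT_REMOVEDIR is accepted).  With
 * AT_REMOVEDIR set the request runs do_rmdir(), otherwise do_unlinkat();
 * either way it needs a blocking context.
 */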
14a1143b
JA
3528static int io_unlinkat_prep(struct io_kiocb *req,
3529 const struct io_uring_sqe *sqe)
3530{
3531 struct io_unlink *un = &req->unlink;
3532 const char __user *fname;
3533
22634bc5
JA
3534 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3535 return -EINVAL;
3536 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3537 return -EINVAL;
14a1143b
JA
3538 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3539 return -EBADF;
3540
3541 un->dfd = READ_ONCE(sqe->fd);
3542
3543 un->flags = READ_ONCE(sqe->unlink_flags);
3544 if (un->flags & ~AT_REMOVEDIR)
3545 return -EINVAL;
3546
3547 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3548 un->filename = getname(fname);
3549 if (IS_ERR(un->filename))
3550 return PTR_ERR(un->filename);
3551
3552 req->flags |= REQ_F_NEED_CLEANUP;
3553 return 0;
3554}
3555
45d189c6 3556static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
3557{
3558 struct io_unlink *un = &req->unlink;
3559 int ret;
3560
45d189c6 3561 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
3562 return -EAGAIN;
3563
3564 if (un->flags & AT_REMOVEDIR)
3565 ret = do_rmdir(un->dfd, un->filename);
3566 else
3567 ret = do_unlinkat(un->dfd, un->filename);
3568
3569 req->flags &= ~REQ_F_NEED_CLEANUP;
3570 if (ret < 0)
93d2bcd2 3571 req_set_fail(req);
14a1143b
JA
3572 io_req_complete(req, ret);
3573 return 0;
3574}
3575
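/*
 * IORING_OP_SHUTDOWN: only ->fd (the socket) and ->len (the shutdown
 * 'how' argument) are used.  The op is compiled out to -EOPNOTSUPP
 * without CONFIG_NET and always runs from a blocking context.
 */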
36f4fa68
JA
3576static int io_shutdown_prep(struct io_kiocb *req,
3577 const struct io_uring_sqe *sqe)
3578{
3579#if defined(CONFIG_NET)
3580 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3581 return -EINVAL;
3582 if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3583 sqe->buf_index)
3584 return -EINVAL;
3585
3586 req->shutdown.how = READ_ONCE(sqe->len);
3587 return 0;
3588#else
3589 return -EOPNOTSUPP;
3590#endif
3591}
3592
45d189c6 3593static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
3594{
3595#if defined(CONFIG_NET)
3596 struct socket *sock;
3597 int ret;
3598
45d189c6 3599 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
3600 return -EAGAIN;
3601
48aba79b 3602 sock = sock_from_file(req->file);
36f4fa68 3603 if (unlikely(!sock))
48aba79b 3604 return -ENOTSOCK;
36f4fa68
JA
3605
3606 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 3607 if (ret < 0)
93d2bcd2 3608 req_set_fail(req);
36f4fa68
JA
3609 io_req_complete(req, ret);
3610 return 0;
3611#else
3612 return -EOPNOTSUPP;
3613#endif
3614}
3615
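/*
 * Splice/tee common prep: ->splice_fd_in names the input file (resolved
 * as a fixed file when SPLICE_F_FD_IN_FIXED is set), ->len gives the
 * byte count and ->splice_flags the flags.  io_tee() forbids offsets,
 * while io_splice() additionally reads ->splice_off_in and ->off as the
 * input and output offsets (-1 meaning "use the file position").
 */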
f2a8d5c7
PB
3616static int __io_splice_prep(struct io_kiocb *req,
3617 const struct io_uring_sqe *sqe)
7d67af2c 3618{
fe7e3257 3619 struct io_splice *sp = &req->splice;
7d67af2c 3620 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 3621
3232dd02
PB
3622 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3623 return -EINVAL;
7d67af2c
PB
3624
3625 sp->file_in = NULL;
7d67af2c
PB
3626 sp->len = READ_ONCE(sqe->len);
3627 sp->flags = READ_ONCE(sqe->splice_flags);
3628
3629 if (unlikely(sp->flags & ~valid_flags))
3630 return -EINVAL;
3631
ac177053
PB
3632 sp->file_in = io_file_get(req->ctx, NULL, req,
3633 READ_ONCE(sqe->splice_fd_in),
8371adf5
PB
3634 (sp->flags & SPLICE_F_FD_IN_FIXED));
3635 if (!sp->file_in)
3636 return -EBADF;
7d67af2c 3637 req->flags |= REQ_F_NEED_CLEANUP;
7d67af2c
PB
3638 return 0;
3639}
3640
f2a8d5c7
PB
3641static int io_tee_prep(struct io_kiocb *req,
3642 const struct io_uring_sqe *sqe)
3643{
3644 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3645 return -EINVAL;
3646 return __io_splice_prep(req, sqe);
3647}
3648
45d189c6 3649static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
3650{
3651 struct io_splice *sp = &req->splice;
3652 struct file *in = sp->file_in;
3653 struct file *out = sp->file_out;
3654 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3655 long ret = 0;
3656
45d189c6 3657 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7
PB
3658 return -EAGAIN;
3659 if (sp->len)
3660 ret = do_tee(in, out, sp->len, flags);
3661
e1d767f0
PB
3662 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3663 io_put_file(in);
f2a8d5c7
PB
3664 req->flags &= ~REQ_F_NEED_CLEANUP;
3665
f2a8d5c7 3666 if (ret != sp->len)
93d2bcd2 3667 req_set_fail(req);
e1e16097 3668 io_req_complete(req, ret);
f2a8d5c7
PB
3669 return 0;
3670}
3671
3672static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3673{
fe7e3257 3674 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
3675
3676 sp->off_in = READ_ONCE(sqe->splice_off_in);
3677 sp->off_out = READ_ONCE(sqe->off);
3678 return __io_splice_prep(req, sqe);
3679}
3680
45d189c6 3681static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
3682{
3683 struct io_splice *sp = &req->splice;
3684 struct file *in = sp->file_in;
3685 struct file *out = sp->file_out;
3686 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3687 loff_t *poff_in, *poff_out;
c9687426 3688 long ret = 0;
7d67af2c 3689
45d189c6 3690 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 3691 return -EAGAIN;
7d67af2c
PB
3692
3693 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3694 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 3695
948a7749 3696 if (sp->len)
c9687426 3697 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 3698
e1d767f0
PB
3699 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3700 io_put_file(in);
7d67af2c
PB
3701 req->flags &= ~REQ_F_NEED_CLEANUP;
3702
7d67af2c 3703 if (ret != sp->len)
93d2bcd2 3704 req_set_fail(req);
e1e16097 3705 io_req_complete(req, ret);
7d67af2c
PB
3706 return 0;
3707}
3708
2b188cc1
JA
3709/*
3710 * IORING_OP_NOP just posts a completion event, nothing else.
3711 */
889fca73 3712static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3713{
3714 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 3715
def596e9
JA
3716 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3717 return -EINVAL;
3718
889fca73 3719 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
3720 return 0;
3721}
3722
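/*
 * IORING_OP_FSYNC: ->fsync_flags may only contain IORING_FSYNC_DATASYNC,
 * and ->off/->len bound the range handed to vfs_fsync_range().  A zero
 * off + len covers the whole file (LLONG_MAX), and the op always needs
 * a blocking context.
 */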
1155c76a 3723static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 3724{
6b06314c 3725 struct io_ring_ctx *ctx = req->ctx;
c992fe29 3726
09bb8394
JA
3727 if (!req->file)
3728 return -EBADF;
c992fe29 3729
6b06314c 3730 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 3731 return -EINVAL;
edafccee 3732 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
3733 return -EINVAL;
3734
8ed8d3c3
JA
3735 req->sync.flags = READ_ONCE(sqe->fsync_flags);
3736 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3737 return -EINVAL;
3738
3739 req->sync.off = READ_ONCE(sqe->off);
3740 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
3741 return 0;
3742}
3743
45d189c6 3744static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 3745{
8ed8d3c3 3746 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
3747 int ret;
3748
ac45abc0 3749 /* fsync always requires a blocking context */
45d189c6 3750 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
3751 return -EAGAIN;
3752
9adbd45d 3753 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
3754 end > 0 ? end : LLONG_MAX,
3755 req->sync.flags & IORING_FSYNC_DATASYNC);
3756 if (ret < 0)
93d2bcd2 3757 req_set_fail(req);
e1e16097 3758 io_req_complete(req, ret);
c992fe29
CH
3759 return 0;
3760}
3761
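/*
 * IORING_OP_FALLOCATE: note the slightly unusual field mapping below --
 * the length travels in ->addr and the fallocate mode in ->len.
 *
 * Illustrative SQE layout (a sketch, not from a real application):
 *
 *	sqe->opcode = IORING_OP_FALLOCATE;
 *	sqe->fd     = target file;
 *	sqe->off    = offset;
 *	sqe->addr   = length;
 *	sqe->len    = mode (e.g. FALLOC_FL_KEEP_SIZE);
 */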
d63d1b5e
JA
3762static int io_fallocate_prep(struct io_kiocb *req,
3763 const struct io_uring_sqe *sqe)
3764{
3765 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3766 return -EINVAL;
3232dd02
PB
3767 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3768 return -EINVAL;
d63d1b5e
JA
3769
3770 req->sync.off = READ_ONCE(sqe->off);
3771 req->sync.len = READ_ONCE(sqe->addr);
3772 req->sync.mode = READ_ONCE(sqe->len);
3773 return 0;
3774}
3775
45d189c6 3776static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 3777{
ac45abc0
PB
3778 int ret;
3779
d63d1b5e 3780	/* fallocate always requires a blocking context */
45d189c6 3781 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 3782 return -EAGAIN;
ac45abc0
PB
3783 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3784 req->sync.len);
ac45abc0 3785 if (ret < 0)
93d2bcd2 3786 req_set_fail(req);
e1e16097 3787 io_req_complete(req, ret);
5d17b4a4
JA
3788 return 0;
3789}
3790
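/*
 * Openat/openat2 prep: openat builds open.how from ->open_flags and
 * ->len (the mode), openat2 copies a struct open_how from ->addr2/->len.
 * A nonblocking issue attempts the open with LOOKUP_CACHED and
 * O_NONBLOCK, and bails out early with -EAGAIN for O_TRUNC, O_CREAT or
 * O_TMPFILE since those would always end up returning -EAGAIN anyway.
 */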
ec65fea5 3791static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 3792{
f8748881 3793 const char __user *fname;
15b71abe 3794 int ret;
b7bb4f7d 3795
ec65fea5 3796 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 3797 return -EINVAL;
ec65fea5 3798 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 3799 return -EBADF;
03b1230c 3800
ec65fea5
PB
3801	/* open.how should already be initialised */
3802 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 3803 req->open.how.flags |= O_LARGEFILE;
3529d8c2 3804
25e72d10
PB
3805 req->open.dfd = READ_ONCE(sqe->fd);
3806 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 3807 req->open.filename = getname(fname);
15b71abe
JA
3808 if (IS_ERR(req->open.filename)) {
3809 ret = PTR_ERR(req->open.filename);
3810 req->open.filename = NULL;
3811 return ret;
3812 }
4022e7af 3813 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 3814 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 3815 return 0;
03b1230c
JA
3816}
3817
ec65fea5
PB
3818static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3819{
3820 u64 flags, mode;
3821
14587a46 3822 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 3823 return -EINVAL;
ec65fea5
PB
3824 mode = READ_ONCE(sqe->len);
3825 flags = READ_ONCE(sqe->open_flags);
3826 req->open.how = build_open_how(flags, mode);
3827 return __io_openat_prep(req, sqe);
3828}
3829
cebdb986 3830static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 3831{
cebdb986 3832 struct open_how __user *how;
cebdb986 3833 size_t len;
0fa03c62
JA
3834 int ret;
3835
14587a46 3836 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4eb8dded 3837 return -EINVAL;
cebdb986
JA
3838 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3839 len = READ_ONCE(sqe->len);
cebdb986
JA
3840 if (len < OPEN_HOW_SIZE_VER0)
3841 return -EINVAL;
3529d8c2 3842
cebdb986
JA
3843 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3844 len);
3845 if (ret)
3846 return ret;
3529d8c2 3847
ec65fea5 3848 return __io_openat_prep(req, sqe);
cebdb986
JA
3849}
3850
45d189c6 3851static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
3852{
3853 struct open_flags op;
15b71abe 3854 struct file *file;
3a81fd02
JA
3855 bool nonblock_set;
3856 bool resolve_nonblock;
15b71abe
JA
3857 int ret;
3858
cebdb986 3859 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
3860 if (ret)
3861 goto err;
3a81fd02
JA
3862 nonblock_set = op.open_flag & O_NONBLOCK;
3863 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 3864 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
3865 /*
3866 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3867	 * it'll always return -EAGAIN
3868 */
3869 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3870 return -EAGAIN;
3871 op.lookup_flags |= LOOKUP_CACHED;
3872 op.open_flag |= O_NONBLOCK;
3873 }
15b71abe 3874
4022e7af 3875 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
15b71abe
JA
3876 if (ret < 0)
3877 goto err;
3878
3879 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 3880 if (IS_ERR(file)) {
944d1444 3881 /*
12dcb58a
PB
3882	 * We could hang on to this 'fd' on retrying, but it seems like a
3883 * marginal gain for something that is now known to be a slower
3884 * path. So just put it, and we'll get a new one when we retry.
944d1444 3885 */
3a81fd02 3886 put_unused_fd(ret);
3a81fd02 3887
15b71abe 3888 ret = PTR_ERR(file);
12dcb58a
PB
3889 /* only retry if RESOLVE_CACHED wasn't already set by application */
3890 if (ret == -EAGAIN &&
3891 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
3892 return -EAGAIN;
3893 goto err;
15b71abe 3894 }
12dcb58a
PB
3895
3896 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3897 file->f_flags &= ~O_NONBLOCK;
3898 fsnotify_open(file);
3899 fd_install(ret, file);
15b71abe
JA
3900err:
3901 putname(req->open.filename);
8fef80bf 3902 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 3903 if (ret < 0)
93d2bcd2 3904 req_set_fail(req);
0bdf3398 3905 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
3906 return 0;
3907}
3908
45d189c6 3909static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 3910{
e45cff58 3911 return io_openat2(req, issue_flags);
cebdb986
JA
3912}
3913
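/*
 * Provided buffer groups live in ctx->io_buffers, an xarray keyed by
 * buffer group ID (bgid); the head io_buffer doubles as the list head.
 * IORING_OP_REMOVE_BUFFERS takes the count in ->fd and the group in
 * ->buf_group; IORING_OP_PROVIDE_BUFFERS additionally takes the base
 * address in ->addr, the per-buffer length in ->len and the starting
 * buffer ID in ->off.
 *
 * Illustrative PROVIDE_BUFFERS setup (a sketch, not from a real app):
 *
 *	sqe->opcode	= IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd		= nr_buffers;
 *	sqe->addr	= (u64)(unsigned long) base;
 *	sqe->len	= buffer_size;
 *	sqe->buf_group	= bgid;
 *	sqe->off	= first_buffer_id;
 */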
067524e9
JA
3914static int io_remove_buffers_prep(struct io_kiocb *req,
3915 const struct io_uring_sqe *sqe)
3916{
3917 struct io_provide_buf *p = &req->pbuf;
3918 u64 tmp;
3919
3920 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3921 return -EINVAL;
3922
3923 tmp = READ_ONCE(sqe->fd);
3924 if (!tmp || tmp > USHRT_MAX)
3925 return -EINVAL;
3926
3927 memset(p, 0, sizeof(*p));
3928 p->nbufs = tmp;
3929 p->bgid = READ_ONCE(sqe->buf_group);
3930 return 0;
3931}
3932
3933static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3934 int bgid, unsigned nbufs)
3935{
3936 unsigned i = 0;
3937
3938 /* shouldn't happen */
3939 if (!nbufs)
3940 return 0;
3941
3942 /* the head kbuf is the list itself */
3943 while (!list_empty(&buf->list)) {
3944 struct io_buffer *nxt;
3945
3946 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3947 list_del(&nxt->list);
3948 kfree(nxt);
3949 if (++i == nbufs)
3950 return i;
3951 }
3952 i++;
3953 kfree(buf);
9e15c3a0 3954 xa_erase(&ctx->io_buffers, bgid);
067524e9
JA
3955
3956 return i;
3957}
3958
889fca73 3959static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
3960{
3961 struct io_provide_buf *p = &req->pbuf;
3962 struct io_ring_ctx *ctx = req->ctx;
3963 struct io_buffer *head;
3964 int ret = 0;
45d189c6 3965 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
067524e9
JA
3966
3967 io_ring_submit_lock(ctx, !force_nonblock);
3968
3969 lockdep_assert_held(&ctx->uring_lock);
3970
3971 ret = -ENOENT;
9e15c3a0 3972 head = xa_load(&ctx->io_buffers, p->bgid);
067524e9
JA
3973 if (head)
3974 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
067524e9 3975 if (ret < 0)
93d2bcd2 3976 req_set_fail(req);
067524e9 3977
9fb8cb49
PB
3978 /* complete before unlock, IOPOLL may need the lock */
3979 __io_req_complete(req, issue_flags, ret, 0);
3980 io_ring_submit_unlock(ctx, !force_nonblock);
067524e9
JA
3981 return 0;
3982}
3983
ddf0322d
JA
3984static int io_provide_buffers_prep(struct io_kiocb *req,
3985 const struct io_uring_sqe *sqe)
3986{
38134ada 3987 unsigned long size, tmp_check;
ddf0322d
JA
3988 struct io_provide_buf *p = &req->pbuf;
3989 u64 tmp;
3990
3991 if (sqe->ioprio || sqe->rw_flags)
3992 return -EINVAL;
3993
3994 tmp = READ_ONCE(sqe->fd);
3995 if (!tmp || tmp > USHRT_MAX)
3996 return -E2BIG;
3997 p->nbufs = tmp;
3998 p->addr = READ_ONCE(sqe->addr);
3999 p->len = READ_ONCE(sqe->len);
4000
38134ada
PB
4001 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4002 &size))
4003 return -EOVERFLOW;
4004 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4005 return -EOVERFLOW;
4006
d81269fe
PB
4007 size = (unsigned long)p->len * p->nbufs;
4008 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
4009 return -EFAULT;
4010
4011 p->bgid = READ_ONCE(sqe->buf_group);
4012 tmp = READ_ONCE(sqe->off);
4013 if (tmp > USHRT_MAX)
4014 return -E2BIG;
4015 p->bid = tmp;
4016 return 0;
4017}
4018
4019static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4020{
4021 struct io_buffer *buf;
4022 u64 addr = pbuf->addr;
4023 int i, bid = pbuf->bid;
4024
4025 for (i = 0; i < pbuf->nbufs; i++) {
4026 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4027 if (!buf)
4028 break;
4029
4030 buf->addr = addr;
d1f82808 4031 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d
JA
4032 buf->bid = bid;
4033 addr += pbuf->len;
4034 bid++;
4035 if (!*head) {
4036 INIT_LIST_HEAD(&buf->list);
4037 *head = buf;
4038 } else {
4039 list_add_tail(&buf->list, &(*head)->list);
4040 }
4041 }
4042
4043 return i ? i : -ENOMEM;
4044}
4045
889fca73 4046static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4047{
4048 struct io_provide_buf *p = &req->pbuf;
4049 struct io_ring_ctx *ctx = req->ctx;
4050 struct io_buffer *head, *list;
4051 int ret = 0;
45d189c6 4052 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ddf0322d
JA
4053
4054 io_ring_submit_lock(ctx, !force_nonblock);
4055
4056 lockdep_assert_held(&ctx->uring_lock);
4057
9e15c3a0 4058 list = head = xa_load(&ctx->io_buffers, p->bgid);
ddf0322d
JA
4059
4060 ret = io_add_buffers(p, &head);
9e15c3a0
JA
4061 if (ret >= 0 && !list) {
4062 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4063 if (ret < 0)
067524e9 4064 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d 4065 }
ddf0322d 4066 if (ret < 0)
93d2bcd2 4067 req_set_fail(req);
9fb8cb49
PB
4068 /* complete before unlock, IOPOLL may need the lock */
4069 __io_req_complete(req, issue_flags, ret, 0);
4070 io_ring_submit_unlock(ctx, !force_nonblock);
ddf0322d 4071 return 0;
cebdb986
JA
4072}
4073
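/*
 * IORING_OP_EPOLL_CTL: ->fd is the epoll instance, ->len the epoll op,
 * ->off the target fd and ->addr the user epoll_event (copied at prep
 * time when the op takes one).  do_epoll_ctl() is tried nonblocking
 * first and the request is retried from a blocking context on -EAGAIN.
 */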
3e4827b0
JA
4074static int io_epoll_ctl_prep(struct io_kiocb *req,
4075 const struct io_uring_sqe *sqe)
4076{
4077#if defined(CONFIG_EPOLL)
4078 if (sqe->ioprio || sqe->buf_index)
4079 return -EINVAL;
2d74d042 4080 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4081 return -EINVAL;
3e4827b0
JA
4082
4083 req->epoll.epfd = READ_ONCE(sqe->fd);
4084 req->epoll.op = READ_ONCE(sqe->len);
4085 req->epoll.fd = READ_ONCE(sqe->off);
4086
4087 if (ep_op_has_event(req->epoll.op)) {
4088 struct epoll_event __user *ev;
4089
4090 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4091 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4092 return -EFAULT;
4093 }
4094
4095 return 0;
4096#else
4097 return -EOPNOTSUPP;
4098#endif
4099}
4100
889fca73 4101static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4102{
4103#if defined(CONFIG_EPOLL)
4104 struct io_epoll *ie = &req->epoll;
4105 int ret;
45d189c6 4106 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4107
4108 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4109 if (force_nonblock && ret == -EAGAIN)
4110 return -EAGAIN;
4111
4112 if (ret < 0)
93d2bcd2 4113 req_set_fail(req);
889fca73 4114 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4115 return 0;
4116#else
4117 return -EOPNOTSUPP;
4118#endif
4119}
4120
c1ca757b
JA
4121static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4122{
4123#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4124 if (sqe->ioprio || sqe->buf_index || sqe->off)
4125 return -EINVAL;
3232dd02
PB
4126 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4127 return -EINVAL;
c1ca757b
JA
4128
4129 req->madvise.addr = READ_ONCE(sqe->addr);
4130 req->madvise.len = READ_ONCE(sqe->len);
4131 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4132 return 0;
4133#else
4134 return -EOPNOTSUPP;
4135#endif
4136}
4137
45d189c6 4138static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4139{
4140#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4141 struct io_madvise *ma = &req->madvise;
4142 int ret;
4143
45d189c6 4144 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4145 return -EAGAIN;
4146
0726b01e 4147 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 4148 if (ret < 0)
93d2bcd2 4149 req_set_fail(req);
e1e16097 4150 io_req_complete(req, ret);
c1ca757b
JA
4151 return 0;
4152#else
4153 return -EOPNOTSUPP;
4154#endif
4155}
4156
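/*
 * IORING_OP_FADVISE: ->off/->len give the range and ->fadvise_advice the
 * advice.  Only POSIX_FADV_NORMAL, RANDOM and SEQUENTIAL are considered
 * cheap enough to run inline from the nonblocking path; anything else is
 * punted with -EAGAIN.
 */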
4840e418
JA
4157static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4158{
4159 if (sqe->ioprio || sqe->buf_index || sqe->addr)
4160 return -EINVAL;
3232dd02
PB
4161 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4162 return -EINVAL;
4840e418
JA
4163
4164 req->fadvise.offset = READ_ONCE(sqe->off);
4165 req->fadvise.len = READ_ONCE(sqe->len);
4166 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4167 return 0;
4168}
4169
45d189c6 4170static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
4171{
4172 struct io_fadvise *fa = &req->fadvise;
4173 int ret;
4174
45d189c6 4175 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
4176 switch (fa->advice) {
4177 case POSIX_FADV_NORMAL:
4178 case POSIX_FADV_RANDOM:
4179 case POSIX_FADV_SEQUENTIAL:
4180 break;
4181 default:
4182 return -EAGAIN;
4183 }
4184 }
4840e418
JA
4185
4186 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4187 if (ret < 0)
93d2bcd2 4188 req_set_fail(req);
0bdf3398 4189 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
4190 return 0;
4191}
4192
eddc7ef5
JA
4193static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4194{
2d74d042 4195 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4196 return -EINVAL;
eddc7ef5
JA
4197 if (sqe->ioprio || sqe->buf_index)
4198 return -EINVAL;
9c280f90 4199 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4200 return -EBADF;
eddc7ef5 4201
1d9e1288
BM
4202 req->statx.dfd = READ_ONCE(sqe->fd);
4203 req->statx.mask = READ_ONCE(sqe->len);
e62753e4 4204 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
4205 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4206 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5
JA
4207
4208 return 0;
4209}
4210
45d189c6 4211static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 4212{
1d9e1288 4213 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
4214 int ret;
4215
59d70013 4216 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
4217 return -EAGAIN;
4218
e62753e4
BM
4219 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4220 ctx->buffer);
eddc7ef5 4221
eddc7ef5 4222 if (ret < 0)
93d2bcd2 4223 req_set_fail(req);
e1e16097 4224 io_req_complete(req, ret);
eddc7ef5
JA
4225 return 0;
4226}
4227
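/*
 * IORING_OP_CLOSE: only ->fd is used, and fixed files cannot be closed
 * this way.  If the file has a ->flush() method the close is punted to
 * a blocking context, since ->flush() may block.
 */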
b5dba59e
JA
4228static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4229{
14587a46 4230 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4231 return -EINVAL;
b5dba59e
JA
4232 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4233 sqe->rw_flags || sqe->buf_index)
4234 return -EINVAL;
9c280f90 4235 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4236 return -EBADF;
b5dba59e
JA
4237
4238 req->close.fd = READ_ONCE(sqe->fd);
b5dba59e 4239 return 0;
b5dba59e
JA
4240}
4241
889fca73 4242static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 4243{
9eac1904 4244 struct files_struct *files = current->files;
3af73b28 4245 struct io_close *close = &req->close;
9eac1904 4246 struct fdtable *fdt;
a1fde923
PB
4247 struct file *file = NULL;
4248 int ret = -EBADF;
b5dba59e 4249
9eac1904
JA
4250 spin_lock(&files->file_lock);
4251 fdt = files_fdtable(files);
4252 if (close->fd >= fdt->max_fds) {
4253 spin_unlock(&files->file_lock);
4254 goto err;
4255 }
4256 file = fdt->fd[close->fd];
a1fde923 4257 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
4258 spin_unlock(&files->file_lock);
4259 file = NULL;
4260 goto err;
3af73b28 4261 }
b5dba59e
JA
4262
4263 /* if the file has a flush method, be safe and punt to async */
45d189c6 4264 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 4265 spin_unlock(&files->file_lock);
0bf0eefd 4266 return -EAGAIN;
a2100672 4267 }
b5dba59e 4268
9eac1904
JA
4269 ret = __close_fd_get_file(close->fd, &file);
4270 spin_unlock(&files->file_lock);
4271 if (ret < 0) {
4272 if (ret == -ENOENT)
4273 ret = -EBADF;
4274 goto err;
4275 }
4276
3af73b28 4277 /* No ->flush() or already async, safely close from here */
9eac1904
JA
4278 ret = filp_close(file, current->files);
4279err:
3af73b28 4280 if (ret < 0)
93d2bcd2 4281 req_set_fail(req);
9eac1904
JA
4282 if (file)
4283 fput(file);
889fca73 4284 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 4285 return 0;
b5dba59e
JA
4286}
4287
1155c76a 4288static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
4289{
4290 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 4291
5d17b4a4
JA
4292 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4293 return -EINVAL;
4294 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4295 return -EINVAL;
4296
8ed8d3c3
JA
4297 req->sync.off = READ_ONCE(sqe->off);
4298 req->sync.len = READ_ONCE(sqe->len);
4299 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
4300 return 0;
4301}
4302
45d189c6 4303static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4304{
8ed8d3c3
JA
4305 int ret;
4306
ac45abc0 4307 /* sync_file_range always requires a blocking context */
45d189c6 4308 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4309 return -EAGAIN;
4310
9adbd45d 4311 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
4312 req->sync.flags);
4313 if (ret < 0)
93d2bcd2 4314 req_set_fail(req);
e1e16097 4315 io_req_complete(req, ret);
5d17b4a4
JA
4316 return 0;
4317}
4318
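/*
 * Networking ops below.  io_setup_async_msg() copies the on-stack
 * io_async_msghdr into ->async_data when a send/recv can't complete
 * nonblocking, so the punted retry reuses the already-imported iovec
 * instead of copying it from userspace again.
 */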
469956e8 4319#if defined(CONFIG_NET)
02d27d89
PB
4320static int io_setup_async_msg(struct io_kiocb *req,
4321 struct io_async_msghdr *kmsg)
4322{
e8c2bc1f
JA
4323 struct io_async_msghdr *async_msg = req->async_data;
4324
4325 if (async_msg)
02d27d89 4326 return -EAGAIN;
e8c2bc1f 4327 if (io_alloc_async_data(req)) {
257e84a5 4328 kfree(kmsg->free_iov);
02d27d89
PB
4329 return -ENOMEM;
4330 }
e8c2bc1f 4331 async_msg = req->async_data;
02d27d89 4332 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 4333 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 4334 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
4335	/* if we're using fast_iov, set it to the new one */
4336 if (!async_msg->free_iov)
4337 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4338
02d27d89
PB
4339 return -EAGAIN;
4340}
4341
2ae523ed
PB
4342static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4343 struct io_async_msghdr *iomsg)
4344{
2ae523ed 4345 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 4346 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 4347 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 4348 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
4349}
4350
93642ef8
PB
4351static int io_sendmsg_prep_async(struct io_kiocb *req)
4352{
4353 int ret;
4354
93642ef8
PB
4355 ret = io_sendmsg_copy_hdr(req, req->async_data);
4356 if (!ret)
4357 req->flags |= REQ_F_NEED_CLEANUP;
4358 return ret;
4359}
4360
3529d8c2 4361static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 4362{
e47293fd 4363 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 4364
d2b6f48b
PB
4365 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4366 return -EINVAL;
4367
270a5940 4368 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 4369 sr->len = READ_ONCE(sqe->len);
04411806
PB
4370 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4371 if (sr->msg_flags & MSG_DONTWAIT)
4372 req->flags |= REQ_F_NOWAIT;
3529d8c2 4373
d8768362
JA
4374#ifdef CONFIG_COMPAT
4375 if (req->ctx->compat)
4376 sr->msg_flags |= MSG_CMSG_COMPAT;
4377#endif
93642ef8 4378 return 0;
03b1230c
JA
4379}
4380
889fca73 4381static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4382{
6b754c8b 4383 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 4384 struct socket *sock;
7a7cacba 4385 unsigned flags;
0031275d 4386 int min_ret = 0;
0fa03c62
JA
4387 int ret;
4388
dba4a925 4389 sock = sock_from_file(req->file);
7a7cacba 4390 if (unlikely(!sock))
dba4a925 4391 return -ENOTSOCK;
3529d8c2 4392
257e84a5
PB
4393 kmsg = req->async_data;
4394 if (!kmsg) {
7a7cacba
PB
4395 ret = io_sendmsg_copy_hdr(req, &iomsg);
4396 if (ret)
4397 return ret;
4398 kmsg = &iomsg;
0fa03c62 4399 }
0fa03c62 4400
04411806
PB
4401 flags = req->sr_msg.msg_flags;
4402 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4403 flags |= MSG_DONTWAIT;
0031275d
SM
4404 if (flags & MSG_WAITALL)
4405 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4406
7a7cacba 4407 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
45d189c6 4408 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4409 return io_setup_async_msg(req, kmsg);
4410 if (ret == -ERESTARTSYS)
4411 ret = -EINTR;
0fa03c62 4412
257e84a5
PB
4413 /* fast path, check for non-NULL to avoid function call */
4414 if (kmsg->free_iov)
4415 kfree(kmsg->free_iov);
99bc4c38 4416 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 4417 if (ret < min_ret)
93d2bcd2 4418 req_set_fail(req);
889fca73 4419 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 4420 return 0;
03b1230c 4421}
aa1fa28f 4422
889fca73 4423static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4424{
7a7cacba
PB
4425 struct io_sr_msg *sr = &req->sr_msg;
4426 struct msghdr msg;
4427 struct iovec iov;
fddaface 4428 struct socket *sock;
7a7cacba 4429 unsigned flags;
0031275d 4430 int min_ret = 0;
fddaface
JA
4431 int ret;
4432
dba4a925 4433 sock = sock_from_file(req->file);
7a7cacba 4434 if (unlikely(!sock))
dba4a925 4435 return -ENOTSOCK;
fddaface 4436
7a7cacba
PB
4437 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4438 if (unlikely(ret))
14db8411 4439 return ret;
fddaface 4440
7a7cacba
PB
4441 msg.msg_name = NULL;
4442 msg.msg_control = NULL;
4443 msg.msg_controllen = 0;
4444 msg.msg_namelen = 0;
fddaface 4445
04411806
PB
4446 flags = req->sr_msg.msg_flags;
4447 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4448 flags |= MSG_DONTWAIT;
0031275d
SM
4449 if (flags & MSG_WAITALL)
4450 min_ret = iov_iter_count(&msg.msg_iter);
4451
7a7cacba
PB
4452 msg.msg_flags = flags;
4453 ret = sock_sendmsg(sock, &msg);
45d189c6 4454 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4455 return -EAGAIN;
4456 if (ret == -ERESTARTSYS)
4457 ret = -EINTR;
fddaface 4458
0031275d 4459 if (ret < min_ret)
93d2bcd2 4460 req_set_fail(req);
889fca73 4461 __io_req_complete(req, issue_flags, ret, 0);
fddaface 4462 return 0;
fddaface
JA
4463}
4464
1400e697
PB
4465static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4466 struct io_async_msghdr *iomsg)
52de1fe1
JA
4467{
4468 struct io_sr_msg *sr = &req->sr_msg;
4469 struct iovec __user *uiov;
4470 size_t iov_len;
4471 int ret;
4472
1400e697
PB
4473 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4474 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
4475 if (ret)
4476 return ret;
4477
4478 if (req->flags & REQ_F_BUFFER_SELECT) {
4479 if (iov_len > 1)
4480 return -EINVAL;
5476dfed 4481 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 4482 return -EFAULT;
5476dfed 4483 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 4484 iomsg->free_iov = NULL;
52de1fe1 4485 } else {
257e84a5 4486 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4487 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 4488 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 4489 false);
52de1fe1
JA
4490 if (ret > 0)
4491 ret = 0;
4492 }
4493
4494 return ret;
4495}
4496
4497#ifdef CONFIG_COMPAT
4498static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 4499 struct io_async_msghdr *iomsg)
52de1fe1 4500{
52de1fe1
JA
4501 struct io_sr_msg *sr = &req->sr_msg;
4502 struct compat_iovec __user *uiov;
4503 compat_uptr_t ptr;
4504 compat_size_t len;
4505 int ret;
4506
4af3417a
PB
4507 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4508 &ptr, &len);
52de1fe1
JA
4509 if (ret)
4510 return ret;
4511
4512 uiov = compat_ptr(ptr);
4513 if (req->flags & REQ_F_BUFFER_SELECT) {
4514 compat_ssize_t clen;
4515
4516 if (len > 1)
4517 return -EINVAL;
4518 if (!access_ok(uiov, sizeof(*uiov)))
4519 return -EFAULT;
4520 if (__get_user(clen, &uiov->iov_len))
4521 return -EFAULT;
4522 if (clen < 0)
4523 return -EINVAL;
2d280bc8 4524 sr->len = clen;
257e84a5 4525 iomsg->free_iov = NULL;
52de1fe1 4526 } else {
257e84a5 4527 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4528 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 4529 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 4530 &iomsg->msg.msg_iter, true);
52de1fe1
JA
4531 if (ret < 0)
4532 return ret;
4533 }
4534
4535 return 0;
4536}
4537#endif
4538
1400e697
PB
4539static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4540 struct io_async_msghdr *iomsg)
52de1fe1 4541{
1400e697 4542 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
4543
4544#ifdef CONFIG_COMPAT
4545 if (req->ctx->compat)
1400e697 4546 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 4547#endif
52de1fe1 4548
1400e697 4549 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
4550}
4551
bcda7baa 4552static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
7fbb1b54 4553 bool needs_lock)
bcda7baa
JA
4554{
4555 struct io_sr_msg *sr = &req->sr_msg;
4556 struct io_buffer *kbuf;
4557
bcda7baa
JA
4558 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4559 if (IS_ERR(kbuf))
4560 return kbuf;
4561
4562 sr->kbuf = kbuf;
4563 req->flags |= REQ_F_BUFFER_SELECTED;
bcda7baa 4564 return kbuf;
fddaface
JA
4565}
4566
7fbb1b54
PB
4567static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4568{
4569 return io_put_kbuf(req, req->sr_msg.kbuf);
4570}
4571
93642ef8 4572static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 4573{
99bc4c38 4574 int ret;
3529d8c2 4575
93642ef8
PB
4576 ret = io_recvmsg_copy_hdr(req, req->async_data);
4577 if (!ret)
4578 req->flags |= REQ_F_NEED_CLEANUP;
4579 return ret;
4580}
4581
4582static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4583{
4584 struct io_sr_msg *sr = &req->sr_msg;
4585
d2b6f48b
PB
4586 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4587 return -EINVAL;
4588
270a5940 4589 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 4590 sr->len = READ_ONCE(sqe->len);
bcda7baa 4591 sr->bgid = READ_ONCE(sqe->buf_group);
04411806
PB
4592 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4593 if (sr->msg_flags & MSG_DONTWAIT)
4594 req->flags |= REQ_F_NOWAIT;
06b76d44 4595
d8768362
JA
4596#ifdef CONFIG_COMPAT
4597 if (req->ctx->compat)
4598 sr->msg_flags |= MSG_CMSG_COMPAT;
4599#endif
93642ef8 4600 return 0;
aa1fa28f
JA
4601}
4602
889fca73 4603static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4604{
6b754c8b 4605 struct io_async_msghdr iomsg, *kmsg;
03b1230c 4606 struct socket *sock;
7fbb1b54 4607 struct io_buffer *kbuf;
7a7cacba 4608 unsigned flags;
0031275d 4609 int min_ret = 0;
52de1fe1 4610 int ret, cflags = 0;
45d189c6 4611 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 4612
dba4a925 4613 sock = sock_from_file(req->file);
7a7cacba 4614 if (unlikely(!sock))
dba4a925 4615 return -ENOTSOCK;
3529d8c2 4616
257e84a5
PB
4617 kmsg = req->async_data;
4618 if (!kmsg) {
7a7cacba
PB
4619 ret = io_recvmsg_copy_hdr(req, &iomsg);
4620 if (ret)
681fda8d 4621 return ret;
7a7cacba
PB
4622 kmsg = &iomsg;
4623 }
03b1230c 4624
bc02ef33 4625 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4626 kbuf = io_recv_buffer_select(req, !force_nonblock);
bc02ef33 4627 if (IS_ERR(kbuf))
52de1fe1 4628 return PTR_ERR(kbuf);
7a7cacba 4629 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
4630 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4631 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
4632 1, req->sr_msg.len);
4633 }
52de1fe1 4634
04411806
PB
4635 flags = req->sr_msg.msg_flags;
4636 if (force_nonblock)
7a7cacba 4637 flags |= MSG_DONTWAIT;
0031275d
SM
4638 if (flags & MSG_WAITALL)
4639 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4640
7a7cacba
PB
4641 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4642 kmsg->uaddr, flags);
0e1b6fe3
PB
4643 if (force_nonblock && ret == -EAGAIN)
4644 return io_setup_async_msg(req, kmsg);
7a7cacba
PB
4645 if (ret == -ERESTARTSYS)
4646 ret = -EINTR;
03b1230c 4647
7fbb1b54
PB
4648 if (req->flags & REQ_F_BUFFER_SELECTED)
4649 cflags = io_put_recv_kbuf(req);
257e84a5
PB
4650 /* fast path, check for non-NULL to avoid function call */
4651 if (kmsg->free_iov)
4652 kfree(kmsg->free_iov);
99bc4c38 4653 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 4654 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 4655 req_set_fail(req);
889fca73 4656 __io_req_complete(req, issue_flags, ret, cflags);
03b1230c 4657 return 0;
0fa03c62 4658}
5d17b4a4 4659
889fca73 4660static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4661{
6b754c8b 4662 struct io_buffer *kbuf;
7a7cacba
PB
4663 struct io_sr_msg *sr = &req->sr_msg;
4664 struct msghdr msg;
4665 void __user *buf = sr->buf;
fddaface 4666 struct socket *sock;
7a7cacba
PB
4667 struct iovec iov;
4668 unsigned flags;
0031275d 4669 int min_ret = 0;
bcda7baa 4670 int ret, cflags = 0;
45d189c6 4671 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 4672
dba4a925 4673 sock = sock_from_file(req->file);
7a7cacba 4674 if (unlikely(!sock))
dba4a925 4675 return -ENOTSOCK;
fddaface 4676
bc02ef33 4677 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4678 kbuf = io_recv_buffer_select(req, !force_nonblock);
bcda7baa
JA
4679 if (IS_ERR(kbuf))
4680 return PTR_ERR(kbuf);
7a7cacba 4681 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 4682 }
bcda7baa 4683
7a7cacba 4684 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
4685 if (unlikely(ret))
4686 goto out_free;
fddaface 4687
7a7cacba
PB
4688 msg.msg_name = NULL;
4689 msg.msg_control = NULL;
4690 msg.msg_controllen = 0;
4691 msg.msg_namelen = 0;
4692 msg.msg_iocb = NULL;
4693 msg.msg_flags = 0;
fddaface 4694
04411806
PB
4695 flags = req->sr_msg.msg_flags;
4696 if (force_nonblock)
7a7cacba 4697 flags |= MSG_DONTWAIT;
0031275d
SM
4698 if (flags & MSG_WAITALL)
4699 min_ret = iov_iter_count(&msg.msg_iter);
4700
7a7cacba
PB
4701 ret = sock_recvmsg(sock, &msg, flags);
4702 if (force_nonblock && ret == -EAGAIN)
4703 return -EAGAIN;
4704 if (ret == -ERESTARTSYS)
4705 ret = -EINTR;
14c32eee 4706out_free:
7fbb1b54
PB
4707 if (req->flags & REQ_F_BUFFER_SELECTED)
4708 cflags = io_put_recv_kbuf(req);
0031275d 4709 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 4710 req_set_fail(req);
889fca73 4711 __io_req_complete(req, issue_flags, ret, cflags);
fddaface 4712 return 0;
fddaface
JA
4713}
4714
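/*
 * IORING_OP_ACCEPT: ->addr and ->addr2 point at the optional sockaddr
 * and its length, ->accept_flags carries the accept4() flags.  If the
 * listening socket itself is O_NONBLOCK the request is marked
 * REQ_F_NOWAIT rather than being retried from a blocking context.
 */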
3529d8c2 4715static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 4716{
8ed8d3c3
JA
4717 struct io_accept *accept = &req->accept;
4718
14587a46 4719 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 4720 return -EINVAL;
8042d6ce 4721 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
4722 return -EINVAL;
4723
d55e5f5b
JA
4724 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4725 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 4726 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 4727 accept->nofile = rlimit(RLIMIT_NOFILE);
8ed8d3c3 4728 return 0;
8ed8d3c3 4729}
17f2fe35 4730
889fca73 4731static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
4732{
4733 struct io_accept *accept = &req->accept;
45d189c6 4734 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 4735 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
8ed8d3c3
JA
4736 int ret;
4737
e697deed
JX
4738 if (req->file->f_flags & O_NONBLOCK)
4739 req->flags |= REQ_F_NOWAIT;
4740
8ed8d3c3 4741 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
09952e3e
JA
4742 accept->addr_len, accept->flags,
4743 accept->nofile);
8ed8d3c3 4744 if (ret == -EAGAIN && force_nonblock)
17f2fe35 4745 return -EAGAIN;
ac45abc0
PB
4746 if (ret < 0) {
4747 if (ret == -ERESTARTSYS)
4748 ret = -EINTR;
93d2bcd2 4749 req_set_fail(req);
ac45abc0 4750 }
889fca73 4751 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 4752 return 0;
8ed8d3c3
JA
4753}
4754
93642ef8
PB
4755static int io_connect_prep_async(struct io_kiocb *req)
4756{
4757 struct io_async_connect *io = req->async_data;
4758 struct io_connect *conn = &req->connect;
4759
4760 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4761}
4762
3529d8c2 4763static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 4764{
3529d8c2 4765 struct io_connect *conn = &req->connect;
f499a021 4766
14587a46 4767 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1
JA
4768 return -EINVAL;
4769 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4770 return -EINVAL;
4771
3529d8c2
JA
4772 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4773 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 4774 return 0;
f499a021
JA
4775}
4776
889fca73 4777static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 4778{
e8c2bc1f 4779 struct io_async_connect __io, *io;
f8e85cf2 4780 unsigned file_flags;
3fbb51c1 4781 int ret;
45d189c6 4782 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 4783
e8c2bc1f
JA
4784 if (req->async_data) {
4785 io = req->async_data;
f499a021 4786 } else {
3529d8c2
JA
4787 ret = move_addr_to_kernel(req->connect.addr,
4788 req->connect.addr_len,
e8c2bc1f 4789 &__io.address);
f499a021
JA
4790 if (ret)
4791 goto out;
4792 io = &__io;
4793 }
4794
3fbb51c1
JA
4795 file_flags = force_nonblock ? O_NONBLOCK : 0;
4796
e8c2bc1f 4797 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 4798 req->connect.addr_len, file_flags);
87f80d62 4799 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
e8c2bc1f 4800 if (req->async_data)
b7bb4f7d 4801 return -EAGAIN;
e8c2bc1f 4802 if (io_alloc_async_data(req)) {
f499a021
JA
4803 ret = -ENOMEM;
4804 goto out;
4805 }
e8c2bc1f 4806 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 4807 return -EAGAIN;
f499a021 4808 }
f8e85cf2
JA
4809 if (ret == -ERESTARTSYS)
4810 ret = -EINTR;
f499a021 4811out:
4e88d6e7 4812 if (ret < 0)
93d2bcd2 4813 req_set_fail(req);
889fca73 4814 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 4815 return 0;
469956e8
Y
4816}
4817#else /* !CONFIG_NET */
99a10081
JA
4818#define IO_NETOP_FN(op) \
4819static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
4820{ \
4821 return -EOPNOTSUPP; \
4822}
4823
4824#define IO_NETOP_PREP(op) \
4825IO_NETOP_FN(op) \
4826static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4827{ \
4828 return -EOPNOTSUPP; \
4829} \
4830
4831#define IO_NETOP_PREP_ASYNC(op) \
4832IO_NETOP_PREP(op) \
4833static int io_##op##_prep_async(struct io_kiocb *req) \
4834{ \
4835 return -EOPNOTSUPP; \
4836}
4837
4838IO_NETOP_PREP_ASYNC(sendmsg);
4839IO_NETOP_PREP_ASYNC(recvmsg);
4840IO_NETOP_PREP_ASYNC(connect);
4841IO_NETOP_PREP(accept);
4842IO_NETOP_FN(send);
4843IO_NETOP_FN(recv);
469956e8 4844#endif /* CONFIG_NET */
f8e85cf2 4845
d7718a9d
JA
4846struct io_poll_table {
4847 struct poll_table_struct pt;
4848 struct io_kiocb *req;
68b11e8b 4849 int nr_entries;
d7718a9d
JA
4850 int error;
4851};
ce593a6c 4852
d7718a9d 4853static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5b0a6acc 4854 __poll_t mask, io_req_tw_func_t func)
d7718a9d 4855{
d7718a9d
JA
4856	/* for instances that support it, check for an event match first: */
4857 if (mask && !(mask & poll->events))
4858 return 0;
4859
4860 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4861
4862 list_del_init(&poll->wait.entry);
4863
d7718a9d 4864 req->result = mask;
5b0a6acc 4865 req->io_task_work.func = func;
6d816e08 4866
d7718a9d 4867 /*
e3aabf95
JA
4868 * If this fails, then the task is exiting. When a task exits, the
4869 * work gets canceled, so just cancel this request as well instead
4870	 * of executing it. We can't safely execute it anyway, as we may not
4871	 * have the state needed for it.
d7718a9d 4872 */
e09ee510 4873 io_req_task_work_add(req);
d7718a9d
JA
4874 return 1;
4875}
4876
74ce6ce4
JA
4877static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4878 __acquires(&req->ctx->completion_lock)
4879{
4880 struct io_ring_ctx *ctx = req->ctx;
4881
e09ee510
PB
4882 if (unlikely(req->task->flags & PF_EXITING))
4883 WRITE_ONCE(poll->canceled, true);
4884
74ce6ce4
JA
4885 if (!req->result && !READ_ONCE(poll->canceled)) {
4886 struct poll_table_struct pt = { ._key = poll->events };
4887
4888 req->result = vfs_poll(req->file, &pt) & poll->events;
4889 }
4890
4891 spin_lock_irq(&ctx->completion_lock);
4892 if (!req->result && !READ_ONCE(poll->canceled)) {
4893 add_wait_queue(poll->head, &poll->wait);
4894 return true;
4895 }
4896
4897 return false;
4898}
4899
d4e7cd36 4900static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 4901{
e8c2bc1f 4902 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 4903 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 4904 return req->async_data;
d4e7cd36
JA
4905 return req->apoll->double_poll;
4906}
4907
4908static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4909{
4910 if (req->opcode == IORING_OP_POLL_ADD)
4911 return &req->poll;
4912 return &req->apoll->poll;
4913}
4914
4915static void io_poll_remove_double(struct io_kiocb *req)
e07785b0 4916 __must_hold(&req->ctx->completion_lock)
d4e7cd36
JA
4917{
4918 struct io_poll_iocb *poll = io_poll_get_double(req);
18bceab1
JA
4919
4920 lockdep_assert_held(&req->ctx->completion_lock);
4921
4922 if (poll && poll->head) {
4923 struct wait_queue_head *head = poll->head;
4924
4925 spin_lock(&head->lock);
4926 list_del_init(&poll->wait.entry);
4927 if (poll->wait.private)
de9b4cca 4928 req_ref_put(req);
18bceab1
JA
4929 poll->head = NULL;
4930 spin_unlock(&head->lock);
4931 }
4932}
4933
e27414be 4934static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
e07785b0 4935 __must_hold(&req->ctx->completion_lock)
18bceab1
JA
4936{
4937 struct io_ring_ctx *ctx = req->ctx;
88e41cf9 4938 unsigned flags = IORING_CQE_F_MORE;
e27414be 4939 int error;
18bceab1 4940
e27414be 4941 if (READ_ONCE(req->poll.canceled)) {
45ab03b1 4942 error = -ECANCELED;
88e41cf9 4943 req->poll.events |= EPOLLONESHOT;
e27414be 4944 } else {
5082620f 4945 error = mangle_poll(mask);
e27414be 4946 }
b69de288
JA
4947 if (req->poll.events & EPOLLONESHOT)
4948 flags = 0;
d4d19c19 4949 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
88e41cf9
JA
4950 req->poll.done = true;
4951 flags = 0;
4952 }
7b289c38
HX
4953 if (flags & IORING_CQE_F_MORE)
4954 ctx->cq_extra++;
18bceab1 4955
18bceab1 4956 io_commit_cqring(ctx);
88e41cf9 4957 return !(flags & IORING_CQE_F_MORE);
18bceab1
JA
4958}
4959
5b0a6acc 4960static void io_poll_task_func(struct io_kiocb *req)
18bceab1
JA
4961{
4962 struct io_ring_ctx *ctx = req->ctx;
dd221f46 4963 struct io_kiocb *nxt;
18bceab1
JA
4964
4965 if (io_poll_rewait(req, &req->poll)) {
4966 spin_unlock_irq(&ctx->completion_lock);
dd221f46 4967 } else {
f40b964a 4968 bool done;
18bceab1 4969
e27414be 4970 done = io_poll_complete(req, req->result);
88e41cf9 4971 if (done) {
a890d01e 4972 io_poll_remove_double(req);
88e41cf9 4973 hash_del(&req->hash_node);
f40b964a 4974 } else {
88e41cf9
JA
4975 req->result = 0;
4976 add_wait_queue(req->poll.head, &req->poll.wait);
4977 }
dd221f46 4978 spin_unlock_irq(&ctx->completion_lock);
dd221f46 4979 io_cqring_ev_posted(ctx);
18bceab1 4980
88e41cf9
JA
4981 if (done) {
4982 nxt = io_put_req_find_next(req);
4983 if (nxt)
5b0a6acc 4984 io_req_task_submit(nxt);
88e41cf9 4985 }
dd221f46 4986 }
18bceab1
JA
4987}
4988
4989static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4990 int sync, void *key)
4991{
4992 struct io_kiocb *req = wait->private;
d4e7cd36 4993 struct io_poll_iocb *poll = io_poll_get_single(req);
18bceab1
JA
4994 __poll_t mask = key_to_poll(key);
4995
4996	/* for instances that support it, check for an event match first: */
4997 if (mask && !(mask & poll->events))
4998 return 0;
88e41cf9
JA
4999 if (!(poll->events & EPOLLONESHOT))
5000 return poll->wait.func(&poll->wait, mode, sync, key);
18bceab1 5001
8706e04e
JA
5002 list_del_init(&wait->entry);
5003
9ce85ef2 5004 if (poll->head) {
18bceab1
JA
5005 bool done;
5006
807abcb0
JA
5007 spin_lock(&poll->head->lock);
5008 done = list_empty(&poll->wait.entry);
18bceab1 5009 if (!done)
807abcb0 5010 list_del_init(&poll->wait.entry);
d4e7cd36
JA
5011 /* make sure double remove sees this as being gone */
5012 wait->private = NULL;
807abcb0 5013 spin_unlock(&poll->head->lock);
c8b5e260
JA
5014 if (!done) {
5015 /* use wait func handler, so it matches the rq type */
5016 poll->wait.func(&poll->wait, mode, sync, key);
5017 }
18bceab1 5018 }
de9b4cca 5019 req_ref_put(req);
18bceab1
JA
5020 return 1;
5021}
5022
5023static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5024 wait_queue_func_t wake_func)
5025{
5026 poll->head = NULL;
5027 poll->done = false;
5028 poll->canceled = false;
464dca61
JA
5029#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5030 /* mask in events that we always want/need */
5031 poll->events = events | IO_POLL_UNMASK;
18bceab1
JA
5032 INIT_LIST_HEAD(&poll->wait.entry);
5033 init_waitqueue_func_entry(&poll->wait, wake_func);
5034}
5035
5036static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
5037 struct wait_queue_head *head,
5038 struct io_poll_iocb **poll_ptr)
18bceab1
JA
5039{
5040 struct io_kiocb *req = pt->req;
5041
5042 /*
68b11e8b
PB
5043 * The file being polled uses multiple waitqueues for poll handling
5044	 * (e.g. one for read, one for write). Set up a separate io_poll_iocb
5045 * if this happens.
18bceab1 5046 */
68b11e8b 5047 if (unlikely(pt->nr_entries)) {
58852d4d
PB
5048 struct io_poll_iocb *poll_one = poll;
5049
18bceab1 5050 /* already have a 2nd entry, fail a third attempt */
807abcb0 5051 if (*poll_ptr) {
18bceab1
JA
5052 pt->error = -EINVAL;
5053 return;
5054 }
ea6a693d
JA
5055 /*
5056 * Can't handle multishot for double wait for now, turn it
5057 * into one-shot mode.
5058 */
7a274727
PB
5059 if (!(poll_one->events & EPOLLONESHOT))
5060 poll_one->events |= EPOLLONESHOT;
1c3b3e65 5061 /* double add on the same waitqueue head, ignore */
7a274727 5062 if (poll_one->head == head)
1c3b3e65 5063 return;
18bceab1
JA
5064 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5065 if (!poll) {
5066 pt->error = -ENOMEM;
5067 return;
5068 }
58852d4d 5069 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
de9b4cca 5070 req_ref_get(req);
18bceab1 5071 poll->wait.private = req;
807abcb0 5072 *poll_ptr = poll;
18bceab1
JA
5073 }
5074
68b11e8b 5075 pt->nr_entries++;
18bceab1 5076 poll->head = head;
a31eb4a2
JX
5077
5078 if (poll->events & EPOLLEXCLUSIVE)
5079 add_wait_queue_exclusive(head, &poll->wait);
5080 else
5081 add_wait_queue(head, &poll->wait);
18bceab1
JA
5082}
5083
5084static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5085 struct poll_table_struct *p)
5086{
5087 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
807abcb0 5088 struct async_poll *apoll = pt->req->apoll;
18bceab1 5089
807abcb0 5090 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
18bceab1
JA
5091}
5092
5b0a6acc 5093static void io_async_task_func(struct io_kiocb *req)
d7718a9d 5094{
d7718a9d
JA
5095 struct async_poll *apoll = req->apoll;
5096 struct io_ring_ctx *ctx = req->ctx;
5097
236daeae 5098 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
d7718a9d 5099
74ce6ce4 5100 if (io_poll_rewait(req, &apoll->poll)) {
d7718a9d 5101 spin_unlock_irq(&ctx->completion_lock);
74ce6ce4 5102 return;
d7718a9d
JA
5103 }
5104
0ea13b44 5105 hash_del(&req->hash_node);
d4e7cd36 5106 io_poll_remove_double(req);
74ce6ce4
JA
5107 spin_unlock_irq(&ctx->completion_lock);
5108
0be0b0e3 5109 if (!READ_ONCE(apoll->poll.canceled))
5b0a6acc 5110 io_req_task_submit(req);
0be0b0e3 5111 else
2593553a 5112 io_req_complete_failed(req, -ECANCELED);
d7718a9d
JA
5113}
5114
5115static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5116 void *key)
5117{
5118 struct io_kiocb *req = wait->private;
5119 struct io_poll_iocb *poll = &req->apoll->poll;
5120
5121 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5122 key_to_poll(key));
5123
5124 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5125}
5126
5127static void io_poll_req_insert(struct io_kiocb *req)
5128{
5129 struct io_ring_ctx *ctx = req->ctx;
5130 struct hlist_head *list;
5131
5132 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5133 hlist_add_head(&req->hash_node, list);
5134}
5135
5136static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5137 struct io_poll_iocb *poll,
5138 struct io_poll_table *ipt, __poll_t mask,
5139 wait_queue_func_t wake_func)
5140 __acquires(&ctx->completion_lock)
5141{
5142 struct io_ring_ctx *ctx = req->ctx;
5143 bool cancel = false;
5144
4d52f338 5145 INIT_HLIST_NODE(&req->hash_node);
18bceab1 5146 io_init_poll_iocb(poll, mask, wake_func);
b90cd197 5147 poll->file = req->file;
18bceab1 5148 poll->wait.private = req;
d7718a9d
JA
5149
5150 ipt->pt._key = mask;
5151 ipt->req = req;
68b11e8b
PB
5152 ipt->error = 0;
5153 ipt->nr_entries = 0;
d7718a9d 5154
d7718a9d 5155 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
68b11e8b
PB
5156 if (unlikely(!ipt->nr_entries) && !ipt->error)
5157 ipt->error = -EINVAL;
d7718a9d
JA
5158
5159 spin_lock_irq(&ctx->completion_lock);
a890d01e 5160 if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
46fee9ab 5161 io_poll_remove_double(req);
d7718a9d
JA
5162 if (likely(poll->head)) {
5163 spin_lock(&poll->head->lock);
5164 if (unlikely(list_empty(&poll->wait.entry))) {
5165 if (ipt->error)
5166 cancel = true;
5167 ipt->error = 0;
5168 mask = 0;
5169 }
88e41cf9 5170 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
d7718a9d
JA
5171 list_del_init(&poll->wait.entry);
5172 else if (cancel)
5173 WRITE_ONCE(poll->canceled, true);
5174 else if (!poll->done) /* actually waiting for an event */
5175 io_poll_req_insert(req);
5176 spin_unlock(&poll->head->lock);
5177 }
5178
5179 return mask;
5180}
5181
59b735ae
OL
5182enum {
5183 IO_APOLL_OK,
5184 IO_APOLL_ABORTED,
5185 IO_APOLL_READY
5186};
5187
5188static int io_arm_poll_handler(struct io_kiocb *req)
d7718a9d
JA
5189{
5190 const struct io_op_def *def = &io_op_defs[req->opcode];
5191 struct io_ring_ctx *ctx = req->ctx;
5192 struct async_poll *apoll;
5193 struct io_poll_table ipt;
b2d9c3da 5194 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
9dab14b8 5195 int rw;
d7718a9d
JA
5196
5197 if (!req->file || !file_can_poll(req->file))
59b735ae 5198 return IO_APOLL_ABORTED;
24c74678 5199 if (req->flags & REQ_F_POLLED)
59b735ae 5200 return IO_APOLL_ABORTED;
b2d9c3da
PB
5201 if (!def->pollin && !def->pollout)
5202 return IO_APOLL_ABORTED;
5203
5204 if (def->pollin) {
9dab14b8 5205 rw = READ;
b2d9c3da
PB
5206 mask |= POLLIN | POLLRDNORM;
5207
5208 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5209 if ((req->opcode == IORING_OP_RECVMSG) &&
5210 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5211 mask &= ~POLLIN;
5212 } else {
9dab14b8 5213 rw = WRITE;
b2d9c3da
PB
5214 mask |= POLLOUT | POLLWRNORM;
5215 }
5216
9dab14b8 5217 /* if we can't do a nonblocking attempt, there's no point in arming a poll handler */
b191e2df 5218 if (!io_file_supports_nowait(req, rw))
59b735ae 5219 return IO_APOLL_ABORTED;
d7718a9d
JA
5220
5221 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5222 if (unlikely(!apoll))
59b735ae 5223 return IO_APOLL_ABORTED;
807abcb0 5224 apoll->double_poll = NULL;
d7718a9d 5225 req->apoll = apoll;
b2d9c3da 5226 req->flags |= REQ_F_POLLED;
d7718a9d
JA
5227 ipt.pt._qproc = io_async_queue_proc;
5228
5229 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5230 io_async_wake);
a36da65c 5231 if (ret || ipt.error) {
d7718a9d 5232 spin_unlock_irq(&ctx->completion_lock);
59b735ae
OL
5233 if (ret)
5234 return IO_APOLL_READY;
5235 return IO_APOLL_ABORTED;
d7718a9d
JA
5236 }
5237 spin_unlock_irq(&ctx->completion_lock);
236daeae
OL
5238 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5239 mask, apoll->poll.events);
59b735ae 5240 return IO_APOLL_OK;
d7718a9d
JA
5241}
5242
5243static bool __io_poll_remove_one(struct io_kiocb *req,
b2e720ac 5244 struct io_poll_iocb *poll, bool do_cancel)
e07785b0 5245 __must_hold(&req->ctx->completion_lock)
221c5eb2 5246{
b41e9852 5247 bool do_complete = false;
221c5eb2 5248
5082620f
JA
5249 if (!poll->head)
5250 return false;
221c5eb2 5251 spin_lock(&poll->head->lock);
b2e720ac
JA
5252 if (do_cancel)
5253 WRITE_ONCE(poll->canceled, true);
392edb45
JA
5254 if (!list_empty(&poll->wait.entry)) {
5255 list_del_init(&poll->wait.entry);
b41e9852 5256 do_complete = true;
221c5eb2
JA
5257 }
5258 spin_unlock(&poll->head->lock);
3bfa5bcb 5259 hash_del(&req->hash_node);
d7718a9d
JA
5260 return do_complete;
5261}
5262
b2c3f7e1 5263static bool io_poll_remove_waitqs(struct io_kiocb *req)
e07785b0 5264 __must_hold(&req->ctx->completion_lock)
d7718a9d
JA
5265{
5266 bool do_complete;
5267
d4e7cd36 5268 io_poll_remove_double(req);
e31001a3 5269 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
d4e7cd36 5270
e31001a3 5271 if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
d7718a9d 5272 /* non-poll requests have submit ref still */
e31001a3 5273 req_ref_put(req);
b1f573bd 5274 }
b2c3f7e1
JA
5275 return do_complete;
5276}
5277
5278static bool io_poll_remove_one(struct io_kiocb *req)
e07785b0 5279 __must_hold(&req->ctx->completion_lock)
b2c3f7e1
JA
5280{
5281 bool do_complete;
b1f573bd 5282
b2c3f7e1 5283 do_complete = io_poll_remove_waitqs(req);
b41e9852 5284 if (do_complete) {
d4d19c19 5285 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
b41e9852 5286 io_commit_cqring(req->ctx);
93d2bcd2 5287 req_set_fail(req);
216578e5 5288 io_put_req_deferred(req, 1);
b41e9852
JA
5289 }
5290
5291 return do_complete;
221c5eb2
JA
5292}
5293
76e1b642
JA
5294/*
5295 * Returns true if we found and killed one or more poll requests
5296 */
6b81928d 5297static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
3dd0c97a 5298 bool cancel_all)
221c5eb2 5299{
78076bb6 5300 struct hlist_node *tmp;
221c5eb2 5301 struct io_kiocb *req;
8e2e1faf 5302 int posted = 0, i;
221c5eb2
JA
5303
5304 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
5305 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5306 struct hlist_head *list;
5307
5308 list = &ctx->cancel_hash[i];
f3606e3a 5309 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
3dd0c97a 5310 if (io_match_task(req, tsk, cancel_all))
f3606e3a
JA
5311 posted += io_poll_remove_one(req);
5312 }
221c5eb2
JA
5313 }
5314 spin_unlock_irq(&ctx->completion_lock);
b41e9852 5315
8e2e1faf
JA
5316 if (posted)
5317 io_cqring_ev_posted(ctx);
76e1b642
JA
5318
5319 return posted != 0;
221c5eb2
JA
5320}
5321
9ba5fac8
PB
5322static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5323 bool poll_only)
e07785b0 5324 __must_hold(&ctx->completion_lock)
47f46768 5325{
78076bb6 5326 struct hlist_head *list;
47f46768
JA
5327 struct io_kiocb *req;
5328
78076bb6
JA
5329 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5330 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
5331 if (sqe_addr != req->user_data)
5332 continue;
9ba5fac8
PB
5333 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5334 continue;
b2cb805f 5335 return req;
47f46768 5336 }
b2cb805f
JA
5337 return NULL;
5338}
5339
9ba5fac8
PB
5340static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5341 bool poll_only)
e07785b0 5342 __must_hold(&ctx->completion_lock)
b2cb805f
JA
5343{
5344 struct io_kiocb *req;
5345
9ba5fac8 5346 req = io_poll_find(ctx, sqe_addr, poll_only);
b2cb805f
JA
5347 if (!req)
5348 return -ENOENT;
5349 if (io_poll_remove_one(req))
5350 return 0;
5351
5352 return -EALREADY;
47f46768
JA
5353}
5354
9096af3e
PB
5355static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5356 unsigned int flags)
5357{
5358 u32 events;
47f46768 5359
9096af3e
PB
5360 events = READ_ONCE(sqe->poll32_events);
5361#ifdef __BIG_ENDIAN
5362 events = swahw32(events);
5363#endif
5364 if (!(flags & IORING_POLL_ADD_MULTI))
5365 events |= EPOLLONESHOT;
5366 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
5367}
5368
c5de0036 5369static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 5370 const struct io_uring_sqe *sqe)
0969e783 5371{
c5de0036
PB
5372 struct io_poll_update *upd = &req->poll_update;
5373 u32 flags;
5374
0969e783
JA
5375 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5376 return -EINVAL;
c5de0036
PB
5377 if (sqe->ioprio || sqe->buf_index)
5378 return -EINVAL;
5379 flags = READ_ONCE(sqe->len);
5380 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5381 IORING_POLL_ADD_MULTI))
5382 return -EINVAL;
5383 /* meaningless without update */
5384 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
5385 return -EINVAL;
5386
c5de0036
PB
5387 upd->old_user_data = READ_ONCE(sqe->addr);
5388 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5389 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 5390
c5de0036
PB
5391 upd->new_user_data = READ_ONCE(sqe->off);
5392 if (!upd->update_user_data && upd->new_user_data)
5393 return -EINVAL;
5394 if (upd->update_events)
5395 upd->events = io_poll_parse_events(sqe, flags);
5396 else if (sqe->poll32_events)
5397 return -EINVAL;
221c5eb2 5398
221c5eb2
JA
5399 return 0;
5400}
5401
221c5eb2
JA
5402static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5403 void *key)
5404{
c2f2eb7d
JA
5405 struct io_kiocb *req = wait->private;
5406 struct io_poll_iocb *poll = &req->poll;
221c5eb2 5407
d7718a9d 5408 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
5409}
5410
221c5eb2
JA
5411static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5412 struct poll_table_struct *p)
5413{
5414 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5415
e8c2bc1f 5416 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
eac406c6
JA
5417}
5418
3529d8c2 5419static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
5420{
5421 struct io_poll_iocb *poll = &req->poll;
c5de0036 5422 u32 flags;
221c5eb2
JA
5423
5424 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5425 return -EINVAL;
c5de0036 5426 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
5427 return -EINVAL;
5428 flags = READ_ONCE(sqe->len);
c5de0036 5429 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2
JA
5430 return -EINVAL;
5431
c5de0036 5432 poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
5433 return 0;
5434}
5435
61e98203 5436static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
5437{
5438 struct io_poll_iocb *poll = &req->poll;
5439 struct io_ring_ctx *ctx = req->ctx;
5440 struct io_poll_table ipt;
0969e783 5441 __poll_t mask;
0969e783 5442
d7718a9d 5443 ipt.pt._qproc = io_poll_queue_proc;
36703247 5444
d7718a9d
JA
5445 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5446 io_poll_wake);
221c5eb2 5447
8c838788 5448 if (mask) { /* no async, we'd stolen it */
221c5eb2 5449 ipt.error = 0;
e27414be 5450 io_poll_complete(req, mask);
221c5eb2 5451 }
221c5eb2
JA
5452 spin_unlock_irq(&ctx->completion_lock);
5453
8c838788
JA
5454 if (mask) {
5455 io_cqring_ev_posted(ctx);
88e41cf9
JA
5456 if (poll->events & EPOLLONESHOT)
5457 io_put_req(req);
221c5eb2 5458 }
8c838788 5459 return ipt.error;
221c5eb2
JA
5460}
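
A minimal userspace sketch of the multishot path that io_poll_parse_events() and io_poll_add() implement above. It assumes a 5.13+ kernel with matching headers and liburing for ring setup; the direct sqe->len assignment simply mirrors what newer liburing poll helpers do, since io_poll_add_prep() reads the IORING_POLL_ADD_MULTI flag from sqe->len.

/* multishot_poll.c: arm a multishot POLLIN poll; one CQE per readiness
 * event until the poll is cancelled, updated or errors out.
 * Sketch only: build with -luring, type into stdin to trigger events.
 */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, 0 /* stdin */, POLLIN);
	/* io_poll_add_prep() reads the multishot flag from sqe->len */
	sqe->len = IORING_POLL_ADD_MULTI;
	sqe->user_data = 0x1234;
	io_uring_submit(&ring);

	for (int i = 0; i < 3; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		printf("poll event: res=%d more=%d\n", cqe->res,
		       !!(cqe->flags & IORING_CQE_F_MORE));
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}

Without the MULTI flag the prep path above forces EPOLLONESHOT, so the request would complete after the first event.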
5461
c5de0036 5462static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288
JA
5463{
5464 struct io_ring_ctx *ctx = req->ctx;
5465 struct io_kiocb *preq;
cb3b200e 5466 bool completing;
b69de288
JA
5467 int ret;
5468
5469 spin_lock_irq(&ctx->completion_lock);
9ba5fac8 5470 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
b69de288
JA
5471 if (!preq) {
5472 ret = -ENOENT;
5473 goto err;
b69de288 5474 }
cb3b200e 5475
c5de0036
PB
5476 if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5477 completing = true;
5478 ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5479 goto err;
5480 }
5481
cb3b200e
JA
5482 /*
5483 * Don't allow racy completion with singleshot, as we cannot safely
5484 * update those. For multishot, if we're racing with completion, just
5485 * let completion re-add it.
5486 */
5487 completing = !__io_poll_remove_one(preq, &preq->poll, false);
5488 if (completing && (preq->poll.events & EPOLLONESHOT)) {
5489 ret = -EALREADY;
5490 goto err;
b69de288
JA
5491 }
5492 /* we now have a detached poll request. reissue. */
5493 ret = 0;
5494err:
b69de288 5495 if (ret < 0) {
cb3b200e 5496 spin_unlock_irq(&ctx->completion_lock);
93d2bcd2 5497 req_set_fail(req);
b69de288
JA
5498 io_req_complete(req, ret);
5499 return 0;
5500 }
 5501 /* only replace the event mask bits, keep the behavior flags */
9d805892 5502 if (req->poll_update.update_events) {
b69de288 5503 preq->poll.events &= ~0xffff;
9d805892 5504 preq->poll.events |= req->poll_update.events & 0xffff;
b69de288
JA
5505 preq->poll.events |= IO_POLL_UNMASK;
5506 }
9d805892
PB
5507 if (req->poll_update.update_user_data)
5508 preq->user_data = req->poll_update.new_user_data;
cb3b200e
JA
5509 spin_unlock_irq(&ctx->completion_lock);
5510
b69de288
JA
5511 /* complete update request, we're done with it */
5512 io_req_complete(req, ret);
5513
cb3b200e 5514 if (!completing) {
c5de0036 5515 ret = io_poll_add(preq, issue_flags);
cb3b200e 5516 if (ret < 0) {
93d2bcd2 5517 req_set_fail(preq);
cb3b200e
JA
5518 io_req_complete(preq, ret);
5519 }
b69de288
JA
5520 }
5521 return 0;
5522}
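
For reference, a raw-SQE sketch of driving the update path handled by io_poll_update_prep() and io_poll_update() above. The helper name prep_poll_update() and the example user_data values are illustrative; the field assignments follow directly from what the prep code reads. liburing is assumed only for ring setup, and poll32_events byte order is left as-is (the kernel only swabs it on big endian).

/* poll_update.c: re-point or re-arm an existing poll request in place. */
#include <liburing.h>
#include <poll.h>
#include <string.h>

static void prep_poll_update(struct io_uring_sqe *sqe, __u64 old_udata,
			     __u64 new_udata, unsigned new_events)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;	/* update shares this opcode */
	sqe->fd = -1;				/* no file needed */
	sqe->addr = old_udata;			/* ->old_user_data */
	sqe->off = new_udata;			/* ->new_user_data */
	sqe->poll32_events = new_events;	/* parsed by io_poll_parse_events() */
	sqe->len = IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA;
	sqe->user_data = 0xabcd;		/* CQE for the update itself */
}

For example, prep_poll_update(sqe, 0x1234, 0x1234, POLLIN | POLLOUT) would widen the mask of the poll armed in the earlier sketch while keeping its user_data.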
5523
5262f567
JA
5524static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5525{
ad8a48ac
JA
5526 struct io_timeout_data *data = container_of(timer,
5527 struct io_timeout_data, timer);
5528 struct io_kiocb *req = data->req;
5529 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
5530 unsigned long flags;
5531
5262f567 5532 spin_lock_irqsave(&ctx->completion_lock, flags);
a71976f3 5533 list_del_init(&req->timeout.list);
01cec8c1
PB
5534 atomic_set(&req->ctx->cq_timeouts,
5535 atomic_read(&req->ctx->cq_timeouts) + 1);
5536
d4d19c19 5537 io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
5262f567
JA
5538 io_commit_cqring(ctx);
5539 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5540
5541 io_cqring_ev_posted(ctx);
93d2bcd2 5542 req_set_fail(req);
5262f567
JA
5543 io_put_req(req);
5544 return HRTIMER_NORESTART;
5545}
5546
fbd15848
PB
5547static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5548 __u64 user_data)
e07785b0 5549 __must_hold(&ctx->completion_lock)
f254ac04 5550{
fbd15848 5551 struct io_timeout_data *io;
47f46768 5552 struct io_kiocb *req;
fd9c7bc5 5553 bool found = false;
f254ac04 5554
135fcde8 5555 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
fd9c7bc5
PB
5556 found = user_data == req->user_data;
5557 if (found)
47f46768 5558 break;
47f46768 5559 }
fd9c7bc5
PB
5560 if (!found)
5561 return ERR_PTR(-ENOENT);
fbd15848
PB
5562
5563 io = req->async_data;
fd9c7bc5 5564 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 5565 return ERR_PTR(-EALREADY);
a71976f3 5566 list_del_init(&req->timeout.list);
fbd15848
PB
5567 return req;
5568}
47f46768 5569
fbd15848 5570static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
e07785b0 5571 __must_hold(&ctx->completion_lock)
fbd15848
PB
5572{
5573 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5574
5575 if (IS_ERR(req))
5576 return PTR_ERR(req);
f254ac04 5577
93d2bcd2 5578 req_set_fail(req);
d4d19c19 5579 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
216578e5 5580 io_put_req_deferred(req, 1);
f254ac04
JA
5581 return 0;
5582}
5583
9c8e11b3
PB
5584static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5585 struct timespec64 *ts, enum hrtimer_mode mode)
e07785b0 5586 __must_hold(&ctx->completion_lock)
47f46768 5587{
9c8e11b3
PB
5588 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5589 struct io_timeout_data *data;
47f46768 5590
9c8e11b3
PB
5591 if (IS_ERR(req))
5592 return PTR_ERR(req);
47f46768 5593
9c8e11b3
PB
5594 req->timeout.off = 0; /* noseq */
5595 data = req->async_data;
5596 list_add_tail(&req->timeout.list, &ctx->timeout_list);
5597 hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5598 data->timer.function = io_timeout_fn;
5599 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5600 return 0;
47f46768
JA
5601}
5602
3529d8c2
JA
5603static int io_timeout_remove_prep(struct io_kiocb *req,
5604 const struct io_uring_sqe *sqe)
b29472ee 5605{
9c8e11b3
PB
5606 struct io_timeout_rem *tr = &req->timeout_rem;
5607
b29472ee
JA
5608 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5609 return -EINVAL;
61710e43
DA
5610 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5611 return -EINVAL;
9c8e11b3 5612 if (sqe->ioprio || sqe->buf_index || sqe->len)
b29472ee
JA
5613 return -EINVAL;
5614
9c8e11b3
PB
5615 tr->addr = READ_ONCE(sqe->addr);
5616 tr->flags = READ_ONCE(sqe->timeout_flags);
5617 if (tr->flags & IORING_TIMEOUT_UPDATE) {
5618 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5619 return -EINVAL;
5620 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5621 return -EFAULT;
5622 } else if (tr->flags) {
5623 /* timeout removal doesn't support flags */
b29472ee 5624 return -EINVAL;
9c8e11b3 5625 }
b29472ee 5626
b29472ee
JA
5627 return 0;
5628}
5629
8662daec
PB
5630static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5631{
5632 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5633 : HRTIMER_MODE_REL;
5634}
5635
11365043
JA
5636/*
5637 * Remove or update an existing timeout command
5638 */
61e98203 5639static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 5640{
9c8e11b3 5641 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 5642 struct io_ring_ctx *ctx = req->ctx;
47f46768 5643 int ret;
11365043 5644
11365043 5645 spin_lock_irq(&ctx->completion_lock);
8662daec 5646 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
9c8e11b3 5647 ret = io_timeout_cancel(ctx, tr->addr);
8662daec
PB
5648 else
5649 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5650 io_translate_timeout_mode(tr->flags));
11365043 5651
d4d19c19 5652 io_cqring_fill_event(ctx, req->user_data, ret, 0);
11365043
JA
5653 io_commit_cqring(ctx);
5654 spin_unlock_irq(&ctx->completion_lock);
5262f567 5655 io_cqring_ev_posted(ctx);
4e88d6e7 5656 if (ret < 0)
93d2bcd2 5657 req_set_fail(req);
ec9c02ad 5658 io_put_req(req);
11365043 5659 return 0;
5262f567
JA
5660}
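
A raw-SQE sketch of the IORING_TIMEOUT_UPDATE path that io_timeout_remove_prep() and io_timeout_remove() handle above. The helper name prep_timeout_update() is illustrative; recent liburing ships its own helper for this, which is assumed but not required. The timespec must stay valid until the SQE has been submitted, since the kernel copies it at prep time.

/* timeout_update.c: push back an armed IORING_OP_TIMEOUT without cancelling it. */
#include <liburing.h>
#include <string.h>

static void prep_timeout_update(struct io_uring_sqe *sqe, __u64 target_udata,
				struct __kernel_timespec *ts)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;		/* update rides on remove */
	sqe->fd = -1;
	sqe->addr = target_udata;			/* tr->addr: timeout to update */
	sqe->addr2 = (unsigned long)ts;			/* tr->ts: new expiration */
	sqe->timeout_flags = IORING_TIMEOUT_UPDATE;	/* relative; OR in _ABS if needed */
	sqe->user_data = 0x7172;
}

The update's own CQE reports 0 on success or -ENOENT if no armed timeout matched the given user_data, per io_timeout_extract() above.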
5661
3529d8c2 5662static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 5663 bool is_timeout_link)
5262f567 5664{
ad8a48ac 5665 struct io_timeout_data *data;
a41525ab 5666 unsigned flags;
56080b02 5667 u32 off = READ_ONCE(sqe->off);
5262f567 5668
ad8a48ac 5669 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 5670 return -EINVAL;
ad8a48ac 5671 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 5672 return -EINVAL;
56080b02 5673 if (off && is_timeout_link)
2d28390a 5674 return -EINVAL;
a41525ab
JA
5675 flags = READ_ONCE(sqe->timeout_flags);
5676 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 5677 return -EINVAL;
bdf20073 5678
bfe68a22 5679 req->timeout.off = off;
f18ee4cf
PB
5680 if (unlikely(off && !req->ctx->off_timeout_used))
5681 req->ctx->off_timeout_used = true;
26a61679 5682
e8c2bc1f 5683 if (!req->async_data && io_alloc_async_data(req))
26a61679
JA
5684 return -ENOMEM;
5685
e8c2bc1f 5686 data = req->async_data;
ad8a48ac 5687 data->req = req;
ad8a48ac
JA
5688
5689 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
5690 return -EFAULT;
5691
8662daec 5692 data->mode = io_translate_timeout_mode(flags);
ad8a48ac 5693 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
2482b58f
PB
5694 if (is_timeout_link)
5695 io_req_track_inflight(req);
ad8a48ac
JA
5696 return 0;
5697}
5698
61e98203 5699static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 5700{
ad8a48ac 5701 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 5702 struct io_timeout_data *data = req->async_data;
ad8a48ac 5703 struct list_head *entry;
bfe68a22 5704 u32 tail, off = req->timeout.off;
ad8a48ac 5705
733f5c95 5706 spin_lock_irq(&ctx->completion_lock);
93bd25bb 5707
5262f567
JA
5708 /*
 5709 * sqe->off holds how many events need to occur for this
93bd25bb
JA
5710 * timeout event to be satisfied. If it isn't set, then this is
5711 * a pure timeout request, sequence isn't used.
5262f567 5712 */
8eb7e2d0 5713 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
5714 entry = ctx->timeout_list.prev;
5715 goto add;
5716 }
5262f567 5717
bfe68a22
PB
5718 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5719 req->timeout.target_seq = tail + off;
5262f567 5720
f010505b
MDG
5721 /* Update the last seq here in case io_flush_timeouts() hasn't.
5722 * This is safe because ->completion_lock is held, and submissions
5723 * and completions are never mixed in the same ->completion_lock section.
5724 */
5725 ctx->cq_last_tm_flush = tail;
5726
5262f567
JA
5727 /*
5728 * Insertion sort, ensuring the first entry in the list is always
5729 * the one we need first.
5730 */
5262f567 5731 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
5732 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5733 timeout.list);
5262f567 5734
8eb7e2d0 5735 if (io_is_timeout_noseq(nxt))
93bd25bb 5736 continue;
bfe68a22
PB
5737 /* nxt.seq is behind @tail, otherwise would've been completed */
5738 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
5739 break;
5740 }
93bd25bb 5741add:
135fcde8 5742 list_add(&req->timeout.list, entry);
ad8a48ac
JA
5743 data->timer.function = io_timeout_fn;
5744 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 5745 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
5746 return 0;
5747}
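
A minimal liburing sketch of the sqe->off ("count") semantics that io_timeout_prep() and io_timeout() implement above: the timeout is satisfied either by the timer or by that many other completions, whichever happens first. Error handling is trimmed.

/* timeout.c: fires after 1 second or after 8 other CQEs, whichever is first. */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	/* count = 8: no other requests are in flight here, so the timer wins */
	io_uring_prep_timeout(sqe, &ts, 8, 0);
	sqe->user_data = 0xdead;
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	printf("timeout cqe: res=%d (expect -ETIME, set by io_timeout_fn)\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}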
5262f567 5748
f458dd84
PB
5749struct io_cancel_data {
5750 struct io_ring_ctx *ctx;
5751 u64 user_data;
5752};
5753
62755e35
JA
5754static bool io_cancel_cb(struct io_wq_work *work, void *data)
5755{
5756 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 5757 struct io_cancel_data *cd = data;
62755e35 5758
f458dd84 5759 return req->ctx == cd->ctx && req->user_data == cd->user_data;
62755e35
JA
5760}
5761
f458dd84
PB
5762static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
5763 struct io_ring_ctx *ctx)
62755e35 5764{
f458dd84 5765 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 5766 enum io_wq_cancel cancel_ret;
62755e35
JA
5767 int ret = 0;
5768
f458dd84 5769 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
5770 return -ENOENT;
5771
f458dd84 5772 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
5773 switch (cancel_ret) {
5774 case IO_WQ_CANCEL_OK:
5775 ret = 0;
5776 break;
5777 case IO_WQ_CANCEL_RUNNING:
5778 ret = -EALREADY;
5779 break;
5780 case IO_WQ_CANCEL_NOTFOUND:
5781 ret = -ENOENT;
5782 break;
5783 }
5784
e977d6d3
JA
5785 return ret;
5786}
5787
47f46768
JA
5788static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5789 struct io_kiocb *req, __u64 sqe_addr,
014db007 5790 int success_ret)
47f46768
JA
5791{
5792 unsigned long flags;
5793 int ret;
5794
f458dd84 5795 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
47f46768 5796 spin_lock_irqsave(&ctx->completion_lock, flags);
df9727af
PB
5797 if (ret != -ENOENT)
5798 goto done;
47f46768
JA
5799 ret = io_timeout_cancel(ctx, sqe_addr);
5800 if (ret != -ENOENT)
5801 goto done;
9ba5fac8 5802 ret = io_poll_cancel(ctx, sqe_addr, false);
47f46768 5803done:
b0dd8a41
JA
5804 if (!ret)
5805 ret = success_ret;
d4d19c19 5806 io_cqring_fill_event(ctx, req->user_data, ret, 0);
47f46768
JA
5807 io_commit_cqring(ctx);
5808 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5809 io_cqring_ev_posted(ctx);
5810
4e88d6e7 5811 if (ret < 0)
93d2bcd2 5812 req_set_fail(req);
47f46768
JA
5813}
5814
3529d8c2
JA
5815static int io_async_cancel_prep(struct io_kiocb *req,
5816 const struct io_uring_sqe *sqe)
e977d6d3 5817{
fbf23849 5818 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 5819 return -EINVAL;
61710e43
DA
5820 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5821 return -EINVAL;
5822 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
e977d6d3
JA
5823 return -EINVAL;
5824
fbf23849
JA
5825 req->cancel.addr = READ_ONCE(sqe->addr);
5826 return 0;
5827}
5828
61e98203 5829static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
5830{
5831 struct io_ring_ctx *ctx = req->ctx;
58f99373
PB
5832 u64 sqe_addr = req->cancel.addr;
5833 struct io_tctx_node *node;
5834 int ret;
5835
5836 /* tasks should wait for their io-wq threads, so safe w/o sync */
5837 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
5838 spin_lock_irq(&ctx->completion_lock);
5839 if (ret != -ENOENT)
5840 goto done;
5841 ret = io_timeout_cancel(ctx, sqe_addr);
5842 if (ret != -ENOENT)
5843 goto done;
9ba5fac8 5844 ret = io_poll_cancel(ctx, sqe_addr, false);
58f99373
PB
5845 if (ret != -ENOENT)
5846 goto done;
5847 spin_unlock_irq(&ctx->completion_lock);
5848
5849 /* slow path, try all io-wq's */
5850 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5851 ret = -ENOENT;
5852 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
5853 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 5854
58f99373
PB
5855 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
5856 if (ret != -ENOENT)
5857 break;
5858 }
5859 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5860
5861 spin_lock_irq(&ctx->completion_lock);
5862done:
d4d19c19 5863 io_cqring_fill_event(ctx, req->user_data, ret, 0);
58f99373
PB
5864 io_commit_cqring(ctx);
5865 spin_unlock_irq(&ctx->completion_lock);
5866 io_cqring_ev_posted(ctx);
5867
5868 if (ret < 0)
93d2bcd2 5869 req_set_fail(req);
58f99373 5870 io_put_req(req);
5262f567
JA
5871 return 0;
5872}
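
A raw-SQE sketch of cancelling an in-flight request by user_data, the path handled by io_async_cancel_prep() and io_async_cancel() above. The fields are set directly because liburing's cancel helper changed signature between releases; the helper name prep_cancel() is illustrative.

/* async_cancel.c: request cancellation of a previously submitted request. */
#include <liburing.h>
#include <string.h>

static void prep_cancel(struct io_uring_sqe *sqe, __u64 target_udata)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = -1;				/* no file needed */
	sqe->addr = target_udata;		/* req->cancel.addr */
	sqe->user_data = 0xcafe;		/* CQE for the cancel itself */
}

The cancel request's own CQE carries 0 on success, -ENOENT if nothing matched, or -EALREADY if the target was already executing, matching the lookup order above (io-wq, then timeouts, then poll).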
5873
269bbe5f 5874static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
5875 const struct io_uring_sqe *sqe)
5876{
61710e43
DA
5877 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5878 return -EINVAL;
5879 if (sqe->ioprio || sqe->rw_flags)
05f3fb3c
JA
5880 return -EINVAL;
5881
269bbe5f
BM
5882 req->rsrc_update.offset = READ_ONCE(sqe->off);
5883 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5884 if (!req->rsrc_update.nr_args)
05f3fb3c 5885 return -EINVAL;
269bbe5f 5886 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
5887 return 0;
5888}
5889
889fca73 5890static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
5891{
5892 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 5893 struct io_uring_rsrc_update2 up;
05f3fb3c 5894 int ret;
fbf23849 5895
45d189c6 5896 if (issue_flags & IO_URING_F_NONBLOCK)
05f3fb3c 5897 return -EAGAIN;
05f3fb3c 5898
269bbe5f
BM
5899 up.offset = req->rsrc_update.offset;
5900 up.data = req->rsrc_update.arg;
c3bdad02
PB
5901 up.nr = 0;
5902 up.tags = 0;
615cee49 5903 up.resv = 0;
05f3fb3c
JA
5904
5905 mutex_lock(&ctx->uring_lock);
fdecb662 5906 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 5907 &up, req->rsrc_update.nr_args);
05f3fb3c
JA
5908 mutex_unlock(&ctx->uring_lock);
5909
5910 if (ret < 0)
93d2bcd2 5911 req_set_fail(req);
889fca73 5912 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
5913 return 0;
5914}
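
A raw-SQE sketch of updating the fixed-file table from the submission path (IORING_OP_FILES_UPDATE) rather than through io_uring_register(). The assignments follow what io_rsrc_update_prep() reads above; the helper name prep_files_update() and the example values are illustrative, and the fd array must remain valid until the request completes.

/* files_update.c: replace entries in the registered file table via an SQE. */
#include <liburing.h>
#include <string.h>

static void prep_files_update(struct io_uring_sqe *sqe, __s32 *fds,
			      unsigned nr, unsigned slot_offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FILES_UPDATE;
	sqe->fd = -1;
	sqe->addr = (unsigned long)fds;	/* ->rsrc_update.arg: array of fds */
	sqe->len = nr;			/* ->rsrc_update.nr_args, must be > 0 */
	sqe->off = slot_offset;		/* first fixed-file slot to replace */
	sqe->user_data = 0xf11e;	/* CQE carries the update result */
}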
5915
bfe76559 5916static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 5917{
d625c6ee 5918 switch (req->opcode) {
e781573e 5919 case IORING_OP_NOP:
bfe76559 5920 return 0;
f67676d1
JA
5921 case IORING_OP_READV:
5922 case IORING_OP_READ_FIXED:
3a6820f2 5923 case IORING_OP_READ:
bfe76559 5924 return io_read_prep(req, sqe);
f67676d1
JA
5925 case IORING_OP_WRITEV:
5926 case IORING_OP_WRITE_FIXED:
3a6820f2 5927 case IORING_OP_WRITE:
bfe76559 5928 return io_write_prep(req, sqe);
0969e783 5929 case IORING_OP_POLL_ADD:
bfe76559 5930 return io_poll_add_prep(req, sqe);
0969e783 5931 case IORING_OP_POLL_REMOVE:
c5de0036 5932 return io_poll_update_prep(req, sqe);
8ed8d3c3 5933 case IORING_OP_FSYNC:
1155c76a 5934 return io_fsync_prep(req, sqe);
8ed8d3c3 5935 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 5936 return io_sfr_prep(req, sqe);
03b1230c 5937 case IORING_OP_SENDMSG:
fddaface 5938 case IORING_OP_SEND:
bfe76559 5939 return io_sendmsg_prep(req, sqe);
03b1230c 5940 case IORING_OP_RECVMSG:
fddaface 5941 case IORING_OP_RECV:
bfe76559 5942 return io_recvmsg_prep(req, sqe);
f499a021 5943 case IORING_OP_CONNECT:
bfe76559 5944 return io_connect_prep(req, sqe);
2d28390a 5945 case IORING_OP_TIMEOUT:
bfe76559 5946 return io_timeout_prep(req, sqe, false);
b29472ee 5947 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 5948 return io_timeout_remove_prep(req, sqe);
fbf23849 5949 case IORING_OP_ASYNC_CANCEL:
bfe76559 5950 return io_async_cancel_prep(req, sqe);
2d28390a 5951 case IORING_OP_LINK_TIMEOUT:
bfe76559 5952 return io_timeout_prep(req, sqe, true);
8ed8d3c3 5953 case IORING_OP_ACCEPT:
bfe76559 5954 return io_accept_prep(req, sqe);
d63d1b5e 5955 case IORING_OP_FALLOCATE:
bfe76559 5956 return io_fallocate_prep(req, sqe);
15b71abe 5957 case IORING_OP_OPENAT:
bfe76559 5958 return io_openat_prep(req, sqe);
b5dba59e 5959 case IORING_OP_CLOSE:
bfe76559 5960 return io_close_prep(req, sqe);
05f3fb3c 5961 case IORING_OP_FILES_UPDATE:
269bbe5f 5962 return io_rsrc_update_prep(req, sqe);
eddc7ef5 5963 case IORING_OP_STATX:
bfe76559 5964 return io_statx_prep(req, sqe);
4840e418 5965 case IORING_OP_FADVISE:
bfe76559 5966 return io_fadvise_prep(req, sqe);
c1ca757b 5967 case IORING_OP_MADVISE:
bfe76559 5968 return io_madvise_prep(req, sqe);
cebdb986 5969 case IORING_OP_OPENAT2:
bfe76559 5970 return io_openat2_prep(req, sqe);
3e4827b0 5971 case IORING_OP_EPOLL_CTL:
bfe76559 5972 return io_epoll_ctl_prep(req, sqe);
7d67af2c 5973 case IORING_OP_SPLICE:
bfe76559 5974 return io_splice_prep(req, sqe);
ddf0322d 5975 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 5976 return io_provide_buffers_prep(req, sqe);
067524e9 5977 case IORING_OP_REMOVE_BUFFERS:
bfe76559 5978 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 5979 case IORING_OP_TEE:
bfe76559 5980 return io_tee_prep(req, sqe);
36f4fa68
JA
5981 case IORING_OP_SHUTDOWN:
5982 return io_shutdown_prep(req, sqe);
80a261fd
JA
5983 case IORING_OP_RENAMEAT:
5984 return io_renameat_prep(req, sqe);
14a1143b
JA
5985 case IORING_OP_UNLINKAT:
5986 return io_unlinkat_prep(req, sqe);
f67676d1
JA
5987 }
5988
bfe76559
PB
5989 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5990 req->opcode);
bd54b6fe 5991 return -EINVAL;
bfe76559
PB
5992}
5993
93642ef8 5994static int io_req_prep_async(struct io_kiocb *req)
bfe76559 5995{
b7e298d2
PB
5996 if (!io_op_defs[req->opcode].needs_async_setup)
5997 return 0;
5998 if (WARN_ON_ONCE(req->async_data))
5999 return -EFAULT;
6000 if (io_alloc_async_data(req))
6001 return -EAGAIN;
6002
93642ef8
PB
6003 switch (req->opcode) {
6004 case IORING_OP_READV:
93642ef8
PB
6005 return io_rw_prep_async(req, READ);
6006 case IORING_OP_WRITEV:
93642ef8
PB
6007 return io_rw_prep_async(req, WRITE);
6008 case IORING_OP_SENDMSG:
93642ef8
PB
6009 return io_sendmsg_prep_async(req);
6010 case IORING_OP_RECVMSG:
93642ef8
PB
6011 return io_recvmsg_prep_async(req);
6012 case IORING_OP_CONNECT:
6013 return io_connect_prep_async(req);
6014 }
b7e298d2
PB
6015 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6016 req->opcode);
6017 return -EFAULT;
f67676d1
JA
6018}
6019
9cf7c104
PB
6020static u32 io_get_sequence(struct io_kiocb *req)
6021{
a3dbdf54 6022 u32 seq = req->ctx->cached_sq_head;
9cf7c104 6023
a3dbdf54
PB
6024 /* need original cached_sq_head, but it was increased for each req */
6025 io_for_each_link(req, req)
6026 seq--;
6027 return seq;
9cf7c104
PB
6028}
6029
76cc33d7 6030static bool io_drain_req(struct io_kiocb *req)
de0617e4 6031{
3c19966d 6032 struct io_kiocb *pos;
a197f664 6033 struct io_ring_ctx *ctx = req->ctx;
27dc8338 6034 struct io_defer_entry *de;
f67676d1 6035 int ret;
9cf7c104 6036 u32 seq;
de0617e4 6037
3c19966d
PB
6038 /*
6039 * If we need to drain a request in the middle of a link, drain the
6040 * head request and the next request/link after the current link.
6041 * Considering sequential execution of links, IOSQE_IO_DRAIN will be
6042 * maintained for every request of our link.
6043 */
6044 if (ctx->drain_next) {
6045 req->flags |= REQ_F_IO_DRAIN;
6046 ctx->drain_next = false;
6047 }
6048 /* not interested in head, start from the first linked */
6049 io_for_each_link(pos, req->link) {
6050 if (pos->flags & REQ_F_IO_DRAIN) {
6051 ctx->drain_next = true;
6052 req->flags |= REQ_F_IO_DRAIN;
6053 break;
6054 }
6055 }
6056
9d858b21 6057 /* Still need defer if there is pending req in defer list. */
9cf7c104 6058 if (likely(list_empty_careful(&ctx->defer_list) &&
10c66904
PB
6059 !(req->flags & REQ_F_IO_DRAIN))) {
6060 ctx->drain_active = false;
76cc33d7 6061 return false;
10c66904 6062 }
9cf7c104
PB
6063
6064 seq = io_get_sequence(req);
6065 /* Still a chance to pass the sequence check */
6066 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
76cc33d7 6067 return false;
de0617e4 6068
b7e298d2 6069 ret = io_req_prep_async(req);
be7053b7 6070 if (ret)
1b48773f 6071 goto fail;
cbdcb435 6072 io_prep_async_link(req);
27dc8338 6073 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 6074 if (!de) {
1b48773f
PB
6075 ret = -ENOMEM;
6076fail:
6077 io_req_complete_failed(req, ret);
76cc33d7
PB
6078 return true;
6079 }
2d28390a 6080
de0617e4 6081 spin_lock_irq(&ctx->completion_lock);
9cf7c104 6082 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
de0617e4 6083 spin_unlock_irq(&ctx->completion_lock);
27dc8338 6084 kfree(de);
ae34817b 6085 io_queue_async_work(req);
76cc33d7 6086 return true;
de0617e4
JA
6087 }
6088
915967f6 6089 trace_io_uring_defer(ctx, req, req->user_data);
27dc8338 6090 de->req = req;
9cf7c104 6091 de->seq = seq;
27dc8338 6092 list_add_tail(&de->list, &ctx->defer_list);
de0617e4 6093 spin_unlock_irq(&ctx->completion_lock);
76cc33d7 6094 return true;
de0617e4
JA
6095}
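
A minimal liburing sketch of the userspace side of the drain logic above: a request flagged IOSQE_IO_DRAIN is deferred by io_drain_req() until every previously submitted request has completed, so a drained NOP can serve as a cheap "everything before this is done" notification. Error handling is trimmed.

/* drain_barrier.c: use IOSQE_IO_DRAIN as a submission-order barrier. */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[64];
	int fd = open("/dev/zero", O_RDONLY);

	if (fd < 0)
		return 1;
	io_uring_queue_init(8, &ring, 0);

	/* some in-flight work the barrier should wait for */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	sqe->user_data = 1;

	/* deferred until the read above has completed */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
	sqe->user_data = 2;

	io_uring_submit(&ring);

	while (io_uring_wait_cqe(&ring, &cqe) == 0) {
		unsigned long long ud = cqe->user_data;

		printf("completed user_data=%llu res=%d\n", ud, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
		if (ud == 2)
			break;
	}
	io_uring_queue_exit(&ring);
	return 0;
}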
6096
68fb8979 6097static void io_clean_op(struct io_kiocb *req)
99bc4c38 6098{
0e1b6fe3
PB
6099 if (req->flags & REQ_F_BUFFER_SELECTED) {
6100 switch (req->opcode) {
6101 case IORING_OP_READV:
6102 case IORING_OP_READ_FIXED:
6103 case IORING_OP_READ:
bcda7baa 6104 kfree((void *)(unsigned long)req->rw.addr);
0e1b6fe3
PB
6105 break;
6106 case IORING_OP_RECVMSG:
6107 case IORING_OP_RECV:
bcda7baa 6108 kfree(req->sr_msg.kbuf);
0e1b6fe3
PB
6109 break;
6110 }
99bc4c38
PB
6111 }
6112
0e1b6fe3
PB
6113 if (req->flags & REQ_F_NEED_CLEANUP) {
6114 switch (req->opcode) {
6115 case IORING_OP_READV:
6116 case IORING_OP_READ_FIXED:
6117 case IORING_OP_READ:
6118 case IORING_OP_WRITEV:
6119 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
6120 case IORING_OP_WRITE: {
6121 struct io_async_rw *io = req->async_data;
1dacb4df
PB
6122
6123 kfree(io->free_iovec);
0e1b6fe3 6124 break;
e8c2bc1f 6125 }
0e1b6fe3 6126 case IORING_OP_RECVMSG:
e8c2bc1f
JA
6127 case IORING_OP_SENDMSG: {
6128 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
6129
6130 kfree(io->free_iov);
0e1b6fe3 6131 break;
e8c2bc1f 6132 }
0e1b6fe3
PB
6133 case IORING_OP_SPLICE:
6134 case IORING_OP_TEE:
e1d767f0
PB
6135 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6136 io_put_file(req->splice.file_in);
0e1b6fe3 6137 break;
f3cd4850
JA
6138 case IORING_OP_OPENAT:
6139 case IORING_OP_OPENAT2:
6140 if (req->open.filename)
6141 putname(req->open.filename);
6142 break;
80a261fd
JA
6143 case IORING_OP_RENAMEAT:
6144 putname(req->rename.oldpath);
6145 putname(req->rename.newpath);
6146 break;
14a1143b
JA
6147 case IORING_OP_UNLINKAT:
6148 putname(req->unlink.filename);
6149 break;
0e1b6fe3 6150 }
99bc4c38 6151 }
75652a30
JA
6152 if ((req->flags & REQ_F_POLLED) && req->apoll) {
6153 kfree(req->apoll->double_poll);
6154 kfree(req->apoll);
6155 req->apoll = NULL;
6156 }
3a0a6902
PB
6157 if (req->flags & REQ_F_INFLIGHT) {
6158 struct io_uring_task *tctx = req->task->io_uring;
6159
6160 atomic_dec(&tctx->inflight_tracked);
3a0a6902 6161 }
c854357b 6162 if (req->flags & REQ_F_CREDS)
b8e64b53 6163 put_cred(req->creds);
c854357b
PB
6164
6165 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
6166}
6167
889fca73 6168static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 6169{
a197f664 6170 struct io_ring_ctx *ctx = req->ctx;
5730b27e 6171 const struct cred *creds = NULL;
d625c6ee 6172 int ret;
2b188cc1 6173
b8e64b53 6174 if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
c10d1f98 6175 creds = override_creds(req->creds);
5730b27e 6176
d625c6ee 6177 switch (req->opcode) {
2b188cc1 6178 case IORING_OP_NOP:
889fca73 6179 ret = io_nop(req, issue_flags);
2b188cc1
JA
6180 break;
6181 case IORING_OP_READV:
edafccee 6182 case IORING_OP_READ_FIXED:
3a6820f2 6183 case IORING_OP_READ:
889fca73 6184 ret = io_read(req, issue_flags);
edafccee 6185 break;
3529d8c2 6186 case IORING_OP_WRITEV:
edafccee 6187 case IORING_OP_WRITE_FIXED:
3a6820f2 6188 case IORING_OP_WRITE:
889fca73 6189 ret = io_write(req, issue_flags);
2b188cc1 6190 break;
c992fe29 6191 case IORING_OP_FSYNC:
45d189c6 6192 ret = io_fsync(req, issue_flags);
c992fe29 6193 break;
221c5eb2 6194 case IORING_OP_POLL_ADD:
61e98203 6195 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
6196 break;
6197 case IORING_OP_POLL_REMOVE:
c5de0036 6198 ret = io_poll_update(req, issue_flags);
221c5eb2 6199 break;
5d17b4a4 6200 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 6201 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 6202 break;
0fa03c62 6203 case IORING_OP_SENDMSG:
889fca73 6204 ret = io_sendmsg(req, issue_flags);
062d04d7 6205 break;
fddaface 6206 case IORING_OP_SEND:
889fca73 6207 ret = io_send(req, issue_flags);
0fa03c62 6208 break;
aa1fa28f 6209 case IORING_OP_RECVMSG:
889fca73 6210 ret = io_recvmsg(req, issue_flags);
062d04d7 6211 break;
fddaface 6212 case IORING_OP_RECV:
889fca73 6213 ret = io_recv(req, issue_flags);
aa1fa28f 6214 break;
5262f567 6215 case IORING_OP_TIMEOUT:
61e98203 6216 ret = io_timeout(req, issue_flags);
5262f567 6217 break;
11365043 6218 case IORING_OP_TIMEOUT_REMOVE:
61e98203 6219 ret = io_timeout_remove(req, issue_flags);
11365043 6220 break;
17f2fe35 6221 case IORING_OP_ACCEPT:
889fca73 6222 ret = io_accept(req, issue_flags);
17f2fe35 6223 break;
f8e85cf2 6224 case IORING_OP_CONNECT:
889fca73 6225 ret = io_connect(req, issue_flags);
f8e85cf2 6226 break;
62755e35 6227 case IORING_OP_ASYNC_CANCEL:
61e98203 6228 ret = io_async_cancel(req, issue_flags);
62755e35 6229 break;
d63d1b5e 6230 case IORING_OP_FALLOCATE:
45d189c6 6231 ret = io_fallocate(req, issue_flags);
d63d1b5e 6232 break;
15b71abe 6233 case IORING_OP_OPENAT:
45d189c6 6234 ret = io_openat(req, issue_flags);
15b71abe 6235 break;
b5dba59e 6236 case IORING_OP_CLOSE:
889fca73 6237 ret = io_close(req, issue_flags);
b5dba59e 6238 break;
05f3fb3c 6239 case IORING_OP_FILES_UPDATE:
889fca73 6240 ret = io_files_update(req, issue_flags);
05f3fb3c 6241 break;
eddc7ef5 6242 case IORING_OP_STATX:
45d189c6 6243 ret = io_statx(req, issue_flags);
eddc7ef5 6244 break;
4840e418 6245 case IORING_OP_FADVISE:
45d189c6 6246 ret = io_fadvise(req, issue_flags);
4840e418 6247 break;
c1ca757b 6248 case IORING_OP_MADVISE:
45d189c6 6249 ret = io_madvise(req, issue_flags);
c1ca757b 6250 break;
cebdb986 6251 case IORING_OP_OPENAT2:
45d189c6 6252 ret = io_openat2(req, issue_flags);
cebdb986 6253 break;
3e4827b0 6254 case IORING_OP_EPOLL_CTL:
889fca73 6255 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 6256 break;
7d67af2c 6257 case IORING_OP_SPLICE:
45d189c6 6258 ret = io_splice(req, issue_flags);
7d67af2c 6259 break;
ddf0322d 6260 case IORING_OP_PROVIDE_BUFFERS:
889fca73 6261 ret = io_provide_buffers(req, issue_flags);
ddf0322d 6262 break;
067524e9 6263 case IORING_OP_REMOVE_BUFFERS:
889fca73 6264 ret = io_remove_buffers(req, issue_flags);
3e4827b0 6265 break;
f2a8d5c7 6266 case IORING_OP_TEE:
45d189c6 6267 ret = io_tee(req, issue_flags);
f2a8d5c7 6268 break;
36f4fa68 6269 case IORING_OP_SHUTDOWN:
45d189c6 6270 ret = io_shutdown(req, issue_flags);
36f4fa68 6271 break;
80a261fd 6272 case IORING_OP_RENAMEAT:
45d189c6 6273 ret = io_renameat(req, issue_flags);
80a261fd 6274 break;
14a1143b 6275 case IORING_OP_UNLINKAT:
45d189c6 6276 ret = io_unlinkat(req, issue_flags);
14a1143b 6277 break;
2b188cc1
JA
6278 default:
6279 ret = -EINVAL;
6280 break;
6281 }
6282
5730b27e
JA
6283 if (creds)
6284 revert_creds(creds);
def596e9
JA
6285 if (ret)
6286 return ret;
b532576e 6287 /* If the op doesn't have a file, we're not polling for it */
cb3d8972
PB
6288 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6289 io_iopoll_req_issued(req);
def596e9
JA
6290
6291 return 0;
2b188cc1
JA
6292}
6293
ebc11b6c
PB
6294static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6295{
6296 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6297
6298 req = io_put_req_find_next(req);
6299 return req ? &req->work : NULL;
6300}
6301
5280f7e5 6302static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
6303{
6304 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6df1db6b 6305 struct io_kiocb *timeout;
561fb04a 6306 int ret = 0;
2b188cc1 6307
6df1db6b
PB
6308 timeout = io_prep_linked_timeout(req);
6309 if (timeout)
6310 io_queue_linked_timeout(timeout);
d4c81f38 6311
4014d943 6312 if (work->flags & IO_WQ_WORK_CANCEL)
561fb04a 6313 ret = -ECANCELED;
31b51510 6314
561fb04a 6315 if (!ret) {
561fb04a 6316 do {
889fca73 6317 ret = io_issue_sqe(req, 0);
561fb04a
JA
6318 /*
6319 * We can get EAGAIN for polled IO even though we're
6320 * forcing a sync submission from here, since we can't
6321 * wait for request slots on the block side.
6322 */
6323 if (ret != -EAGAIN)
6324 break;
6325 cond_resched();
6326 } while (1);
6327 }
31b51510 6328
a3df7698 6329 /* avoid locking problems by failing it from a clean context */
561fb04a 6330 if (ret) {
a3df7698 6331 /* io-wq is going to take one down */
de9b4cca 6332 req_ref_get(req);
a3df7698 6333 io_req_task_queue_fail(req, ret);
edafccee 6334 }
2b188cc1
JA
6335}
6336
aeca241b 6337static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 6338 unsigned i)
65e19f54 6339{
042b0d85 6340 return &table->files[i];
dafecf19
PB
6341}
6342
65e19f54
JA
6343static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6344 int index)
6345{
aeca241b 6346 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 6347
a04b0ac0 6348 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
6349}
6350
a04b0ac0 6351static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
6352{
6353 unsigned long file_ptr = (unsigned long) file;
6354
b191e2df 6355 if (__io_file_supports_nowait(file, READ))
9a321c98 6356 file_ptr |= FFS_ASYNC_READ;
b191e2df 6357 if (__io_file_supports_nowait(file, WRITE))
9a321c98
PB
6358 file_ptr |= FFS_ASYNC_WRITE;
6359 if (S_ISREG(file_inode(file)->i_mode))
6360 file_ptr |= FFS_ISREG;
a04b0ac0 6361 file_slot->file_ptr = file_ptr;
65e19f54
JA
6362}
6363
ac177053
PB
6364static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6365 struct io_kiocb *req, int fd)
09bb8394 6366{
8da11c19 6367 struct file *file;
ac177053 6368 unsigned long file_ptr;
09bb8394 6369
ac177053
PB
6370 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6371 return NULL;
6372 fd = array_index_nospec(fd, ctx->nr_user_files);
6373 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6374 file = (struct file *) (file_ptr & FFS_MASK);
6375 file_ptr &= ~FFS_MASK;
6376 /* mask in overlapping REQ_F and FFS bits */
b191e2df 6377 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
ac177053
PB
6378 io_req_set_rsrc_node(req);
6379 return file;
6380}
d44f554e 6381
ac177053
PB
6382static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6383 struct io_submit_state *state,
6384 struct io_kiocb *req, int fd)
6385{
6386 struct file *file = __io_file_get(state, fd);
6387
6388 trace_io_uring_file_get(ctx, fd);
09bb8394 6389
ac177053
PB
6390 /* we don't allow fixed io_uring files */
6391 if (file && unlikely(file->f_op == &io_uring_fops))
6392 io_req_track_inflight(req);
8371adf5 6393 return file;
09bb8394
JA
6394}
6395
ac177053
PB
6396static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6397 struct io_submit_state *state,
6398 struct io_kiocb *req, int fd, bool fixed)
6399{
6400 if (fixed)
6401 return io_file_get_fixed(ctx, req, fd);
6402 else
6403 return io_file_get_normal(ctx, state, req, fd);
6404}
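
A minimal liburing sketch of the registered-file path that io_file_get_fixed() serves above: once a table is registered, an SQE flagged IOSQE_FIXED_FILE passes a table index instead of a real fd, skipping fget()/fput() per request. The file path is illustrative; error handling is trimmed.

/* fixed_files.c: read through a registered file table entry. */
#include <fcntl.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	char buf[128];
	int fds[1];

	fds[0] = open("/etc/hostname", O_RDONLY);
	if (fds[0] < 0)
		return 1;

	io_uring_queue_init(8, &ring, 0);
	io_uring_register_files(&ring, fds, 1);	/* populates ctx->file_table */

	sqe = io_uring_get_sqe(&ring);
	/* "fd" 0 is now an index into the registered table, not a real fd */
	io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
	io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
	io_uring_submit(&ring);

	io_uring_queue_exit(&ring);
	return 0;
}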
6405
2665abfd 6406static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 6407{
ad8a48ac
JA
6408 struct io_timeout_data *data = container_of(timer,
6409 struct io_timeout_data, timer);
90cd7e42 6410 struct io_kiocb *prev, *req = data->req;
2665abfd 6411 struct io_ring_ctx *ctx = req->ctx;
2665abfd 6412 unsigned long flags;
2665abfd
JA
6413
6414 spin_lock_irqsave(&ctx->completion_lock, flags);
90cd7e42
PB
6415 prev = req->timeout.head;
6416 req->timeout.head = NULL;
2665abfd
JA
6417
6418 /*
6419 * We don't expect the list to be empty, that will only happen if we
6420 * race with the completion of the linked work.
6421 */
447c19f3 6422 if (prev) {
f2f87370 6423 io_remove_next_linked(prev);
447c19f3
PB
6424 if (!req_ref_inc_not_zero(prev))
6425 prev = NULL;
6426 }
2665abfd
JA
6427 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6428
6429 if (prev) {
014db007 6430 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
9ae1f8dd 6431 io_put_req_deferred(prev, 1);
a298232e 6432 io_put_req_deferred(req, 1);
47f46768 6433 } else {
9ae1f8dd 6434 io_req_complete_post(req, -ETIME, 0);
2665abfd 6435 }
2665abfd
JA
6436 return HRTIMER_NORESTART;
6437}
6438
de968c18 6439static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 6440{
de968c18
PB
6441 struct io_ring_ctx *ctx = req->ctx;
6442
6443 spin_lock_irq(&ctx->completion_lock);
76a46e06 6444 /*
f2f87370
PB
6445 * If the back reference is NULL, then our linked request finished
6446 * before we got a chance to setup the timer
76a46e06 6447 */
90cd7e42 6448 if (req->timeout.head) {
e8c2bc1f 6449 struct io_timeout_data *data = req->async_data;
94ae5e77 6450
ad8a48ac
JA
6451 data->timer.function = io_link_timeout_fn;
6452 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6453 data->mode);
2665abfd 6454 }
76a46e06 6455 spin_unlock_irq(&ctx->completion_lock);
2665abfd 6456 /* drop submission reference */
76a46e06
JA
6457 io_put_req(req);
6458}
2665abfd 6459
ad8a48ac 6460static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd 6461{
f2f87370 6462 struct io_kiocb *nxt = req->link;
2665abfd 6463
f2f87370
PB
6464 if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6465 nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 6466 return NULL;
2665abfd 6467
90cd7e42 6468 nxt->timeout.head = req;
900fad45 6469 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
76a46e06 6470 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 6471 return nxt;
2665abfd
JA
6472}
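
A minimal liburing sketch of how userspace sets up the pairing that io_prep_linked_timeout() detects above: the request to be bounded carries IOSQE_IO_LINK and is immediately followed by an IORING_OP_LINK_TIMEOUT SQE. Error handling is trimmed; if the timer fires first, the linked read is cancelled.

/* link_timeout.c: bound a single read by a 500ms per-request timeout. */
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };
	char buf[64];

	io_uring_queue_init(8, &ring, 0);

	/* the request being bounded: a read from stdin */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);	/* timeout applies to this */
	sqe->user_data = 1;

	/* must directly follow the linked request */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_link_timeout(sqe, &ts, 0);
	sqe->user_data = 2;

	io_uring_submit(&ring);
	io_uring_queue_exit(&ring);
	return 0;
}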
6473
c5eef2b9 6474static void __io_queue_sqe(struct io_kiocb *req)
282cdc86 6475 __must_hold(&req->ctx->uring_lock)
2b188cc1 6476{
d3d7298d 6477 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
e0c5c576 6478 int ret;
2b188cc1 6479
59b735ae 6480issue_sqe:
c5eef2b9 6481 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 6482
491381ce
JA
6483 /*
6484 * We async punt it if the file wasn't marked NOWAIT, or if the file
6485 * doesn't support non-blocking read/write attempts
6486 */
1840038e 6487 if (likely(!ret)) {
0d63c148 6488 /* drop submission reference */
e342c807 6489 if (req->flags & REQ_F_COMPLETE_INLINE) {
c5eef2b9
PB
6490 struct io_ring_ctx *ctx = req->ctx;
6491 struct io_comp_state *cs = &ctx->submit_state.comp;
e65ef56d 6492
6dd0be1e 6493 cs->reqs[cs->nr++] = req;
d3d7298d 6494 if (cs->nr == ARRAY_SIZE(cs->reqs))
2a2758f2 6495 io_submit_flush_completions(ctx);
9affd664 6496 } else {
d3d7298d 6497 io_put_req(req);
0d63c148 6498 }
1840038e 6499 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
59b735ae
OL
6500 switch (io_arm_poll_handler(req)) {
6501 case IO_APOLL_READY:
6502 goto issue_sqe;
6503 case IO_APOLL_ABORTED:
1840038e
PB
6504 /*
 6505 * Queued up for async execution; the worker will release the
 6506 * submit reference when the iocb is actually submitted.
6507 */
6508 io_queue_async_work(req);
59b735ae 6509 break;
1840038e 6510 }
0d63c148 6511 } else {
f41db273 6512 io_req_complete_failed(req, ret);
9e645e11 6513 }
d3d7298d
PB
6514 if (linked_timeout)
6515 io_queue_linked_timeout(linked_timeout);
2b188cc1
JA
6516}
6517
441b8a78 6518static inline void io_queue_sqe(struct io_kiocb *req)
282cdc86 6519 __must_hold(&req->ctx->uring_lock)
4fe2c963 6520{
10c66904 6521 if (unlikely(req->ctx->drain_active) && io_drain_req(req))
76cc33d7 6522 return;
4fe2c963 6523
76cc33d7 6524 if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
c5eef2b9 6525 __io_queue_sqe(req);
76cc33d7
PB
6526 } else {
6527 int ret = io_req_prep_async(req);
6528
6529 if (unlikely(ret))
6530 io_req_complete_failed(req, ret);
6531 else
6532 io_queue_async_work(req);
ce35a47a 6533 }
4fe2c963
JL
6534}
6535
b16fed66
PB
6536/*
6537 * Check SQE restrictions (opcode and flags).
6538 *
6539 * Returns 'true' if SQE is allowed, 'false' otherwise.
6540 */
6541static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6542 struct io_kiocb *req,
6543 unsigned int sqe_flags)
4fe2c963 6544{
4cfb25bf 6545 if (likely(!ctx->restricted))
b16fed66
PB
6546 return true;
6547
6548 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6549 return false;
6550
6551 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6552 ctx->restrictions.sqe_flags_required)
6553 return false;
6554
6555 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6556 ctx->restrictions.sqe_flags_required))
6557 return false;
6558
6559 return true;
4fe2c963
JL
6560}
6561
b16fed66
PB
6562static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6563 const struct io_uring_sqe *sqe)
282cdc86 6564 __must_hold(&ctx->uring_lock)
b16fed66
PB
6565{
6566 struct io_submit_state *state;
6567 unsigned int sqe_flags;
003e8dcc 6568 int personality, ret = 0;
b16fed66 6569
864ea921 6570 /* req is partially pre-initialised, see io_preinit_req() */
b16fed66
PB
6571 req->opcode = READ_ONCE(sqe->opcode);
6572 /* same numerical values with corresponding REQ_F_*, safe to copy */
6573 req->flags = sqe_flags = READ_ONCE(sqe->flags);
6574 req->user_data = READ_ONCE(sqe->user_data);
b16fed66 6575 req->file = NULL;
b16fed66
PB
6576 req->fixed_rsrc_refs = NULL;
6577 /* one is dropped after submission, the other at completion */
abc54d63 6578 atomic_set(&req->refs, 2);
b16fed66 6579 req->task = current;
b16fed66
PB
6580
6581 /* enforce forwards compatibility on users */
dddca226 6582 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
b16fed66 6583 return -EINVAL;
b16fed66
PB
6584 if (unlikely(req->opcode >= IORING_OP_LAST))
6585 return -EINVAL;
4cfb25bf 6586 if (!io_check_restriction(ctx, req, sqe_flags))
b16fed66
PB
6587 return -EACCES;
6588
6589 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6590 !io_op_defs[req->opcode].buffer_select)
6591 return -EOPNOTSUPP;
3c19966d
PB
6592 if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
6593 ctx->drain_active = true;
863e0560 6594
003e8dcc
JA
6595 personality = READ_ONCE(sqe->personality);
6596 if (personality) {
c10d1f98
PB
6597 req->creds = xa_load(&ctx->personalities, personality);
6598 if (!req->creds)
003e8dcc 6599 return -EINVAL;
c10d1f98 6600 get_cred(req->creds);
b8e64b53 6601 req->flags |= REQ_F_CREDS;
003e8dcc 6602 }
b16fed66
PB
6603 state = &ctx->submit_state;
6604
6605 /*
6606 * Plug now if we have more than 1 IO left after this, and the target
6607 * is potentially a read/write to block based storage.
6608 */
6609 if (!state->plug_started && state->ios_left > 1 &&
6610 io_op_defs[req->opcode].plug) {
6611 blk_start_plug(&state->plug);
6612 state->plug_started = true;
6613 }
6614
6615 if (io_op_defs[req->opcode].needs_file) {
ac177053
PB
6616 req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd),
6617 (sqe_flags & IOSQE_FIXED_FILE));
b16fed66
PB
6618 if (unlikely(!req->file))
6619 ret = -EBADF;
6620 }
6621
6622 state->ios_left--;
6623 return ret;
6624}
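
A minimal liburing sketch of the personality handling in io_init_req() above: io_uring_register_personality() snapshots the calling task's credentials and returns an id, and any later SQE that sets sqe->personality to that id is issued under those credentials (REQ_F_CREDS). Error handling is trimmed.

/* personality.c: issue a request under previously registered credentials. */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	int id;

	io_uring_queue_init(8, &ring, 0);

	/* snapshot current creds; returns a positive id on success */
	id = io_uring_register_personality(&ring);
	if (id < 0) {
		fprintf(stderr, "register_personality: %d\n", id);
		return 1;
	}

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->personality = id;	/* looked up in ctx->personalities at init */
	io_uring_submit(&ring);

	io_uring_queue_exit(&ring);
	return 0;
}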
6625
a6b8cadc 6626static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 6627 const struct io_uring_sqe *sqe)
282cdc86 6628 __must_hold(&ctx->uring_lock)
9e645e11 6629{
a1ab7b35 6630 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 6631 int ret;
9e645e11 6632
a6b8cadc
PB
6633 ret = io_init_req(ctx, req, sqe);
6634 if (unlikely(ret)) {
6635fail_req:
de59bc10
PB
6636 if (link->head) {
6637 /* fail even hard links since we don't submit */
93d2bcd2 6638 req_set_fail(link->head);
f41db273 6639 io_req_complete_failed(link->head, -ECANCELED);
de59bc10
PB
6640 link->head = NULL;
6641 }
f41db273 6642 io_req_complete_failed(req, ret);
a6b8cadc
PB
6643 return ret;
6644 }
441b8a78 6645
be7053b7
PB
6646 ret = io_req_prep(req, sqe);
6647 if (unlikely(ret))
6648 goto fail_req;
a6b8cadc 6649
be7053b7 6650 /* don't need @sqe from now on */
236daeae
OL
6651 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
6652 req->flags, true,
6653 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 6654
9e645e11
JA
6655 /*
6656 * If we already have a head request, queue this one for async
6657 * submittal once the head completes. If we don't have a head but
6658 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6659 * submitted sync once the chain is complete. If none of those
6660 * conditions are true (normal request), then just queue it.
6661 */
863e0560
PB
6662 if (link->head) {
6663 struct io_kiocb *head = link->head;
4e88d6e7 6664
b7e298d2 6665 ret = io_req_prep_async(req);
cf109604 6666 if (unlikely(ret))
a6b8cadc 6667 goto fail_req;
9d76377f 6668 trace_io_uring_link(ctx, req, head);
f2f87370 6669 link->last->link = req;
863e0560 6670 link->last = req;
32fe525b
PB
6671
6672 /* last request of a link, enqueue the link */
ef4ff581 6673 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
863e0560 6674 link->head = NULL;
5e159204 6675 io_queue_sqe(head);
32fe525b 6676 }
9e645e11 6677 } else {
ef4ff581 6678 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
863e0560
PB
6679 link->head = req;
6680 link->last = req;
711be031 6681 } else {
be7053b7 6682 io_queue_sqe(req);
711be031 6683 }
9e645e11 6684 }
2e6e1fde 6685
1d4240cc 6686 return 0;
9e645e11
JA
6687}
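
A minimal liburing sketch of the chain building that io_submit_sqe() performs above via link->head/link->last: consecutive SQEs flagged IOSQE_IO_LINK execute strictly in order, and with plain LINK a failure in one member cancels the rest of the chain (IOSQE_IO_HARDLINK would keep it running). The file path is illustrative; error handling is trimmed.

/* link_chain.c: a write -> fsync chain; the fsync only runs after the write. */
#include <fcntl.h>
#include <liburing.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	static char buf[] = "ordered\n";
	int fd = open("/tmp/link-test", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_write(sqe, fd, buf, strlen(buf), 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);	/* head of the chain */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_fsync(sqe, fd, 0);		/* last member ends the chain */

	io_uring_submit(&ring);
	io_uring_queue_exit(&ring);
	return 0;
}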
6688
9a56a232
JA
6689/*
 6690 * Batched submission is done; ensure local IO is flushed out.
6691 */
ba88ff11
PB
6692static void io_submit_state_end(struct io_submit_state *state,
6693 struct io_ring_ctx *ctx)
9a56a232 6694{
a1ab7b35 6695 if (state->link.head)
de59bc10 6696 io_queue_sqe(state->link.head);
6dd0be1e 6697 if (state->comp.nr)
2a2758f2 6698 io_submit_flush_completions(ctx);
27926b68
JA
6699 if (state->plug_started)
6700 blk_finish_plug(&state->plug);
9f13c35b 6701 io_state_file_put(state);
9a56a232
JA
6702}
6703
6704/*
6705 * Start submission side cache.
6706 */
6707static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 6708 unsigned int max_ios)
9a56a232 6709{
27926b68 6710 state->plug_started = false;
9a56a232 6711 state->ios_left = max_ios;
a1ab7b35
PB
6712 /* set only head, no need to init link_last in advance */
6713 state->link.head = NULL;
9a56a232
JA
6714}
6715
2b188cc1
JA
6716static void io_commit_sqring(struct io_ring_ctx *ctx)
6717{
75b28aff 6718 struct io_rings *rings = ctx->rings;
2b188cc1 6719
caf582c6
PB
6720 /*
6721 * Ensure any loads from the SQEs are done at this point,
6722 * since once we write the new head, the application could
6723 * write new data to them.
6724 */
6725 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
6726}
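/*
 * Illustrative userspace sketch, not part of this file: the smp_store_release()
 * of sq.head above pairs with an acquire load on the application side, as the
 * barrier comment at the top of this file requires. Written with C11 atomics
 * as a stand-in for the mmap()ed SQ ring; ring_sq_head, local_sq_tail and
 * sq_entries are hypothetical names.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool sq_slot_available(const _Atomic unsigned *ring_sq_head,
			      unsigned local_sq_tail, unsigned sq_entries)
{
	/* acquire-load the head published by the kernel's release store */
	unsigned head = atomic_load_explicit(ring_sq_head, memory_order_acquire);

	/* an SQE slot may only be reused once the kernel has consumed it */
	return local_sq_tail - head < sq_entries;
}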
6727
2b188cc1 6728/*
dd9ae8a0 6729 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
6730 * that is mapped by userspace. This means that care needs to be taken to
6731 * ensure that reads are stable, as we cannot rely on userspace always
6732 * being a good citizen. If members of the sqe are validated and then later
6733 * used, it's important that those reads are done through READ_ONCE() to
6734 * prevent a re-load down the line.
6735 */
709b302f 6736static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 6737{
ea5ab3b5 6738 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 6739 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
6740
6741 /*
6742 * The cached sq head (or cq tail) serves two purposes:
6743 *
 6744	 * 1) allows us to batch the cost of updating the user-visible
 6745	 *    head.
6746 * 2) allows the kernel side to track the head on its own, even
6747 * though the application is the one updating it.
6748 */
17d3aeb3 6749 head = READ_ONCE(ctx->sq_array[sq_idx]);
709b302f
PB
6750 if (likely(head < ctx->sq_entries))
6751 return &ctx->sq_sqes[head];
2b188cc1
JA
6752
6753 /* drop invalid entries */
15641e42
PB
6754 ctx->cq_extra--;
6755 WRITE_ONCE(ctx->rings->sq_dropped,
6756 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
6757 return NULL;
6758}
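/*
 * Illustrative sketch, not part of this file: why the array index is loaded
 * exactly once above. If the shared slot were re-read after the bounds check,
 * userspace could change it in between and the check would no longer cover
 * the value actually used. lookup_entry() is a hypothetical userspace
 * analogue of that single-load-then-validate pattern.
 */
#include <stdatomic.h>
#include <stddef.h>

static const int *lookup_entry(const _Atomic unsigned *shared_idx,
			       const int *table, size_t nr_entries)
{
	/* one load: bounds-check and use the very same value */
	unsigned idx = atomic_load_explicit(shared_idx, memory_order_relaxed);

	if (idx >= nr_entries)
		return NULL;		/* drop invalid entries */
	return &table[idx];
}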
6759
0f212204 6760static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 6761 __must_hold(&ctx->uring_lock)
6c271ce2 6762{
09899b19 6763 struct io_uring_task *tctx;
46c4e16a 6764 int submitted = 0;
6c271ce2 6765
ee7d46d9
PB
6766 /* make sure SQ entry isn't read before tail */
6767 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
2b85edfc
PB
6768 if (!percpu_ref_tryget_many(&ctx->refs, nr))
6769 return -EAGAIN;
6c271ce2 6770
09899b19
PB
6771 tctx = current->io_uring;
6772 tctx->cached_refs -= nr;
6773 if (unlikely(tctx->cached_refs < 0)) {
6774 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
6775
6776 percpu_counter_add(&tctx->inflight, refill);
6777 refcount_add(refill, &current->usage);
6778 tctx->cached_refs += refill;
6779 }
ba88ff11 6780 io_submit_state_start(&ctx->submit_state, nr);
b14cca0c 6781
46c4e16a 6782 while (submitted < nr) {
3529d8c2 6783 const struct io_uring_sqe *sqe;
196be95c 6784 struct io_kiocb *req;
fb5ccc98 6785
258b29a9 6786 req = io_alloc_req(ctx);
196be95c
PB
6787 if (unlikely(!req)) {
6788 if (!submitted)
6789 submitted = -EAGAIN;
fb5ccc98 6790 break;
196be95c 6791 }
4fccfcbb
PB
6792 sqe = io_get_sqe(ctx);
6793 if (unlikely(!sqe)) {
6794 kmem_cache_free(req_cachep, req);
6795 break;
6796 }
d3656344
JA
6797 /* will complete beyond this point, count as submitted */
6798 submitted++;
a1ab7b35 6799 if (io_submit_sqe(ctx, req, sqe))
196be95c 6800 break;
6c271ce2
JA
6801 }
6802
9466f437
PB
6803 if (unlikely(submitted != nr)) {
6804 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
d8a6df10 6805 int unused = nr - ref_used;
9466f437 6806
09899b19 6807 current->io_uring->cached_refs += unused;
d8a6df10 6808 percpu_ref_put_many(&ctx->refs, unused);
9466f437 6809 }
6c271ce2 6810
a1ab7b35 6811 io_submit_state_end(&ctx->submit_state, ctx);
ae9428ca
PB
6812 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6813 io_commit_sqring(ctx);
6814
6c271ce2
JA
6815 return submitted;
6816}
6817
e4b6d902
PB
6818static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
6819{
6820 return READ_ONCE(sqd->state);
6821}
6822
23b3628e
XW
6823static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6824{
6825 /* Tell userspace we may need a wakeup call */
6826 spin_lock_irq(&ctx->completion_lock);
20c0b380
NA
6827 WRITE_ONCE(ctx->rings->sq_flags,
6828 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
23b3628e
XW
6829 spin_unlock_irq(&ctx->completion_lock);
6830}
6831
6832static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6833{
6834 spin_lock_irq(&ctx->completion_lock);
20c0b380
NA
6835 WRITE_ONCE(ctx->rings->sq_flags,
6836 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
23b3628e
XW
6837 spin_unlock_irq(&ctx->completion_lock);
6838}
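/*
 * Illustrative userspace sketch, not part of this file: with SQPOLL, the
 * application checks the flag set/cleared above after bumping the SQ tail and
 * only enters the kernel when the poller has gone to sleep. A full barrier is
 * required between the tail update and the flag check (see the comment at the
 * top of this file). wake_sq_poller_if_needed() is a hypothetical helper;
 * liburing's io_uring_submit() performs this dance internally.
 */
#include <linux/io_uring.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static void wake_sq_poller_if_needed(int ring_fd, const _Atomic unsigned *sq_flags)
{
	/* full barrier between the SQ tail store and the flag load */
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(sq_flags, memory_order_relaxed) & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}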
6839
08369246 6840static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 6841{
c8d1ba58 6842 unsigned int to_submit;
bdcd3eab 6843 int ret = 0;
6c271ce2 6844
c8d1ba58 6845 to_submit = io_sqring_entries(ctx);
e95eee2d 6846 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
6847 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
6848 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 6849
906a3c6f 6850 if (!list_empty(&ctx->iopoll_list) || to_submit) {
c8d1ba58 6851 unsigned nr_events = 0;
948e1947
PB
6852 const struct cred *creds = NULL;
6853
6854 if (ctx->sq_creds != current_cred())
6855 creds = override_creds(ctx->sq_creds);
a4c0b3de 6856
c8d1ba58 6857 mutex_lock(&ctx->uring_lock);
906a3c6f 6858 if (!list_empty(&ctx->iopoll_list))
3c30ef0f 6859 io_do_iopoll(ctx, &nr_events, 0, true);
906a3c6f 6860
3b763ba1
PB
6861 /*
 6862		 * Don't submit if refs are dying: that's good for io_uring_register(),
 6863		 * but io_ring_exit_work() also relies on it
6864 */
0298ef96
PB
6865 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
6866 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 6867 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 6868 mutex_unlock(&ctx->uring_lock);
6c271ce2 6869
acfb381d
PB
6870 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
6871 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
6872 if (creds)
6873 revert_creds(creds);
acfb381d 6874 }
6c271ce2 6875
08369246
XW
6876 return ret;
6877}
6c271ce2 6878
08369246
XW
6879static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6880{
6881 struct io_ring_ctx *ctx;
6882 unsigned sq_thread_idle = 0;
6c271ce2 6883
c9dca27d
PB
6884 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6885 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 6886 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 6887}
6c271ce2 6888
e4b6d902
PB
6889static bool io_sqd_handle_event(struct io_sq_data *sqd)
6890{
6891 bool did_sig = false;
6892 struct ksignal ksig;
6893
6894 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
6895 signal_pending(current)) {
6896 mutex_unlock(&sqd->lock);
6897 if (signal_pending(current))
6898 did_sig = get_signal(&ksig);
6899 cond_resched();
6900 mutex_lock(&sqd->lock);
6901 }
e4b6d902
PB
6902 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
6903}
6904
c8d1ba58
JA
6905static int io_sq_thread(void *data)
6906{
69fb2131
JA
6907 struct io_sq_data *sqd = data;
6908 struct io_ring_ctx *ctx;
a0d9205f 6909 unsigned long timeout = 0;
37d1e2e3 6910 char buf[TASK_COMM_LEN];
08369246 6911 DEFINE_WAIT(wait);
6c271ce2 6912
696ee88a 6913 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 6914 set_task_comm(current, buf);
37d1e2e3
JA
6915
6916 if (sqd->sq_cpu != -1)
6917 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6918 else
6919 set_cpus_allowed_ptr(current, cpu_online_mask);
6920 current->flags |= PF_NO_SETAFFINITY;
6921
09a6f4ef 6922 mutex_lock(&sqd->lock);
e4b6d902 6923 while (1) {
1a924a80 6924 bool cap_entries, sqt_spin = false;
c1edbf5f 6925
e4b6d902
PB
6926 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
6927 if (io_sqd_handle_event(sqd))
c7d95613 6928 break;
08369246
XW
6929 timeout = jiffies + sqd->sq_thread_idle;
6930 }
e4b6d902 6931
e95eee2d 6932 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 6933 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 6934 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 6935
08369246
XW
6936 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6937 sqt_spin = true;
69fb2131 6938 }
dd432ea5
PB
6939 if (io_run_task_work())
6940 sqt_spin = true;
6c271ce2 6941
08369246 6942 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 6943 cond_resched();
08369246
XW
6944 if (sqt_spin)
6945 timeout = jiffies + sqd->sq_thread_idle;
6946 continue;
6947 }
6948
08369246 6949 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
dd432ea5 6950 if (!io_sqd_events_pending(sqd) && !current->task_works) {
1a924a80
PB
6951 bool needs_sched = true;
6952
724cb4f9 6953 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
aaa9f0f4
PB
6954 io_ring_set_wakeup_flag(ctx);
6955
724cb4f9
HX
6956 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6957 !list_empty_careful(&ctx->iopoll_list)) {
6958 needs_sched = false;
6959 break;
6960 }
6961 if (io_sqring_entries(ctx)) {
6962 needs_sched = false;
6963 break;
6964 }
6965 }
6966
6967 if (needs_sched) {
6968 mutex_unlock(&sqd->lock);
6969 schedule();
6970 mutex_lock(&sqd->lock);
6971 }
69fb2131
JA
6972 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6973 io_ring_clear_wakeup_flag(ctx);
6c271ce2 6974 }
08369246
XW
6975
6976 finish_wait(&sqd->wait, &wait);
6977 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 6978 }
28cea78a 6979
78cc687b 6980 io_uring_cancel_generic(true, sqd);
37d1e2e3 6981 sqd->thread = NULL;
05962f95 6982 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 6983 io_ring_set_wakeup_flag(ctx);
521d6a73 6984 io_run_task_work();
734551df
PB
6985 mutex_unlock(&sqd->lock);
6986
37d1e2e3
JA
6987 complete(&sqd->exited);
6988 do_exit(0);
6c271ce2
JA
6989}
6990
bda52162
JA
6991struct io_wait_queue {
6992 struct wait_queue_entry wq;
6993 struct io_ring_ctx *ctx;
5fd46178 6994 unsigned cq_tail;
bda52162
JA
6995 unsigned nr_timeouts;
6996};
6997
6c503150 6998static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
6999{
7000 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 7001 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
7002
7003 /*
d195a66e 7004 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
7005 * started waiting. For timeouts, we always want to return to userspace,
7006 * regardless of event count.
7007 */
5fd46178 7008 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
7009}
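/*
 * Illustrative sketch, not part of this file: the signed difference computed
 * above stays correct across unsigned wraparound of the CQ tail, so a tail
 * that has wrapped past the 32-bit boundary still counts as "enough events".
 * events_reached() is a hypothetical standalone version of that comparison.
 */
#include <assert.h>
#include <limits.h>

static int events_reached(unsigned cached_tail, unsigned target_tail)
{
	return (int)(cached_tail - target_tail) >= 0;
}

static void events_reached_example(void)
{
	/* tail wrapped from UINT_MAX - 2 to 5: eight events past the target */
	assert(events_reached(5u, UINT_MAX - 2u));
	/* still three events short of the target */
	assert(!events_reached(2u, 5u));
}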
7010
7011static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7012 int wake_flags, void *key)
7013{
7014 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7015 wq);
7016
6c503150
PB
7017 /*
 7018	 * Cannot safely flush overflowed CQEs from here, so ensure we wake up
 7019	 * the task; the next invocation will do the flush.
7020 */
5ed7a37d 7021 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
6c503150
PB
7022 return autoremove_wake_function(curr, mode, wake_flags, key);
7023 return -1;
bda52162
JA
7024}
7025
af9c1a44
JA
7026static int io_run_task_work_sig(void)
7027{
7028 if (io_run_task_work())
7029 return 1;
7030 if (!signal_pending(current))
7031 return 0;
0b8cfa97 7032 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 7033 return -ERESTARTSYS;
af9c1a44
JA
7034 return -EINTR;
7035}
7036
eeb60b9a
PB
 7037/* when this returns >0, the caller should retry */
7038static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7039 struct io_wait_queue *iowq,
7040 signed long *timeout)
7041{
7042 int ret;
7043
7044 /* make sure we run task_work before checking for signals */
7045 ret = io_run_task_work_sig();
7046 if (ret || io_should_wake(iowq))
7047 return ret;
7048 /* let the caller flush overflows, retry */
5ed7a37d 7049 if (test_bit(0, &ctx->check_cq_overflow))
eeb60b9a
PB
7050 return 1;
7051
7052 *timeout = schedule_timeout(*timeout);
7053 return !*timeout ? -ETIME : 1;
7054}
7055
2b188cc1
JA
7056/*
7057 * Wait until events become available, if we don't already have some. The
7058 * application must reap them itself, as they reside on the shared cq ring.
7059 */
7060static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
7061 const sigset_t __user *sig, size_t sigsz,
7062 struct __kernel_timespec __user *uts)
2b188cc1 7063{
90291099 7064 struct io_wait_queue iowq;
75b28aff 7065 struct io_rings *rings = ctx->rings;
c1d5a224
PB
7066 signed long timeout = MAX_SCHEDULE_TIMEOUT;
7067 int ret;
2b188cc1 7068
b41e9852 7069 do {
6c2450ae 7070 io_cqring_overflow_flush(ctx, false);
6c503150 7071 if (io_cqring_events(ctx) >= min_events)
b41e9852 7072 return 0;
4c6e277c 7073 if (!io_run_task_work())
b41e9852 7074 break;
b41e9852 7075 } while (1);
2b188cc1
JA
7076
7077 if (sig) {
9e75ad5d
AB
7078#ifdef CONFIG_COMPAT
7079 if (in_compat_syscall())
7080 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 7081 sigsz);
9e75ad5d
AB
7082 else
7083#endif
b772434b 7084 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 7085
2b188cc1
JA
7086 if (ret)
7087 return ret;
7088 }
7089
c73ebb68 7090 if (uts) {
c1d5a224
PB
7091 struct timespec64 ts;
7092
c73ebb68
HX
7093 if (get_timespec64(&ts, uts))
7094 return -EFAULT;
7095 timeout = timespec64_to_jiffies(&ts);
7096 }
7097
90291099
PB
7098 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7099 iowq.wq.private = current;
7100 INIT_LIST_HEAD(&iowq.wq.entry);
7101 iowq.ctx = ctx;
bda52162 7102 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 7103 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 7104
c826bd7a 7105 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 7106 do {
ca0a2651 7107 /* if we can't even flush overflow, don't wait for more */
6c2450ae 7108 if (!io_cqring_overflow_flush(ctx, false)) {
ca0a2651
JA
7109 ret = -EBUSY;
7110 break;
7111 }
311997b3 7112 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 7113 TASK_INTERRUPTIBLE);
eeb60b9a 7114 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
311997b3 7115 finish_wait(&ctx->cq_wait, &iowq.wq);
ca0a2651 7116 cond_resched();
eeb60b9a 7117 } while (ret > 0);
bda52162 7118
b7db41c9 7119 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 7120
75b28aff 7121 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
7122}
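/*
 * Illustrative userspace sketch, not part of this file: this wait path is
 * normally reached through liburing's io_uring_wait_cqe(), which calls
 * io_uring_enter(2) with IORING_ENTER_GETEVENTS when no completion is ready.
 * wait_one_cqe() is a hypothetical helper name; the liburing calls are real.
 */
#include <liburing.h>
#include <stdio.h>

static int wait_one_cqe(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	int ret = io_uring_wait_cqe(ring, &cqe);

	if (ret < 0)
		return ret;		/* e.g. -EINTR if interrupted by a signal */

	printf("cqe: user_data=%llu res=%d\n",
	       (unsigned long long)cqe->user_data, cqe->res);
	io_uring_cqe_seen(ring, cqe);	/* advance the CQ head for the kernel */
	return 0;
}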
7123
9123c8ff 7124static void io_free_page_table(void **table, size_t size)
05f3fb3c 7125{
9123c8ff 7126 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 7127
846a4ef2 7128 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
7129 kfree(table[i]);
7130 kfree(table);
7131}
7132
7133static void **io_alloc_page_table(size_t size)
7134{
7135 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7136 size_t init_size = size;
7137 void **table;
7138
7139 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
7140 if (!table)
7141 return NULL;
7142
7143 for (i = 0; i < nr_tables; i++) {
27f6b318 7144 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff
PB
7145
7146 table[i] = kzalloc(this_size, GFP_KERNEL);
7147 if (!table[i]) {
7148 io_free_page_table(table, init_size);
7149 return NULL;
7150 }
7151 size -= this_size;
7152 }
7153 return table;
05f3fb3c
JA
7154}
7155
28a9fe25 7156static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 7157{
28a9fe25
PB
7158 percpu_ref_exit(&ref_node->refs);
7159 kfree(ref_node);
1642b445
PB
7160}
7161
b9bd2bea
PB
7162static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7163{
7164 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7165 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7166 unsigned long flags;
7167 bool first_add = false;
7168
7169 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7170 node->done = true;
7171
7172 while (!list_empty(&ctx->rsrc_ref_list)) {
7173 node = list_first_entry(&ctx->rsrc_ref_list,
7174 struct io_rsrc_node, node);
7175 /* recycle ref nodes in order */
7176 if (!node->done)
7177 break;
7178 list_del(&node->node);
7179 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7180 }
7181 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7182
7183 if (first_add)
7184 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7185}
7186
7187static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7188{
7189 struct io_rsrc_node *ref_node;
7190
7191 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7192 if (!ref_node)
7193 return NULL;
7194
7195 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7196 0, GFP_KERNEL)) {
7197 kfree(ref_node);
7198 return NULL;
7199 }
7200 INIT_LIST_HEAD(&ref_node->node);
7201 INIT_LIST_HEAD(&ref_node->rsrc_list);
7202 ref_node->done = false;
7203 return ref_node;
7204}
7205
a7f0ed5a
PB
7206static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7207 struct io_rsrc_data *data_to_kill)
6b06314c 7208{
a7f0ed5a
PB
7209 WARN_ON_ONCE(!ctx->rsrc_backup_node);
7210 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 7211
a7f0ed5a
PB
7212 if (data_to_kill) {
7213 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 7214
a7f0ed5a 7215 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 7216 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 7217 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 7218 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 7219
3e942498 7220 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
7221 percpu_ref_kill(&rsrc_node->refs);
7222 ctx->rsrc_node = NULL;
7223 }
6b06314c 7224
a7f0ed5a
PB
7225 if (!ctx->rsrc_node) {
7226 ctx->rsrc_node = ctx->rsrc_backup_node;
7227 ctx->rsrc_backup_node = NULL;
7228 }
8bad28d8
HX
7229}
7230
a7f0ed5a 7231static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
7232{
7233 if (ctx->rsrc_backup_node)
7234 return 0;
b895c9a6 7235 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
8dd03afe 7236 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
7237}
7238
40ae0ff7 7239static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
8bad28d8
HX
7240{
7241 int ret;
05589553 7242
215c3902 7243	/* As we may drop ->uring_lock, another task may have started a quiesce */
8bad28d8
HX
7244 if (data->quiesce)
7245 return -ENXIO;
05589553 7246
8bad28d8 7247 data->quiesce = true;
1ffc5422 7248 do {
a7f0ed5a 7249 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 7250 if (ret)
f2303b1f 7251 break;
a7f0ed5a 7252 io_rsrc_node_switch(ctx, data);
f2303b1f 7253
3e942498
PB
7254 /* kill initial ref, already quiesced if zero */
7255 if (atomic_dec_and_test(&data->refs))
7256 break;
c018db4a 7257 mutex_unlock(&ctx->uring_lock);
8bad28d8 7258 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 7259 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
7260 if (!ret) {
7261 mutex_lock(&ctx->uring_lock);
1ffc5422 7262 break;
c018db4a 7263 }
8bad28d8 7264
3e942498
PB
7265 atomic_inc(&data->refs);
 7266		/* wait for all work items potentially completing data->done */
7267 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 7268 reinit_completion(&data->done);
8dd03afe 7269
1ffc5422 7270 ret = io_run_task_work_sig();
8bad28d8 7271 mutex_lock(&ctx->uring_lock);
f2303b1f 7272 } while (ret >= 0);
8bad28d8 7273 data->quiesce = false;
05f3fb3c 7274
8bad28d8 7275 return ret;
d7954b2b
BM
7276}
7277
2d091d62
PB
7278static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7279{
7280 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7281 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7282
7283 return &data->tags[table_idx][off];
7284}
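/*
 * Illustrative sketch, not part of this file: the tag array allocated by
 * io_alloc_page_table() is a two-level table of page-sized chunks, so a flat
 * index splits into a chunk index and an offset as above. ENTRIES_PER_CHUNK
 * is a hypothetical stand-in for the value implied by IO_RSRC_TAG_TABLE_SHIFT
 * and IO_RSRC_TAG_TABLE_MASK defined elsewhere in this file.
 */
#include <stdio.h>

#define ENTRIES_PER_CHUNK 512	/* e.g. one 4096-byte page of 8-byte tags */

static void show_tag_slot(unsigned int idx)
{
	unsigned int chunk = idx / ENTRIES_PER_CHUNK;
	unsigned int off = idx % ENTRIES_PER_CHUNK;

	printf("tag[%u] lives in chunk %u at offset %u\n", idx, chunk, off);
}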
7285
44b31f2f 7286static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 7287{
2d091d62
PB
7288 size_t size = data->nr * sizeof(data->tags[0][0]);
7289
7290 if (data->tags)
7291 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
7292 kfree(data);
7293}
7294
d878c816
PB
7295static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7296 u64 __user *utags, unsigned nr,
7297 struct io_rsrc_data **pdata)
1ad555c6 7298{
b895c9a6 7299 struct io_rsrc_data *data;
2d091d62 7300 int ret = -ENOMEM;
d878c816 7301 unsigned i;
1ad555c6
BM
7302
7303 data = kzalloc(sizeof(*data), GFP_KERNEL);
7304 if (!data)
d878c816 7305 return -ENOMEM;
2d091d62 7306 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 7307 if (!data->tags) {
1ad555c6 7308 kfree(data);
d878c816
PB
7309 return -ENOMEM;
7310 }
2d091d62
PB
7311
7312 data->nr = nr;
7313 data->ctx = ctx;
7314 data->do_put = do_put;
d878c816 7315 if (utags) {
2d091d62 7316 ret = -EFAULT;
d878c816 7317 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
7318 u64 *tag_slot = io_get_tag_slot(data, i);
7319
7320 if (copy_from_user(tag_slot, &utags[i],
7321 sizeof(*tag_slot)))
2d091d62 7322 goto fail;
d878c816 7323 }
1ad555c6 7324 }
b60c8dce 7325
3e942498 7326 atomic_set(&data->refs, 1);
1ad555c6 7327 init_completion(&data->done);
d878c816
PB
7328 *pdata = data;
7329 return 0;
2d091d62
PB
7330fail:
7331 io_rsrc_data_free(data);
7332 return ret;
1ad555c6
BM
7333}
7334
9123c8ff
PB
7335static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7336{
042b0d85 7337 table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
9123c8ff
PB
7338 return !!table->files;
7339}
7340
042b0d85 7341static void io_free_file_tables(struct io_file_table *table)
9123c8ff 7342{
042b0d85 7343 kvfree(table->files);
9123c8ff
PB
7344 table->files = NULL;
7345}
7346
fff4db76 7347static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 7348{
fff4db76
PB
7349#if defined(CONFIG_UNIX)
7350 if (ctx->ring_sock) {
7351 struct sock *sock = ctx->ring_sock->sk;
7352 struct sk_buff *skb;
7353
7354 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7355 kfree_skb(skb);
7356 }
7357#else
7358 int i;
7359
7360 for (i = 0; i < ctx->nr_user_files; i++) {
7361 struct file *file;
7362
7363 file = io_file_from_index(ctx, i);
7364 if (file)
7365 fput(file);
7366 }
7367#endif
042b0d85 7368 io_free_file_tables(&ctx->file_table);
44b31f2f 7369 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
7370 ctx->file_data = NULL;
7371 ctx->nr_user_files = 0;
1ad555c6
BM
7372}
7373
d7954b2b
BM
7374static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7375{
d7954b2b
BM
7376 int ret;
7377
08480400 7378 if (!ctx->file_data)
d7954b2b 7379 return -ENXIO;
08480400
PB
7380 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7381 if (!ret)
7382 __io_sqe_files_unregister(ctx);
7383 return ret;
6b06314c
JA
7384}
7385
37d1e2e3 7386static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 7387 __releases(&sqd->lock)
37d1e2e3 7388{
521d6a73
PB
7389 WARN_ON_ONCE(sqd->thread == current);
7390
9e138a48
PB
7391 /*
 7392	 * Do the dance, but don't use a conditional clear_bit(), because it'd race with
7393 * other threads incrementing park_pending and setting the bit.
7394 */
37d1e2e3 7395 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
7396 if (atomic_dec_return(&sqd->park_pending))
7397 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7398 mutex_unlock(&sqd->lock);
37d1e2e3
JA
7399}
7400
86e0d676 7401static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 7402 __acquires(&sqd->lock)
37d1e2e3 7403{
521d6a73
PB
7404 WARN_ON_ONCE(sqd->thread == current);
7405
9e138a48 7406 atomic_inc(&sqd->park_pending);
86e0d676 7407 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7408 mutex_lock(&sqd->lock);
05962f95 7409 if (sqd->thread)
86e0d676 7410 wake_up_process(sqd->thread);
37d1e2e3
JA
7411}
7412
7413static void io_sq_thread_stop(struct io_sq_data *sqd)
7414{
521d6a73 7415 WARN_ON_ONCE(sqd->thread == current);
88885f66 7416 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 7417
05962f95 7418 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 7419 mutex_lock(&sqd->lock);
e8f98f24
JA
7420 if (sqd->thread)
7421 wake_up_process(sqd->thread);
09a6f4ef 7422 mutex_unlock(&sqd->lock);
05962f95 7423 wait_for_completion(&sqd->exited);
37d1e2e3
JA
7424}
7425
534ca6d6 7426static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 7427{
534ca6d6 7428 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
7429 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7430
37d1e2e3
JA
7431 io_sq_thread_stop(sqd);
7432 kfree(sqd);
7433 }
7434}
7435
7436static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7437{
7438 struct io_sq_data *sqd = ctx->sq_data;
7439
7440 if (sqd) {
05962f95 7441 io_sq_thread_park(sqd);
521d6a73 7442 list_del_init(&ctx->sqd_list);
37d1e2e3 7443 io_sqd_update_thread_idle(sqd);
05962f95 7444 io_sq_thread_unpark(sqd);
37d1e2e3
JA
7445
7446 io_put_sq_data(sqd);
7447 ctx->sq_data = NULL;
534ca6d6
JA
7448 }
7449}
7450
aa06165d
JA
7451static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7452{
7453 struct io_ring_ctx *ctx_attach;
7454 struct io_sq_data *sqd;
7455 struct fd f;
7456
7457 f = fdget(p->wq_fd);
7458 if (!f.file)
7459 return ERR_PTR(-ENXIO);
7460 if (f.file->f_op != &io_uring_fops) {
7461 fdput(f);
7462 return ERR_PTR(-EINVAL);
7463 }
7464
7465 ctx_attach = f.file->private_data;
7466 sqd = ctx_attach->sq_data;
7467 if (!sqd) {
7468 fdput(f);
7469 return ERR_PTR(-EINVAL);
7470 }
5c2469e0
JA
7471 if (sqd->task_tgid != current->tgid) {
7472 fdput(f);
7473 return ERR_PTR(-EPERM);
7474 }
aa06165d
JA
7475
7476 refcount_inc(&sqd->refs);
7477 fdput(f);
7478 return sqd;
7479}
7480
26984fbf
PB
7481static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7482 bool *attached)
534ca6d6
JA
7483{
7484 struct io_sq_data *sqd;
7485
26984fbf 7486 *attached = false;
5c2469e0
JA
7487 if (p->flags & IORING_SETUP_ATTACH_WQ) {
7488 sqd = io_attach_sq_data(p);
26984fbf
PB
7489 if (!IS_ERR(sqd)) {
7490 *attached = true;
5c2469e0 7491 return sqd;
26984fbf 7492 }
5c2469e0
JA
7493 /* fall through for EPERM case, setup new sqd/task */
7494 if (PTR_ERR(sqd) != -EPERM)
7495 return sqd;
7496 }
aa06165d 7497
534ca6d6
JA
7498 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7499 if (!sqd)
7500 return ERR_PTR(-ENOMEM);
7501
9e138a48 7502 atomic_set(&sqd->park_pending, 0);
534ca6d6 7503 refcount_set(&sqd->refs, 1);
69fb2131 7504 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 7505 mutex_init(&sqd->lock);
534ca6d6 7506 init_waitqueue_head(&sqd->wait);
37d1e2e3 7507 init_completion(&sqd->exited);
534ca6d6
JA
7508 return sqd;
7509}
7510
6b06314c 7511#if defined(CONFIG_UNIX)
6b06314c
JA
7512/*
7513 * Ensure the UNIX gc is aware of our file set, so we are certain that
7514 * the io_uring can be safely unregistered on process exit, even if we have
 7515 * loops in the file references.
7516 */
7517static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7518{
7519 struct sock *sk = ctx->ring_sock->sk;
7520 struct scm_fp_list *fpl;
7521 struct sk_buff *skb;
08a45173 7522 int i, nr_files;
6b06314c 7523
6b06314c
JA
7524 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7525 if (!fpl)
7526 return -ENOMEM;
7527
7528 skb = alloc_skb(0, GFP_KERNEL);
7529 if (!skb) {
7530 kfree(fpl);
7531 return -ENOMEM;
7532 }
7533
7534 skb->sk = sk;
6b06314c 7535
08a45173 7536 nr_files = 0;
62e398be 7537 fpl->user = get_uid(current_user());
6b06314c 7538 for (i = 0; i < nr; i++) {
65e19f54
JA
7539 struct file *file = io_file_from_index(ctx, i + offset);
7540
7541 if (!file)
08a45173 7542 continue;
65e19f54 7543 fpl->fp[nr_files] = get_file(file);
08a45173
JA
7544 unix_inflight(fpl->user, fpl->fp[nr_files]);
7545 nr_files++;
6b06314c
JA
7546 }
7547
08a45173
JA
7548 if (nr_files) {
7549 fpl->max = SCM_MAX_FD;
7550 fpl->count = nr_files;
7551 UNIXCB(skb).fp = fpl;
05f3fb3c 7552 skb->destructor = unix_destruct_scm;
08a45173
JA
7553 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7554 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 7555
08a45173
JA
7556 for (i = 0; i < nr_files; i++)
7557 fput(fpl->fp[i]);
7558 } else {
7559 kfree_skb(skb);
7560 kfree(fpl);
7561 }
6b06314c
JA
7562
7563 return 0;
7564}
7565
7566/*
7567 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7568 * causes regular reference counting to break down. We rely on the UNIX
7569 * garbage collection to take care of this problem for us.
7570 */
7571static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7572{
7573 unsigned left, total;
7574 int ret = 0;
7575
7576 total = 0;
7577 left = ctx->nr_user_files;
7578 while (left) {
7579 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
7580
7581 ret = __io_sqe_files_scm(ctx, this_files, total);
7582 if (ret)
7583 break;
7584 left -= this_files;
7585 total += this_files;
7586 }
7587
7588 if (!ret)
7589 return 0;
7590
7591 while (total < ctx->nr_user_files) {
65e19f54
JA
7592 struct file *file = io_file_from_index(ctx, total);
7593
7594 if (file)
7595 fput(file);
6b06314c
JA
7596 total++;
7597 }
7598
7599 return ret;
7600}
7601#else
7602static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7603{
7604 return 0;
7605}
7606#endif
7607
47e90392 7608static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 7609{
50238531 7610 struct file *file = prsrc->file;
05f3fb3c
JA
7611#if defined(CONFIG_UNIX)
7612 struct sock *sock = ctx->ring_sock->sk;
7613 struct sk_buff_head list, *head = &sock->sk_receive_queue;
7614 struct sk_buff *skb;
7615 int i;
7616
7617 __skb_queue_head_init(&list);
7618
7619 /*
7620 * Find the skb that holds this file in its SCM_RIGHTS. When found,
7621 * remove this entry and rearrange the file array.
7622 */
7623 skb = skb_dequeue(head);
7624 while (skb) {
7625 struct scm_fp_list *fp;
7626
7627 fp = UNIXCB(skb).fp;
7628 for (i = 0; i < fp->count; i++) {
7629 int left;
7630
7631 if (fp->fp[i] != file)
7632 continue;
7633
7634 unix_notinflight(fp->user, fp->fp[i]);
7635 left = fp->count - 1 - i;
7636 if (left) {
7637 memmove(&fp->fp[i], &fp->fp[i + 1],
7638 left * sizeof(struct file *));
7639 }
7640 fp->count--;
7641 if (!fp->count) {
7642 kfree_skb(skb);
7643 skb = NULL;
7644 } else {
7645 __skb_queue_tail(&list, skb);
7646 }
7647 fput(file);
7648 file = NULL;
7649 break;
7650 }
7651
7652 if (!file)
7653 break;
7654
7655 __skb_queue_tail(&list, skb);
7656
7657 skb = skb_dequeue(head);
7658 }
7659
7660 if (skb_peek(&list)) {
7661 spin_lock_irq(&head->lock);
7662 while ((skb = __skb_dequeue(&list)) != NULL)
7663 __skb_queue_tail(head, skb);
7664 spin_unlock_irq(&head->lock);
7665 }
7666#else
7667 fput(file);
7668#endif
7669}
7670
b895c9a6 7671static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 7672{
b895c9a6 7673 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
7674 struct io_ring_ctx *ctx = rsrc_data->ctx;
7675 struct io_rsrc_put *prsrc, *tmp;
05589553 7676
269bbe5f
BM
7677 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7678 list_del(&prsrc->list);
b60c8dce
PB
7679
7680 if (prsrc->tag) {
7681 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
b60c8dce
PB
7682
7683 io_ring_submit_lock(ctx, lock_ring);
157d257f 7684 spin_lock_irq(&ctx->completion_lock);
b60c8dce 7685 io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
2840f710 7686 ctx->cq_extra++;
b60c8dce 7687 io_commit_cqring(ctx);
157d257f 7688 spin_unlock_irq(&ctx->completion_lock);
b60c8dce
PB
7689 io_cqring_ev_posted(ctx);
7690 io_ring_submit_unlock(ctx, lock_ring);
7691 }
7692
40ae0ff7 7693 rsrc_data->do_put(ctx, prsrc);
269bbe5f 7694 kfree(prsrc);
65e19f54 7695 }
05589553 7696
28a9fe25 7697 io_rsrc_node_destroy(ref_node);
3e942498
PB
7698 if (atomic_dec_and_test(&rsrc_data->refs))
7699 complete(&rsrc_data->done);
2faf852d 7700}
65e19f54 7701
269bbe5f 7702static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
7703{
7704 struct io_ring_ctx *ctx;
7705 struct llist_node *node;
7706
269bbe5f
BM
7707 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7708 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
7709
7710 while (node) {
b895c9a6 7711 struct io_rsrc_node *ref_node;
4a38aed2
JA
7712 struct llist_node *next = node->next;
7713
b895c9a6 7714 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 7715 __io_rsrc_put_work(ref_node);
4a38aed2
JA
7716 node = next;
7717 }
7718}
7719
6b06314c 7720static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 7721 unsigned nr_args, u64 __user *tags)
6b06314c
JA
7722{
7723 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 7724 struct file *file;
f3baed39 7725 int fd, ret;
846a4ef2 7726 unsigned i;
6b06314c 7727
05f3fb3c 7728 if (ctx->file_data)
6b06314c
JA
7729 return -EBUSY;
7730 if (!nr_args)
7731 return -EINVAL;
7732 if (nr_args > IORING_MAX_FIXED_FILES)
7733 return -EMFILE;
a7f0ed5a 7734 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
7735 if (ret)
7736 return ret;
d878c816
PB
7737 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
7738 &ctx->file_data);
7739 if (ret)
7740 return ret;
6b06314c 7741
f3baed39 7742 ret = -ENOMEM;
aeca241b 7743 if (!io_alloc_file_tables(&ctx->file_table, nr_args))
1ad555c6 7744 goto out_free;
65e19f54 7745
08a45173 7746 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
d878c816 7747 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8
PB
7748 ret = -EFAULT;
7749 goto out_fput;
7750 }
08a45173 7751 /* allow sparse sets */
792e3582
PB
7752 if (fd == -1) {
7753 ret = -EINVAL;
2d091d62 7754 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
792e3582 7755 goto out_fput;
08a45173 7756 continue;
792e3582 7757 }
6b06314c 7758
05f3fb3c 7759 file = fget(fd);
6b06314c 7760 ret = -EBADF;
792e3582 7761 if (unlikely(!file))
600cf3f8 7762 goto out_fput;
05f3fb3c 7763
6b06314c
JA
7764 /*
7765 * Don't allow io_uring instances to be registered. If UNIX
7766 * isn't enabled, then this causes a reference cycle and this
7767 * instance can never get freed. If UNIX is enabled we'll
7768 * handle it just fine, but there's still no point in allowing
7769 * a ring fd as it doesn't support regular read/write anyway.
7770 */
05f3fb3c
JA
7771 if (file->f_op == &io_uring_fops) {
7772 fput(file);
600cf3f8 7773 goto out_fput;
6b06314c 7774 }
aeca241b 7775 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
6b06314c
JA
7776 }
7777
6b06314c 7778 ret = io_sqe_files_scm(ctx);
05589553 7779 if (ret) {
08480400 7780 __io_sqe_files_unregister(ctx);
05589553
XW
7781 return ret;
7782 }
6b06314c 7783
a7f0ed5a 7784 io_rsrc_node_switch(ctx, NULL);
6b06314c 7785 return ret;
600cf3f8
PB
7786out_fput:
7787 for (i = 0; i < ctx->nr_user_files; i++) {
7788 file = io_file_from_index(ctx, i);
7789 if (file)
7790 fput(file);
7791 }
042b0d85 7792 io_free_file_tables(&ctx->file_table);
600cf3f8 7793 ctx->nr_user_files = 0;
600cf3f8 7794out_free:
44b31f2f 7795 io_rsrc_data_free(ctx->file_data);
55cbc256 7796 ctx->file_data = NULL;
6b06314c
JA
7797 return ret;
7798}
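/*
 * Illustrative userspace sketch, not part of this file: the fixed-file
 * registration implemented above, as seen from the application. Once
 * registered, requests can reference these files by index with
 * IOSQE_FIXED_FILE instead of passing an fd each time. register_two_files()
 * is a hypothetical helper; io_uring_register_files() is the real liburing call.
 */
#include <liburing.h>

static int register_two_files(struct io_uring *ring, int fd_a, int fd_b)
{
	int fds[2] = { fd_a, fd_b };

	/* ends up in io_uring_register(2) with IORING_REGISTER_FILES */
	return io_uring_register_files(ring, fds, 2);
}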
7799
c3a31e60
JA
7800static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
7801 int index)
7802{
7803#if defined(CONFIG_UNIX)
7804 struct sock *sock = ctx->ring_sock->sk;
7805 struct sk_buff_head *head = &sock->sk_receive_queue;
7806 struct sk_buff *skb;
7807
7808 /*
7809 * See if we can merge this file into an existing skb SCM_RIGHTS
7810 * file set. If there's no room, fall back to allocating a new skb
7811 * and filling it in.
7812 */
7813 spin_lock_irq(&head->lock);
7814 skb = skb_peek(head);
7815 if (skb) {
7816 struct scm_fp_list *fpl = UNIXCB(skb).fp;
7817
7818 if (fpl->count < SCM_MAX_FD) {
7819 __skb_unlink(skb, head);
7820 spin_unlock_irq(&head->lock);
7821 fpl->fp[fpl->count] = get_file(file);
7822 unix_inflight(fpl->user, fpl->fp[fpl->count]);
7823 fpl->count++;
7824 spin_lock_irq(&head->lock);
7825 __skb_queue_head(head, skb);
7826 } else {
7827 skb = NULL;
7828 }
7829 }
7830 spin_unlock_irq(&head->lock);
7831
7832 if (skb) {
7833 fput(file);
7834 return 0;
7835 }
7836
7837 return __io_sqe_files_scm(ctx, 1, index);
7838#else
7839 return 0;
7840#endif
7841}
7842
b60c8dce 7843static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
e7c78371 7844 struct io_rsrc_node *node, void *rsrc)
05f3fb3c 7845{
269bbe5f 7846 struct io_rsrc_put *prsrc;
05f3fb3c 7847
269bbe5f
BM
7848 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7849 if (!prsrc)
a5318d3c 7850 return -ENOMEM;
05f3fb3c 7851
2d091d62 7852 prsrc->tag = *io_get_tag_slot(data, idx);
50238531 7853 prsrc->rsrc = rsrc;
e7c78371 7854 list_add(&prsrc->list, &node->rsrc_list);
a5318d3c 7855 return 0;
05f3fb3c
JA
7856}
7857
7858static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 7859 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
7860 unsigned nr_args)
7861{
c3bdad02 7862 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 7863 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 7864 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
7865 struct io_fixed_file *file_slot;
7866 struct file *file;
98f0b3b4
PB
7867 int fd, i, err = 0;
7868 unsigned int done;
05589553 7869 bool needs_switch = false;
c3a31e60 7870
98f0b3b4
PB
7871 if (!ctx->file_data)
7872 return -ENXIO;
7873 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
7874 return -EINVAL;
7875
67973b93 7876 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
7877 u64 tag = 0;
7878
7879 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
7880 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
7881 err = -EFAULT;
7882 break;
7883 }
c3bdad02
PB
7884 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
7885 err = -EINVAL;
7886 break;
7887 }
4e0377a1 7888 if (fd == IORING_REGISTER_FILES_SKIP)
7889 continue;
7890
67973b93 7891 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 7892 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 7893
a04b0ac0
PB
7894 if (file_slot->file_ptr) {
7895 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
b60c8dce
PB
7896 err = io_queue_rsrc_removal(data, up->offset + done,
7897 ctx->rsrc_node, file);
a5318d3c
HD
7898 if (err)
7899 break;
a04b0ac0 7900 file_slot->file_ptr = 0;
05589553 7901 needs_switch = true;
c3a31e60
JA
7902 }
7903 if (fd != -1) {
c3a31e60
JA
7904 file = fget(fd);
7905 if (!file) {
7906 err = -EBADF;
7907 break;
7908 }
7909 /*
7910 * Don't allow io_uring instances to be registered. If
7911 * UNIX isn't enabled, then this causes a reference
7912 * cycle and this instance can never get freed. If UNIX
7913 * is enabled we'll handle it just fine, but there's
7914 * still no point in allowing a ring fd as it doesn't
7915 * support regular read/write anyway.
7916 */
7917 if (file->f_op == &io_uring_fops) {
7918 fput(file);
7919 err = -EBADF;
7920 break;
7921 }
2d091d62 7922 *io_get_tag_slot(data, up->offset + done) = tag;
9a321c98 7923 io_fixed_file_set(file_slot, file);
c3a31e60 7924 err = io_sqe_file_register(ctx, file, i);
f3bd9dae 7925 if (err) {
a04b0ac0 7926 file_slot->file_ptr = 0;
f3bd9dae 7927 fput(file);
c3a31e60 7928 break;
f3bd9dae 7929 }
c3a31e60 7930 }
05f3fb3c
JA
7931 }
7932
a7f0ed5a
PB
7933 if (needs_switch)
7934 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
7935 return done ? done : err;
7936}
05589553 7937
685fe7fe
JA
7938static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
7939 struct task_struct *task)
24369c2e 7940{
e941894e 7941 struct io_wq_hash *hash;
24369c2e 7942 struct io_wq_data data;
24369c2e 7943 unsigned int concurrency;
24369c2e 7944
362a9e65 7945 mutex_lock(&ctx->uring_lock);
e941894e
JA
7946 hash = ctx->hash_map;
7947 if (!hash) {
7948 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
7949 if (!hash) {
7950 mutex_unlock(&ctx->uring_lock);
e941894e 7951 return ERR_PTR(-ENOMEM);
362a9e65 7952 }
e941894e
JA
7953 refcount_set(&hash->refs, 1);
7954 init_waitqueue_head(&hash->wait);
7955 ctx->hash_map = hash;
24369c2e 7956 }
362a9e65 7957 mutex_unlock(&ctx->uring_lock);
24369c2e 7958
e941894e 7959 data.hash = hash;
685fe7fe 7960 data.task = task;
ebc11b6c 7961 data.free_work = io_wq_free_work;
f5fa38c5 7962 data.do_work = io_wq_submit_work;
24369c2e 7963
d25e3a3d
JA
 7964	/* Do QD, or 4 * CPUS, whichever is smaller */
7965 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 7966
5aa75ed5 7967 return io_wq_create(concurrency, &data);
24369c2e
PB
7968}
7969
5aa75ed5
JA
7970static int io_uring_alloc_task_context(struct task_struct *task,
7971 struct io_ring_ctx *ctx)
0f212204
JA
7972{
7973 struct io_uring_task *tctx;
d8a6df10 7974 int ret;
0f212204 7975
09899b19 7976 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
7977 if (unlikely(!tctx))
7978 return -ENOMEM;
7979
d8a6df10
JA
7980 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7981 if (unlikely(ret)) {
7982 kfree(tctx);
7983 return ret;
7984 }
7985
685fe7fe 7986 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
7987 if (IS_ERR(tctx->io_wq)) {
7988 ret = PTR_ERR(tctx->io_wq);
7989 percpu_counter_destroy(&tctx->inflight);
7990 kfree(tctx);
7991 return ret;
7992 }
7993
0f212204
JA
7994 xa_init(&tctx->xa);
7995 init_waitqueue_head(&tctx->wait);
fdaf083c 7996 atomic_set(&tctx->in_idle, 0);
b303fe2e 7997 atomic_set(&tctx->inflight_tracked, 0);
0f212204 7998 task->io_uring = tctx;
7cbf1722
JA
7999 spin_lock_init(&tctx->task_lock);
8000 INIT_WQ_LIST(&tctx->task_list);
7cbf1722 8001 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
8002 return 0;
8003}
8004
8005void __io_uring_free(struct task_struct *tsk)
8006{
8007 struct io_uring_task *tctx = tsk->io_uring;
8008
8009 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 8010 WARN_ON_ONCE(tctx->io_wq);
09899b19 8011 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 8012
d8a6df10 8013 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
8014 kfree(tctx);
8015 tsk->io_uring = NULL;
8016}
8017
7e84e1c7
SG
8018static int io_sq_offload_create(struct io_ring_ctx *ctx,
8019 struct io_uring_params *p)
2b188cc1
JA
8020{
8021 int ret;
8022
d25e3a3d
JA
8023 /* Retain compatibility with failing for an invalid attach attempt */
8024 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8025 IORING_SETUP_ATTACH_WQ) {
8026 struct fd f;
8027
8028 f = fdget(p->wq_fd);
8029 if (!f.file)
8030 return -ENXIO;
0cc936f7
JA
8031 if (f.file->f_op != &io_uring_fops) {
8032 fdput(f);
f2a48dd0 8033 return -EINVAL;
0cc936f7
JA
8034 }
8035 fdput(f);
d25e3a3d 8036 }
6c271ce2 8037 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 8038 struct task_struct *tsk;
534ca6d6 8039 struct io_sq_data *sqd;
26984fbf 8040 bool attached;
534ca6d6 8041
26984fbf 8042 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
8043 if (IS_ERR(sqd)) {
8044 ret = PTR_ERR(sqd);
8045 goto err;
8046 }
69fb2131 8047
7c30f36a 8048 ctx->sq_creds = get_current_cred();
534ca6d6 8049 ctx->sq_data = sqd;
917257da
JA
8050 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8051 if (!ctx->sq_thread_idle)
8052 ctx->sq_thread_idle = HZ;
8053
78d7f6ba 8054 io_sq_thread_park(sqd);
de75a3d3
PB
8055 list_add(&ctx->sqd_list, &sqd->ctx_list);
8056 io_sqd_update_thread_idle(sqd);
26984fbf 8057 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 8058 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
8059 io_sq_thread_unpark(sqd);
8060
de75a3d3
PB
8061 if (ret < 0)
8062 goto err;
8063 if (attached)
5aa75ed5 8064 return 0;
aa06165d 8065
6c271ce2 8066 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 8067 int cpu = p->sq_thread_cpu;
6c271ce2 8068
917257da 8069 ret = -EINVAL;
f2a48dd0 8070 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 8071 goto err_sqpoll;
37d1e2e3 8072 sqd->sq_cpu = cpu;
6c271ce2 8073 } else {
37d1e2e3 8074 sqd->sq_cpu = -1;
6c271ce2 8075 }
37d1e2e3
JA
8076
8077 sqd->task_pid = current->pid;
5c2469e0 8078 sqd->task_tgid = current->tgid;
46fe18b1
JA
8079 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8080 if (IS_ERR(tsk)) {
8081 ret = PTR_ERR(tsk);
e8f98f24 8082 goto err_sqpoll;
6c271ce2 8083 }
97a73a0f 8084
46fe18b1 8085 sqd->thread = tsk;
97a73a0f 8086 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 8087 wake_up_new_task(tsk);
0f212204
JA
8088 if (ret)
8089 goto err;
6c271ce2
JA
8090 } else if (p->flags & IORING_SETUP_SQ_AFF) {
8091 /* Can't have SQ_AFF without SQPOLL */
8092 ret = -EINVAL;
8093 goto err;
8094 }
8095
2b188cc1 8096 return 0;
f2a48dd0
PB
8097err_sqpoll:
8098 complete(&ctx->sq_data->exited);
2b188cc1 8099err:
37d1e2e3 8100 io_sq_thread_finish(ctx);
2b188cc1
JA
8101 return ret;
8102}
8103
a087e2b5
BM
8104static inline void __io_unaccount_mem(struct user_struct *user,
8105 unsigned long nr_pages)
2b188cc1
JA
8106{
8107 atomic_long_sub(nr_pages, &user->locked_vm);
8108}
8109
a087e2b5
BM
8110static inline int __io_account_mem(struct user_struct *user,
8111 unsigned long nr_pages)
2b188cc1
JA
8112{
8113 unsigned long page_limit, cur_pages, new_pages;
8114
8115 /* Don't allow more pages than we can safely lock */
8116 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8117
8118 do {
8119 cur_pages = atomic_long_read(&user->locked_vm);
8120 new_pages = cur_pages + nr_pages;
8121 if (new_pages > page_limit)
8122 return -ENOMEM;
8123 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8124 new_pages) != cur_pages);
8125
8126 return 0;
8127}
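/*
 * Illustrative userspace sketch, not part of this file: the same
 * compare-and-swap pattern as above, charging pages against a limit without
 * overshooting even with concurrent callers. charge_pages() is a hypothetical
 * analogue written with C11 atomics.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool charge_pages(_Atomic unsigned long *locked, unsigned long nr,
			 unsigned long limit)
{
	unsigned long cur = atomic_load_explicit(locked, memory_order_relaxed);
	unsigned long new;

	do {
		new = cur + nr;
		if (new > limit)
			return false;	/* would exceed the limit, don't charge */
		/* on CAS failure, cur is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(locked, &cur, new));

	return true;
}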
8128
26bfa89e 8129static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8130{
62e398be 8131 if (ctx->user)
a087e2b5 8132 __io_unaccount_mem(ctx->user, nr_pages);
30975825 8133
26bfa89e
JA
8134 if (ctx->mm_account)
8135 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8136}
8137
26bfa89e 8138static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8139{
30975825
BM
8140 int ret;
8141
62e398be 8142 if (ctx->user) {
30975825
BM
8143 ret = __io_account_mem(ctx->user, nr_pages);
8144 if (ret)
8145 return ret;
8146 }
8147
26bfa89e
JA
8148 if (ctx->mm_account)
8149 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8150
8151 return 0;
8152}
8153
2b188cc1
JA
8154static void io_mem_free(void *ptr)
8155{
52e04ef4
MR
8156 struct page *page;
8157
8158 if (!ptr)
8159 return;
2b188cc1 8160
52e04ef4 8161 page = virt_to_head_page(ptr);
2b188cc1
JA
8162 if (put_page_testzero(page))
8163 free_compound_page(page);
8164}
8165
8166static void *io_mem_alloc(size_t size)
8167{
8168 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
26bfa89e 8169 __GFP_NORETRY | __GFP_ACCOUNT;
2b188cc1
JA
8170
8171 return (void *) __get_free_pages(gfp_flags, get_order(size));
8172}
8173
75b28aff
HV
8174static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8175 size_t *sq_offset)
8176{
8177 struct io_rings *rings;
8178 size_t off, sq_array_size;
8179
8180 off = struct_size(rings, cqes, cq_entries);
8181 if (off == SIZE_MAX)
8182 return SIZE_MAX;
8183
8184#ifdef CONFIG_SMP
8185 off = ALIGN(off, SMP_CACHE_BYTES);
8186 if (off == 0)
8187 return SIZE_MAX;
8188#endif
8189
b36200f5
DV
8190 if (sq_offset)
8191 *sq_offset = off;
8192
75b28aff
HV
8193 sq_array_size = array_size(sizeof(u32), sq_entries);
8194 if (sq_array_size == SIZE_MAX)
8195 return SIZE_MAX;
8196
8197 if (check_add_overflow(off, sq_array_size, &off))
8198 return SIZE_MAX;
8199
75b28aff
HV
8200 return off;
8201}
8202
41edf1a5 8203static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 8204{
41edf1a5 8205 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
8206 unsigned int i;
8207
6224843d
PB
8208 if (imu != ctx->dummy_ubuf) {
8209 for (i = 0; i < imu->nr_bvecs; i++)
8210 unpin_user_page(imu->bvec[i].bv_page);
8211 if (imu->acct_pages)
8212 io_unaccount_mem(ctx, imu->acct_pages);
8213 kvfree(imu);
8214 }
41edf1a5 8215 *slot = NULL;
7f61a1e9
PB
8216}
8217
bd54b6fe 8218static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 8219{
634d00df
PB
8220 io_buffer_unmap(ctx, &prsrc->buf);
8221 prsrc->buf = NULL;
bd54b6fe 8222}
edafccee 8223
bd54b6fe
BM
8224static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8225{
8226 unsigned int i;
edafccee 8227
7f61a1e9
PB
8228 for (i = 0; i < ctx->nr_user_bufs; i++)
8229 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 8230 kfree(ctx->user_bufs);
bb6659cc 8231 io_rsrc_data_free(ctx->buf_data);
edafccee 8232 ctx->user_bufs = NULL;
bd54b6fe 8233 ctx->buf_data = NULL;
edafccee 8234 ctx->nr_user_bufs = 0;
bd54b6fe
BM
8235}
8236
0a96bbe4 8237static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 8238{
bd54b6fe 8239 int ret;
edafccee 8240
bd54b6fe 8241 if (!ctx->buf_data)
edafccee
JA
8242 return -ENXIO;
8243
bd54b6fe
BM
8244 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8245 if (!ret)
8246 __io_sqe_buffers_unregister(ctx);
8247 return ret;
edafccee
JA
8248}
8249
8250static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8251 void __user *arg, unsigned index)
8252{
8253 struct iovec __user *src;
8254
8255#ifdef CONFIG_COMPAT
8256 if (ctx->compat) {
8257 struct compat_iovec __user *ciovs;
8258 struct compat_iovec ciov;
8259
8260 ciovs = (struct compat_iovec __user *) arg;
8261 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8262 return -EFAULT;
8263
d55e5f5b 8264 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
8265 dst->iov_len = ciov.iov_len;
8266 return 0;
8267 }
8268#endif
8269 src = (struct iovec __user *) arg;
8270 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8271 return -EFAULT;
8272 return 0;
8273}
8274
de293938
JA
8275/*
 8276 * Not super efficient, but this only happens at registration time. And we do cache
8277 * the last compound head, so generally we'll only do a full search if we don't
8278 * match that one.
8279 *
8280 * We check if the given compound head page has already been accounted, to
8281 * avoid double accounting it. This allows us to account the full size of the
8282 * page, not just the constituent pages of a huge page.
8283 */
8284static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8285 int nr_pages, struct page *hpage)
8286{
8287 int i, j;
8288
8289 /* check current page array */
8290 for (i = 0; i < nr_pages; i++) {
8291 if (!PageCompound(pages[i]))
8292 continue;
8293 if (compound_head(pages[i]) == hpage)
8294 return true;
8295 }
8296
8297 /* check previously registered pages */
8298 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 8299 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
8300
8301 for (j = 0; j < imu->nr_bvecs; j++) {
8302 if (!PageCompound(imu->bvec[j].bv_page))
8303 continue;
8304 if (compound_head(imu->bvec[j].bv_page) == hpage)
8305 return true;
8306 }
8307 }
8308
8309 return false;
8310}
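/*
 * Illustrative sketch, not part of this file: the dedup logic above in
 * miniature. All tail pages of a huge page share one compound head; a head
 * that was already charged (in this batch or by a previous registration)
 * must not be charged again. head_already_seen() is a hypothetical simplified
 * form of that scan.
 */
#include <stdbool.h>
#include <stddef.h>

static bool head_already_seen(void *const *heads, size_t nr, const void *head)
{
	for (size_t i = 0; i < nr; i++)
		if (heads[i] == head)
			return true;
	return false;
}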
8311
8312static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8313 int nr_pages, struct io_mapped_ubuf *imu,
8314 struct page **last_hpage)
8315{
8316 int i, ret;
8317
216e5835 8318 imu->acct_pages = 0;
de293938
JA
8319 for (i = 0; i < nr_pages; i++) {
8320 if (!PageCompound(pages[i])) {
8321 imu->acct_pages++;
8322 } else {
8323 struct page *hpage;
8324
8325 hpage = compound_head(pages[i]);
8326 if (hpage == *last_hpage)
8327 continue;
8328 *last_hpage = hpage;
8329 if (headpage_already_acct(ctx, pages, i, hpage))
8330 continue;
8331 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8332 }
8333 }
8334
8335 if (!imu->acct_pages)
8336 return 0;
8337
26bfa89e 8338 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
8339 if (ret)
8340 imu->acct_pages = 0;
8341 return ret;
8342}
8343
0a96bbe4 8344static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 8345 struct io_mapped_ubuf **pimu,
0a96bbe4 8346 struct page **last_hpage)
edafccee 8347{
41edf1a5 8348 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
8349 struct vm_area_struct **vmas = NULL;
8350 struct page **pages = NULL;
0a96bbe4
BM
8351 unsigned long off, start, end, ubuf;
8352 size_t size;
8353 int ret, pret, nr_pages, i;
8354
6224843d
PB
8355 if (!iov->iov_base) {
8356 *pimu = ctx->dummy_ubuf;
8357 return 0;
8358 }
8359
0a96bbe4
BM
8360 ubuf = (unsigned long) iov->iov_base;
8361 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8362 start = ubuf >> PAGE_SHIFT;
8363 nr_pages = end - start;
8364
41edf1a5 8365 *pimu = NULL;
0a96bbe4
BM
8366 ret = -ENOMEM;
8367
8368 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8369 if (!pages)
8370 goto done;
8371
8372 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8373 GFP_KERNEL);
8374 if (!vmas)
8375 goto done;
edafccee 8376
41edf1a5 8377 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 8378 if (!imu)
0a96bbe4
BM
8379 goto done;
8380
8381 ret = 0;
8382 mmap_read_lock(current->mm);
8383 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8384 pages, vmas);
8385 if (pret == nr_pages) {
8386 /* don't support file backed memory */
8387 for (i = 0; i < nr_pages; i++) {
8388 struct vm_area_struct *vma = vmas[i];
8389
40dad765
PB
8390 if (vma_is_shmem(vma))
8391 continue;
0a96bbe4
BM
8392 if (vma->vm_file &&
8393 !is_file_hugepages(vma->vm_file)) {
8394 ret = -EOPNOTSUPP;
8395 break;
8396 }
8397 }
8398 } else {
8399 ret = pret < 0 ? pret : -EFAULT;
8400 }
8401 mmap_read_unlock(current->mm);
8402 if (ret) {
8403 /*
 8404		 * if we did a partial map, or found file-backed vmas,
8405 * release any pages we did get
8406 */
8407 if (pret > 0)
8408 unpin_user_pages(pages, pret);
0a96bbe4
BM
8409 goto done;
8410 }
8411
8412 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8413 if (ret) {
8414 unpin_user_pages(pages, pret);
0a96bbe4
BM
8415 goto done;
8416 }
8417
8418 off = ubuf & ~PAGE_MASK;
8419 size = iov->iov_len;
8420 for (i = 0; i < nr_pages; i++) {
8421 size_t vec_len;
8422
8423 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8424 imu->bvec[i].bv_page = pages[i];
8425 imu->bvec[i].bv_len = vec_len;
8426 imu->bvec[i].bv_offset = off;
8427 off = 0;
8428 size -= vec_len;
8429 }
8430 /* store original address for later verification */
8431 imu->ubuf = ubuf;
4751f53d 8432 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 8433 imu->nr_bvecs = nr_pages;
41edf1a5 8434 *pimu = imu;
0a96bbe4
BM
8435 ret = 0;
8436done:
41edf1a5
PB
8437 if (ret)
8438 kvfree(imu);
0a96bbe4
BM
8439 kvfree(pages);
8440 kvfree(vmas);
8441 return ret;
8442}
8443
2b358604 8444static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 8445{
87094465
PB
8446 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
8447 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 8448}
edafccee 8449
2b358604
BM
8450static int io_buffer_validate(struct iovec *iov)
8451{
50e96989
PB
8452 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
8453
2b358604
BM
8454 /*
8455 * Don't impose further limits on the size and buffer
 8456	 * constraints here; we'll return -EINVAL later when IO is
 8457	 * submitted, if they are wrong.
8458 */
6224843d
PB
8459 if (!iov->iov_base)
8460 return iov->iov_len ? -EFAULT : 0;
8461 if (!iov->iov_len)
2b358604 8462 return -EFAULT;
edafccee 8463
2b358604
BM
8464 /* arbitrary limit, but we need something */
8465 if (iov->iov_len > SZ_1G)
8466 return -EFAULT;
edafccee 8467
50e96989
PB
8468 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
8469 return -EOVERFLOW;
8470
2b358604
BM
8471 return 0;
8472}
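/*
 * Illustrative sketch, not part of this file: the check_add_overflow() above
 * rejects iovecs whose base plus page-rounded length would wrap the address
 * space. __builtin_add_overflow() is the GCC/Clang equivalent used here for
 * demonstration; iov_would_wrap() is a hypothetical helper.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool iov_would_wrap(uintptr_t base, size_t len, size_t page_size)
{
	uintptr_t end;

	/* account for the partial page at the end, as the kernel side does */
	return __builtin_add_overflow(base, len + (page_size - 1), &end);
}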
edafccee 8473
2b358604 8474static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 8475 unsigned int nr_args, u64 __user *tags)
2b358604 8476{
bd54b6fe
BM
8477 struct page *last_hpage = NULL;
8478 struct io_rsrc_data *data;
2b358604
BM
8479 int i, ret;
8480 struct iovec iov;
edafccee 8481
87094465
PB
8482 if (ctx->user_bufs)
8483 return -EBUSY;
489809e2 8484 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 8485 return -EINVAL;
bd54b6fe 8486 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
8487 if (ret)
8488 return ret;
d878c816
PB
8489 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
8490 if (ret)
8491 return ret;
bd54b6fe
BM
8492 ret = io_buffers_map_alloc(ctx, nr_args);
8493 if (ret) {
bb6659cc 8494 io_rsrc_data_free(data);
bd54b6fe
BM
8495 return ret;
8496 }
edafccee 8497
87094465 8498 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
8499 ret = io_copy_iov(ctx, &iov, arg, i);
8500 if (ret)
0a96bbe4 8501 break;
2b358604
BM
8502 ret = io_buffer_validate(&iov);
8503 if (ret)
0a96bbe4 8504 break;
2d091d62 8505 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
8506 ret = -EINVAL;
8507 break;
8508 }
edafccee 8509
41edf1a5
PB
8510 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
8511 &last_hpage);
0a96bbe4
BM
8512 if (ret)
8513 break;
edafccee 8514 }
0a96bbe4 8515
bd54b6fe 8516 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 8517
bd54b6fe
BM
8518 ctx->buf_data = data;
8519 if (ret)
8520 __io_sqe_buffers_unregister(ctx);
8521 else
8522 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
8523 return ret;
8524}
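/*
 * Illustrative userspace sketch, not part of this file: registering a fixed
 * buffer from the application side. The kernel path above pins the pages
 * once, so later IORING_OP_READ_FIXED/WRITE_FIXED requests avoid per-I/O
 * page pinning. register_one_buffer() is a hypothetical helper; the liburing
 * call is real.
 */
#include <errno.h>
#include <liburing.h>
#include <stdlib.h>
#include <sys/uio.h>

static int register_one_buffer(struct io_uring *ring, size_t len)
{
	struct iovec iov = {
		.iov_base = malloc(len),
		.iov_len = len,
	};

	if (!iov.iov_base)
		return -ENOMEM;
	/* ends up in io_uring_register(2) with IORING_REGISTER_BUFFERS */
	return io_uring_register_buffers(ring, &iov, 1);
}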
8525
634d00df
PB
8526static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
8527 struct io_uring_rsrc_update2 *up,
8528 unsigned int nr_args)
8529{
8530 u64 __user *tags = u64_to_user_ptr(up->tags);
8531 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
8532 struct page *last_hpage = NULL;
8533 bool needs_switch = false;
8534 __u32 done;
8535 int i, err;
8536
8537 if (!ctx->buf_data)
8538 return -ENXIO;
8539 if (up->offset + nr_args > ctx->nr_user_bufs)
8540 return -EINVAL;
8541
8542 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
8543 struct io_mapped_ubuf *imu;
8544 int offset = up->offset + done;
634d00df
PB
8545 u64 tag = 0;
8546
8547 err = io_copy_iov(ctx, &iov, iovs, done);
8548 if (err)
8549 break;
8550 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
8551 err = -EFAULT;
8552 break;
8553 }
0b8c0e7c
PB
8554 err = io_buffer_validate(&iov);
8555 if (err)
8556 break;
cf3770e7
CIK
8557 if (!iov.iov_base && tag) {
8558 err = -EINVAL;
8559 break;
8560 }
0b8c0e7c
PB
8561 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
8562 if (err)
8563 break;
634d00df 8564
0b8c0e7c 8565 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 8566 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
0b8c0e7c
PB
8567 err = io_queue_rsrc_removal(ctx->buf_data, offset,
8568 ctx->rsrc_node, ctx->user_bufs[i]);
8569 if (unlikely(err)) {
8570 io_buffer_unmap(ctx, &imu);
634d00df 8571 break;
0b8c0e7c 8572 }
634d00df
PB
8573 ctx->user_bufs[i] = NULL;
8574 needs_switch = true;
8575 }
8576
0b8c0e7c 8577 ctx->user_bufs[i] = imu;
2d091d62 8578 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
8579 }
8580
8581 if (needs_switch)
8582 io_rsrc_node_switch(ctx, ctx->buf_data);
8583 return done ? done : err;
8584}
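/*
 * A userspace sketch of the update path above: replacing one registered
 * buffer slot through IORING_REGISTER_BUFFERS_UPDATE. The helper name,
 * ring_fd, new_buf and the chosen slot are assumptions for illustration.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

static int update_fixed_buffer(int ring_fd, void *new_buf, size_t len)
{
	struct iovec iov = { .iov_base = new_buf, .iov_len = len };
	__u64 tag = 0;	/* 0 == untagged, no CQE when the old buffer drops */
	struct io_uring_rsrc_update2 up = {
		.offset	= 0,	/* slot to replace */
		.data	= (__u64)(unsigned long)&iov,
		.tags	= (__u64)(unsigned long)&tag,
		.nr	= 1,
	};

	/* nr_args carries sizeof(up) so the struct can be extended later */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS_UPDATE, &up, sizeof(up));
}
#endif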
8585
9b402849
JA
8586static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8587{
8588 __s32 __user *fds = arg;
8589 int fd;
8590
8591 if (ctx->cq_ev_fd)
8592 return -EBUSY;
8593
8594 if (copy_from_user(&fd, fds, sizeof(*fds)))
8595 return -EFAULT;
8596
8597 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8598 if (IS_ERR(ctx->cq_ev_fd)) {
8599 int ret = PTR_ERR(ctx->cq_ev_fd);
fe7e3257 8600
9b402849
JA
8601 ctx->cq_ev_fd = NULL;
8602 return ret;
8603 }
8604
8605 return 0;
8606}
8607
8608static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8609{
8610 if (ctx->cq_ev_fd) {
8611 eventfd_ctx_put(ctx->cq_ev_fd);
8612 ctx->cq_ev_fd = NULL;
8613 return 0;
8614 }
8615
8616 return -ENXIO;
8617}
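/*
 * A userspace sketch of wiring up the eventfd registration above so that CQ
 * completions can be waited on with read()/poll(). The helper name and
 * ring_fd are assumptions; error handling is minimal.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* the argument is a pointer to a single __s32 fd, nr_args must be 1 */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* read(efd, &count, 8) blocks until CQEs are posted */
}
#endif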
8618
5a2e745d
JA
8619static void io_destroy_buffers(struct io_ring_ctx *ctx)
8620{
9e15c3a0
JA
8621 struct io_buffer *buf;
8622 unsigned long index;
8623
8624 xa_for_each(&ctx->io_buffers, index, buf)
8625 __io_remove_buffers(ctx, buf, index, -1U);
5a2e745d
JA
8626}
8627
68e68ee6 8628static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
1b4c351f 8629{
68e68ee6 8630 struct io_kiocb *req, *nxt;
1b4c351f 8631
68e68ee6
JA
8632 list_for_each_entry_safe(req, nxt, list, compl.list) {
8633 if (tsk && req->task != tsk)
8634 continue;
1b4c351f
JA
8635 list_del(&req->compl.list);
8636 kmem_cache_free(req_cachep, req);
8637 }
8638}
8639
4010fec4 8640static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 8641{
bf019da7 8642 struct io_submit_state *submit_state = &ctx->submit_state;
e5547d2c 8643 struct io_comp_state *cs = &ctx->submit_state.comp;
bf019da7 8644
9a4fdbd8
JA
8645 mutex_lock(&ctx->uring_lock);
8646
8e5c66c4 8647 if (submit_state->free_reqs) {
9a4fdbd8
JA
8648 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8649 submit_state->reqs);
8e5c66c4
PB
8650 submit_state->free_reqs = 0;
8651 }
9a4fdbd8 8652
dac7a098 8653 io_flush_cached_locked_reqs(ctx, cs);
e5547d2c 8654 io_req_cache_free(&cs->free_list, NULL);
9a4fdbd8
JA
8655 mutex_unlock(&ctx->uring_lock);
8656}
8657
43597aac 8658static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 8659{
43597aac 8660 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 8661 wait_for_completion(&data->done);
bd54b6fe 8662}
04fc6c80 8663
2b188cc1
JA
8664static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8665{
37d1e2e3 8666 io_sq_thread_finish(ctx);
2aede0e4 8667
37d1e2e3 8668 if (ctx->mm_account) {
2aede0e4
JA
8669 mmdrop(ctx->mm_account);
8670 ctx->mm_account = NULL;
30975825 8671 }
def596e9 8672
43597aac
PB
8673 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
8674 io_wait_rsrc_data(ctx->buf_data);
8675 io_wait_rsrc_data(ctx->file_data);
8676
8bad28d8 8677 mutex_lock(&ctx->uring_lock);
43597aac 8678 if (ctx->buf_data)
bd54b6fe 8679 __io_sqe_buffers_unregister(ctx);
43597aac 8680 if (ctx->file_data)
08480400 8681 __io_sqe_files_unregister(ctx);
c4ea060e
PB
8682 if (ctx->rings)
8683 __io_cqring_overflow_flush(ctx, true);
8bad28d8 8684 mutex_unlock(&ctx->uring_lock);
9b402849 8685 io_eventfd_unregister(ctx);
5a2e745d 8686 io_destroy_buffers(ctx);
07db298a
PB
8687 if (ctx->sq_creds)
8688 put_cred(ctx->sq_creds);
def596e9 8689
a7f0ed5a
PB
8690 /* there are no registered resources left; nobody uses it */
8691 if (ctx->rsrc_node)
8692 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 8693 if (ctx->rsrc_backup_node)
b895c9a6 8694 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a
PB
8695 flush_delayed_work(&ctx->rsrc_put_work);
8696
8697 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
8698 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 8699
2b188cc1 8700#if defined(CONFIG_UNIX)
355e8d26
EB
8701 if (ctx->ring_sock) {
8702 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 8703 sock_release(ctx->ring_sock);
355e8d26 8704 }
2b188cc1
JA
8705#endif
8706
75b28aff 8707 io_mem_free(ctx->rings);
2b188cc1 8708 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
8709
8710 percpu_ref_exit(&ctx->refs);
2b188cc1 8711 free_uid(ctx->user);
4010fec4 8712 io_req_caches_free(ctx);
e941894e
JA
8713 if (ctx->hash_map)
8714 io_wq_put_hash(ctx->hash_map);
78076bb6 8715 kfree(ctx->cancel_hash);
6224843d 8716 kfree(ctx->dummy_ubuf);
2b188cc1
JA
8717 kfree(ctx);
8718}
8719
8720static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8721{
8722 struct io_ring_ctx *ctx = file->private_data;
8723 __poll_t mask = 0;
8724
311997b3 8725 poll_wait(file, &ctx->poll_wait, wait);
4f7067c3
SB
8726 /*
8727 * synchronizes with barrier from wq_has_sleeper call in
8728 * io_commit_cqring
8729 */
2b188cc1 8730 smp_rmb();
90554200 8731 if (!io_sqring_full(ctx))
2b188cc1 8732 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
8733
8734 /*
8735 * Don't flush cqring overflow list here, just do a simple check.
8736 * Otherwise there could possibly be an ABBA deadlock:
8737 * CPU0                    CPU1
8738 * ----                    ----
8739 * lock(&ctx->uring_lock);
8740 *                         lock(&ep->mtx);
8741 *                         lock(&ctx->uring_lock);
8742 * lock(&ep->mtx);
8743 *
8744 * Users may get EPOLLIN while seeing nothing in the cqring; this
8745 * pushes them to do the flush.
8746 */
5ed7a37d 8747 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
2b188cc1
JA
8748 mask |= EPOLLIN | EPOLLRDNORM;
8749
8750 return mask;
8751}
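/*
 * Since io_uring_poll() reports EPOLLIN when the CQ has (or has overflowed)
 * entries and EPOLLOUT while the SQ has room, the ring fd can be added to an
 * epoll set. A small sketch, with the helper name and ring_fd assumed:
 */
#if 0
#include <sys/epoll.h>
#include <unistd.h>

static int watch_ring_fd(int ring_fd)
{
	int epfd = epoll_create1(EPOLL_CLOEXEC);
	struct epoll_event ev = {
		.events = EPOLLIN,	/* wake when completions are available */
		.data.fd = ring_fd,
	};

	if (epfd < 0)
		return -1;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, ring_fd, &ev) < 0) {
		close(epfd);
		return -1;
	}
	return epfd;
}
#endif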
8752
8753static int io_uring_fasync(int fd, struct file *file, int on)
8754{
8755 struct io_ring_ctx *ctx = file->private_data;
8756
8757 return fasync_helper(fd, file, on, &ctx->cq_fasync);
8758}
8759
0bead8cd 8760static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 8761{
4379bf8b 8762 const struct cred *creds;
071698e1 8763
61cf9370 8764 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
8765 if (creds) {
8766 put_cred(creds);
0bead8cd 8767 return 0;
1e6fa521 8768 }
0bead8cd
YD
8769
8770 return -EINVAL;
8771}
8772
d56d938b
PB
8773struct io_tctx_exit {
8774 struct callback_head task_work;
8775 struct completion completion;
baf186c4 8776 struct io_ring_ctx *ctx;
d56d938b
PB
8777};
8778
8779static void io_tctx_exit_cb(struct callback_head *cb)
8780{
8781 struct io_uring_task *tctx = current->io_uring;
8782 struct io_tctx_exit *work;
8783
8784 work = container_of(cb, struct io_tctx_exit, task_work);
8785 /*
8786 * When @in_idle, we're in cancellation and it's racy to remove the
8787 * node. It'll be removed by the end of cancellation, just ignore it.
8788 */
8789 if (!atomic_read(&tctx->in_idle))
eef51daa 8790 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
8791 complete(&work->completion);
8792}
8793
28090c13
PB
8794static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8795{
8796 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8797
8798 return req->ctx == data;
8799}
8800
85faa7b8
JA
8801static void io_ring_exit_work(struct work_struct *work)
8802{
d56d938b 8803 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 8804 unsigned long timeout = jiffies + HZ * 60 * 5;
d56d938b
PB
8805 struct io_tctx_exit exit;
8806 struct io_tctx_node *node;
8807 int ret;
85faa7b8 8808
56952e91
JA
8809 /*
8810 * If we're doing polled IO and end up having requests being
8811 * submitted async (out-of-line), then completions can come in while
8812 * we're waiting for refs to drop. We need to reap these manually,
8813 * as nobody else will be looking for them.
8814 */
b2edc0a7 8815 do {
3dd0c97a 8816 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
8817 if (ctx->sq_data) {
8818 struct io_sq_data *sqd = ctx->sq_data;
8819 struct task_struct *tsk;
8820
8821 io_sq_thread_park(sqd);
8822 tsk = sqd->thread;
8823 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
8824 io_wq_cancel_cb(tsk->io_uring->io_wq,
8825 io_cancel_ctx_cb, ctx, true);
8826 io_sq_thread_unpark(sqd);
8827 }
b5bb3a24
PB
8828
8829 WARN_ON_ONCE(time_after(jiffies, timeout));
b2edc0a7 8830 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
d56d938b 8831
7f00651a
PB
8832 init_completion(&exit.completion);
8833 init_task_work(&exit.task_work, io_tctx_exit_cb);
8834 exit.ctx = ctx;
89b5066e
PB
8835 /*
8836 * Some may use the context even when all refs and requests have been put,
8837 * and they are free to do so while still holding uring_lock or
5b0a6acc 8838 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
8839 * this lock/unlock section also waits for them to finish.
8840 */
d56d938b
PB
8841 mutex_lock(&ctx->uring_lock);
8842 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
8843 WARN_ON_ONCE(time_after(jiffies, timeout));
8844
d56d938b
PB
8845 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
8846 ctx_node);
7f00651a
PB
8847 /* don't spin on a single task if cancellation failed */
8848 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
8849 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
8850 if (WARN_ON_ONCE(ret))
8851 continue;
8852 wake_up_process(node->task);
8853
8854 mutex_unlock(&ctx->uring_lock);
8855 wait_for_completion(&exit.completion);
d56d938b
PB
8856 mutex_lock(&ctx->uring_lock);
8857 }
8858 mutex_unlock(&ctx->uring_lock);
89b5066e
PB
8859 spin_lock_irq(&ctx->completion_lock);
8860 spin_unlock_irq(&ctx->completion_lock);
d56d938b 8861
85faa7b8
JA
8862 io_ring_ctx_free(ctx);
8863}
8864
80c4cbdb
PB
8865/* Returns true if we found and killed one or more timeouts */
8866static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
3dd0c97a 8867 bool cancel_all)
80c4cbdb
PB
8868{
8869 struct io_kiocb *req, *tmp;
8870 int canceled = 0;
8871
8872 spin_lock_irq(&ctx->completion_lock);
8873 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 8874 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
8875 io_kill_timeout(req, -ECANCELED);
8876 canceled++;
8877 }
8878 }
51520426
PB
8879 if (canceled != 0)
8880 io_commit_cqring(ctx);
80c4cbdb 8881 spin_unlock_irq(&ctx->completion_lock);
80c4cbdb
PB
8882 if (canceled != 0)
8883 io_cqring_ev_posted(ctx);
8884 return canceled != 0;
8885}
8886
2b188cc1
JA
8887static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8888{
61cf9370
MWO
8889 unsigned long index;
8890 struct creds *creds;
8891
2b188cc1
JA
8892 mutex_lock(&ctx->uring_lock);
8893 percpu_ref_kill(&ctx->refs);
634578f8 8894 if (ctx->rings)
6c2450ae 8895 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
8896 xa_for_each(&ctx->personalities, index, creds)
8897 io_unregister_personality(ctx, index);
2b188cc1
JA
8898 mutex_unlock(&ctx->uring_lock);
8899
3dd0c97a
PB
8900 io_kill_timeouts(ctx, NULL, true);
8901 io_poll_remove_all(ctx, NULL, true);
561fb04a 8902
15dff286 8903 /* if we failed setting up the ctx, we might not have any rings */
b2edc0a7 8904 io_iopoll_try_reap_events(ctx);
309fc03a 8905
85faa7b8 8906 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
8907 /*
8908 * Use system_unbound_wq to avoid spawning tons of event kworkers
8909 * if we're exiting a ton of rings at the same time. It just adds
8910 * noise and overhead; there's no discernible change in runtime
8911 * over using system_wq.
8912 */
8913 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
8914}
8915
8916static int io_uring_release(struct inode *inode, struct file *file)
8917{
8918 struct io_ring_ctx *ctx = file->private_data;
8919
8920 file->private_data = NULL;
8921 io_ring_ctx_wait_and_kill(ctx);
8922 return 0;
8923}
8924
f6edbabb
PB
8925struct io_task_cancel {
8926 struct task_struct *task;
3dd0c97a 8927 bool all;
f6edbabb 8928};
f254ac04 8929
f6edbabb 8930static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 8931{
9a472ef7 8932 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 8933 struct io_task_cancel *cancel = data;
9a472ef7
PB
8934 bool ret;
8935
3dd0c97a 8936 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
9a472ef7
PB
8937 unsigned long flags;
8938 struct io_ring_ctx *ctx = req->ctx;
8939
8940 /* protect against races with linked timeouts */
8941 spin_lock_irqsave(&ctx->completion_lock, flags);
3dd0c97a 8942 ret = io_match_task(req, cancel->task, cancel->all);
9a472ef7
PB
8943 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8944 } else {
3dd0c97a 8945 ret = io_match_task(req, cancel->task, cancel->all);
9a472ef7
PB
8946 }
8947 return ret;
b711d4ea
JA
8948}
8949
e1915f76 8950static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
3dd0c97a 8951 struct task_struct *task, bool cancel_all)
b7ddce3c 8952{
e1915f76 8953 struct io_defer_entry *de;
b7ddce3c
PB
8954 LIST_HEAD(list);
8955
8956 spin_lock_irq(&ctx->completion_lock);
8957 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
3dd0c97a 8958 if (io_match_task(de->req, task, cancel_all)) {
b7ddce3c
PB
8959 list_cut_position(&list, &ctx->defer_list, &de->list);
8960 break;
8961 }
8962 }
8963 spin_unlock_irq(&ctx->completion_lock);
e1915f76
PB
8964 if (list_empty(&list))
8965 return false;
b7ddce3c
PB
8966
8967 while (!list_empty(&list)) {
8968 de = list_first_entry(&list, struct io_defer_entry, list);
8969 list_del_init(&de->list);
f41db273 8970 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
8971 kfree(de);
8972 }
e1915f76 8973 return true;
b7ddce3c
PB
8974}
8975
1b00764f
PB
8976static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
8977{
8978 struct io_tctx_node *node;
8979 enum io_wq_cancel cret;
8980 bool ret = false;
8981
8982 mutex_lock(&ctx->uring_lock);
8983 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
8984 struct io_uring_task *tctx = node->task->io_uring;
8985
8986 /*
8987 * io_wq will stay alive while we hold uring_lock, because it's
8988 * killed after ctx nodes, which requires taking the lock.
8989 */
8990 if (!tctx || !tctx->io_wq)
8991 continue;
8992 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
8993 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8994 }
8995 mutex_unlock(&ctx->uring_lock);
8996
8997 return ret;
8998}
8999
9936c7c2
PB
9000static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9001 struct task_struct *task,
3dd0c97a 9002 bool cancel_all)
9936c7c2 9003{
3dd0c97a 9004 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 9005 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2
PB
9006
9007 while (1) {
9008 enum io_wq_cancel cret;
9009 bool ret = false;
9010
1b00764f
PB
9011 if (!task) {
9012 ret |= io_uring_try_cancel_iowq(ctx);
9013 } else if (tctx && tctx->io_wq) {
9014 /*
9015 * Cancels requests of all rings, not only @ctx, but
9016 * it's fine as the task is in exit/exec.
9017 */
5aa75ed5 9018 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
9019 &cancel, true);
9020 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9021 }
9022
9023 /* SQPOLL thread does its own polling */
3dd0c97a 9024 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 9025 (ctx->sq_data && ctx->sq_data->thread == current)) {
9936c7c2
PB
9026 while (!list_empty_careful(&ctx->iopoll_list)) {
9027 io_iopoll_try_reap_events(ctx);
9028 ret = true;
9029 }
9030 }
9031
3dd0c97a
PB
9032 ret |= io_cancel_defer_files(ctx, task, cancel_all);
9033 ret |= io_poll_remove_all(ctx, task, cancel_all);
9034 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
9035 if (task)
9036 ret |= io_run_task_work();
9936c7c2
PB
9037 if (!ret)
9038 break;
9039 cond_resched();
9040 }
9041}
9042
eef51daa 9043static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 9044{
236434c3 9045 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9046 struct io_tctx_node *node;
a528b04e 9047 int ret;
236434c3
MWO
9048
9049 if (unlikely(!tctx)) {
5aa75ed5 9050 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
9051 if (unlikely(ret))
9052 return ret;
236434c3 9053 tctx = current->io_uring;
0f212204 9054 }
cf27f3b1
PB
9055 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9056 node = kmalloc(sizeof(*node), GFP_KERNEL);
9057 if (!node)
9058 return -ENOMEM;
9059 node->ctx = ctx;
9060 node->task = current;
13bf43f5 9061
cf27f3b1
PB
9062 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9063 node, GFP_KERNEL));
9064 if (ret) {
9065 kfree(node);
9066 return ret;
0f212204 9067 }
cf27f3b1
PB
9068
9069 mutex_lock(&ctx->uring_lock);
9070 list_add(&node->ctx_node, &ctx->tctx_list);
9071 mutex_unlock(&ctx->uring_lock);
0f212204 9072 }
cf27f3b1 9073 tctx->last = ctx;
0f212204
JA
9074 return 0;
9075}
9076
cf27f3b1
PB
9077/*
9078 * Note that this task has used io_uring. We use it for cancelation purposes.
9079 */
eef51daa 9080static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
9081{
9082 struct io_uring_task *tctx = current->io_uring;
9083
9084 if (likely(tctx && tctx->last == ctx))
9085 return 0;
eef51daa 9086 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
9087}
9088
0f212204
JA
9089/*
9090 * Remove this io_uring_file -> task mapping.
9091 */
eef51daa 9092static void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
9093{
9094 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9095 struct io_tctx_node *node;
2941267b 9096
eebd2e37
PB
9097 if (!tctx)
9098 return;
13bf43f5
PB
9099 node = xa_erase(&tctx->xa, index);
9100 if (!node)
2941267b 9101 return;
0f212204 9102
13bf43f5
PB
9103 WARN_ON_ONCE(current != node->task);
9104 WARN_ON_ONCE(list_empty(&node->ctx_node));
9105
9106 mutex_lock(&node->ctx->uring_lock);
9107 list_del(&node->ctx_node);
9108 mutex_unlock(&node->ctx->uring_lock);
9109
baf186c4 9110 if (tctx->last == node->ctx)
0f212204 9111 tctx->last = NULL;
13bf43f5 9112 kfree(node);
0f212204
JA
9113}
9114
8452d4a6 9115static void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 9116{
ba5ef6dc 9117 struct io_wq *wq = tctx->io_wq;
13bf43f5 9118 struct io_tctx_node *node;
de7f1d9e
PB
9119 unsigned long index;
9120
13bf43f5 9121 xa_for_each(&tctx->xa, index, node)
eef51daa 9122 io_uring_del_tctx_node(index);
b16ef427
ME
9123 if (wq) {
9124 /*
9125 * Must be after io_uring_del_tctx_node() (removes nodes under
9126 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9127 */
9128 tctx->io_wq = NULL;
ba5ef6dc 9129 io_wq_put_and_exit(wq);
b16ef427 9130 }
de7f1d9e
PB
9131}
9132
3f48cf18 9133static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 9134{
3f48cf18
PB
9135 if (tracked)
9136 return atomic_read(&tctx->inflight_tracked);
521d6a73
PB
9137 return percpu_counter_sum(&tctx->inflight);
9138}
9139
09899b19
PB
9140static void io_uring_drop_tctx_refs(struct task_struct *task)
9141{
9142 struct io_uring_task *tctx = task->io_uring;
9143 unsigned int refs = tctx->cached_refs;
9144
9145 tctx->cached_refs = 0;
9146 percpu_counter_sub(&tctx->inflight, refs);
9147 put_task_struct_many(task, refs);
9148}
9149
78cc687b
PB
9150/*
9151 * Find any io_uring ctx that this task has registered or done IO on, and cancel
9152 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
9153 */
9154static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
0e9ddb39 9155{
521d6a73 9156 struct io_uring_task *tctx = current->io_uring;
734551df 9157 struct io_ring_ctx *ctx;
0e9ddb39
PB
9158 s64 inflight;
9159 DEFINE_WAIT(wait);
fdaf083c 9160
78cc687b
PB
9161 WARN_ON_ONCE(sqd && sqd->thread != current);
9162
6d042ffb
PO
9163 if (!current->io_uring)
9164 return;
17a91051
PB
9165 if (tctx->io_wq)
9166 io_wq_exit_start(tctx->io_wq);
9167
09899b19 9168 io_uring_drop_tctx_refs(current);
0e9ddb39
PB
9169 atomic_inc(&tctx->in_idle);
9170 do {
9171 /* read completions before cancelations */
78cc687b 9172 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
9173 if (!inflight)
9174 break;
fdaf083c 9175
78cc687b
PB
9176 if (!sqd) {
9177 struct io_tctx_node *node;
9178 unsigned long index;
0f212204 9179
78cc687b
PB
9180 xa_for_each(&tctx->xa, index, node) {
9181 /* sqpoll task will cancel all its requests */
9182 if (node->ctx->sq_data)
9183 continue;
9184 io_uring_try_cancel_requests(node->ctx, current,
9185 cancel_all);
9186 }
9187 } else {
9188 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9189 io_uring_try_cancel_requests(ctx, current,
9190 cancel_all);
9191 }
17a91051 9192
0f212204 9193 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
0f212204 9194 /*
a1bb3cd5
PB
9195 * If we've seen completions, retry without waiting. This
9196 * avoids a race where a completion comes in before we did
9197 * prepare_to_wait().
0f212204 9198 */
3dd0c97a 9199 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 9200 schedule();
f57555ed 9201 finish_wait(&tctx->wait, &wait);
d8a6df10 9202 } while (1);
fdaf083c 9203 atomic_dec(&tctx->in_idle);
de7f1d9e 9204
8452d4a6 9205 io_uring_clean_tctx(tctx);
3dd0c97a 9206 if (cancel_all) {
3f48cf18
PB
9207 /* for exec all current's requests should be gone, kill tctx */
9208 __io_uring_free(current);
9209 }
44e728b8
PB
9210}
9211
78cc687b
PB
9212void __io_uring_cancel(struct files_struct *files)
9213{
9214 io_uring_cancel_generic(!files, NULL);
9215}
9216
6c5c240e
RP
9217static void *io_uring_validate_mmap_request(struct file *file,
9218 loff_t pgoff, size_t sz)
2b188cc1 9219{
2b188cc1 9220 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 9221 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
9222 struct page *page;
9223 void *ptr;
9224
9225 switch (offset) {
9226 case IORING_OFF_SQ_RING:
75b28aff
HV
9227 case IORING_OFF_CQ_RING:
9228 ptr = ctx->rings;
2b188cc1
JA
9229 break;
9230 case IORING_OFF_SQES:
9231 ptr = ctx->sq_sqes;
9232 break;
2b188cc1 9233 default:
6c5c240e 9234 return ERR_PTR(-EINVAL);
2b188cc1
JA
9235 }
9236
9237 page = virt_to_head_page(ptr);
a50b854e 9238 if (sz > page_size(page))
6c5c240e
RP
9239 return ERR_PTR(-EINVAL);
9240
9241 return ptr;
9242}
9243
9244#ifdef CONFIG_MMU
9245
9246static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9247{
9248 size_t sz = vma->vm_end - vma->vm_start;
9249 unsigned long pfn;
9250 void *ptr;
9251
9252 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9253 if (IS_ERR(ptr))
9254 return PTR_ERR(ptr);
2b188cc1
JA
9255
9256 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9257 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9258}
9259
6c5c240e
RP
9260#else /* !CONFIG_MMU */
9261
9262static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9263{
9264 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9265}
9266
9267static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9268{
9269 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9270}
9271
9272static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9273 unsigned long addr, unsigned long len,
9274 unsigned long pgoff, unsigned long flags)
9275{
9276 void *ptr;
9277
9278 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9279 if (IS_ERR(ptr))
9280 return PTR_ERR(ptr);
9281
9282 return (unsigned long) ptr;
9283}
9284
9285#endif /* !CONFIG_MMU */
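/*
 * A userspace sketch of the mmap offsets handled above: mapping the SQ/CQ
 * rings and the SQE array using the offsets io_uring_setup() reported. The
 * helper name and output pointers are assumptions; error unwinding is
 * trimmed.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <unistd.h>

static int map_rings(int ring_fd, struct io_uring_params *p,
		     void **sq_ring, void **cq_ring, void **sqes)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	/* with IORING_FEAT_SINGLE_MMAP one mapping covers both rings */
	if ((p->features & IORING_FEAT_SINGLE_MMAP) && cq_sz > sq_sz)
		sq_sz = cq_sz;

	*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	if (*sq_ring == MAP_FAILED)
		return -1;

	if (p->features & IORING_FEAT_SINGLE_MMAP)
		*cq_ring = *sq_ring;
	else
		*cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, ring_fd,
				IORING_OFF_CQ_RING);

	*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		     ring_fd, IORING_OFF_SQES);
	return (*cq_ring == MAP_FAILED || *sqes == MAP_FAILED) ? -1 : 0;
}
#endif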
9286
d9d05217 9287static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
9288{
9289 DEFINE_WAIT(wait);
9290
9291 do {
9292 if (!io_sqring_full(ctx))
9293 break;
90554200
JA
9294 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9295
9296 if (!io_sqring_full(ctx))
9297 break;
90554200
JA
9298 schedule();
9299 } while (!signal_pending(current));
9300
9301 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 9302 return 0;
90554200
JA
9303}
9304
c73ebb68
HX
9305static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9306 struct __kernel_timespec __user **ts,
9307 const sigset_t __user **sig)
9308{
9309 struct io_uring_getevents_arg arg;
9310
9311 /*
9312 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9313 * is just a pointer to the sigset_t.
9314 */
9315 if (!(flags & IORING_ENTER_EXT_ARG)) {
9316 *sig = (const sigset_t __user *) argp;
9317 *ts = NULL;
9318 return 0;
9319 }
9320
9321 /*
9322 * EXT_ARG is set - ensure we agree on the size of it and copy in our
9323 * timespec and sigset_t pointers if good.
9324 */
9325 if (*argsz != sizeof(arg))
9326 return -EINVAL;
9327 if (copy_from_user(&arg, argp, sizeof(arg)))
9328 return -EFAULT;
9329 *sig = u64_to_user_ptr(arg.sigmask);
9330 *argsz = arg.sigmask_sz;
9331 *ts = u64_to_user_ptr(arg.ts);
9332 return 0;
9333}
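/*
 * A userspace sketch of the extended-argument form parsed above: waiting for
 * one completion with a timeout through IORING_ENTER_EXT_ARG. The helper
 * name and ring_fd are assumptions; a signal mask could be supplied the same
 * way via arg.sigmask/arg.sigmask_sz.
 */
#if 0
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <sys/syscall.h>
#include <unistd.h>

static int wait_one_cqe_timeout(int ring_fd)
{
	struct __kernel_timespec ts = { .tv_sec = 1 };
	struct io_uring_getevents_arg arg = {
		.ts = (__u64)(unsigned long)&ts,
	};

	/* with IORING_ENTER_EXT_ARG, argsz must be sizeof(arg), not a sigset size */
	return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
#endif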
9334
2b188cc1 9335SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
9336 u32, min_complete, u32, flags, const void __user *, argp,
9337 size_t, argsz)
2b188cc1
JA
9338{
9339 struct io_ring_ctx *ctx;
2b188cc1
JA
9340 int submitted = 0;
9341 struct fd f;
33f993da 9342 long ret;
2b188cc1 9343
4c6e277c 9344 io_run_task_work();
b41e9852 9345
33f993da
PB
9346 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9347 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
2b188cc1
JA
9348 return -EINVAL;
9349
9350 f = fdget(fd);
33f993da 9351 if (unlikely(!f.file))
2b188cc1
JA
9352 return -EBADF;
9353
9354 ret = -EOPNOTSUPP;
33f993da 9355 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
9356 goto out_fput;
9357
9358 ret = -ENXIO;
9359 ctx = f.file->private_data;
33f993da 9360 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
9361 goto out_fput;
9362
7e84e1c7 9363 ret = -EBADFD;
33f993da 9364 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
9365 goto out;
9366
6c271ce2
JA
9367 /*
9368 * For SQ polling, the thread will do all submissions and completions.
9369 * Just return the requested submit count, and wake the thread if
9370 * we were asked to.
9371 */
b2a9eada 9372 ret = 0;
6c271ce2 9373 if (ctx->flags & IORING_SETUP_SQPOLL) {
6c2450ae 9374 io_cqring_overflow_flush(ctx, false);
89448c47 9375
21f96522
JA
9376 if (unlikely(ctx->sq_data->thread == NULL)) {
9377 ret = -EOWNERDEAD;
04147488 9378 goto out;
21f96522 9379 }
6c271ce2 9380 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 9381 wake_up(&ctx->sq_data->wait);
d9d05217
PB
9382 if (flags & IORING_ENTER_SQ_WAIT) {
9383 ret = io_sqpoll_wait_sq(ctx);
9384 if (ret)
9385 goto out;
9386 }
6c271ce2 9387 submitted = to_submit;
b2a9eada 9388 } else if (to_submit) {
eef51daa 9389 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
9390 if (unlikely(ret))
9391 goto out;
2b188cc1 9392 mutex_lock(&ctx->uring_lock);
0f212204 9393 submitted = io_submit_sqes(ctx, to_submit);
2b188cc1 9394 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
9395
9396 if (submitted != to_submit)
9397 goto out;
2b188cc1
JA
9398 }
9399 if (flags & IORING_ENTER_GETEVENTS) {
c73ebb68
HX
9400 const sigset_t __user *sig;
9401 struct __kernel_timespec __user *ts;
9402
9403 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9404 if (unlikely(ret))
9405 goto out;
9406
2b188cc1
JA
9407 min_complete = min(min_complete, ctx->cq_entries);
9408
32b2244a
XW
9409 /*
9410 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9411 * space applications don't need to poll for io completion events
9412 * again; they can rely on io_sq_thread to do the polling work,
9413 * which can reduce cpu usage and uring_lock contention.
9414 */
9415 if (ctx->flags & IORING_SETUP_IOPOLL &&
9416 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7668b92a 9417 ret = io_iopoll_check(ctx, min_complete);
def596e9 9418 } else {
c73ebb68 9419 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 9420 }
2b188cc1
JA
9421 }
9422
7c504e65 9423out:
6805b32e 9424 percpu_ref_put(&ctx->refs);
2b188cc1
JA
9425out_fput:
9426 fdput(f);
9427 return submitted ? submitted : ret;
9428}
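/*
 * A sketch of how userspace is expected to drive the SQPOLL branch above:
 * entries are normally consumed without a syscall, and io_uring_enter() is
 * only needed to wake an idle SQ thread. The helper name, ring_fd and the
 * sq_flags pointer (the mapped word at sq_off.flags) are assumptions.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static long submit_sqpoll(int ring_fd, const unsigned *sq_flags,
			  unsigned to_submit)
{
	if (!(__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) &
	      IORING_SQ_NEED_WAKEUP))
		return to_submit;	/* SQ thread is awake and polling */

	return syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
		       IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
#endif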
9429
bebdb65e 9430#ifdef CONFIG_PROC_FS
61cf9370
MWO
9431static int io_uring_show_cred(struct seq_file *m, unsigned int id,
9432 const struct cred *cred)
87ce955b 9433{
87ce955b
JA
9434 struct user_namespace *uns = seq_user_ns(m);
9435 struct group_info *gi;
9436 kernel_cap_t cap;
9437 unsigned __capi;
9438 int g;
9439
9440 seq_printf(m, "%5d\n", id);
9441 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9442 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9443 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9444 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9445 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9446 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9447 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9448 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9449 seq_puts(m, "\n\tGroups:\t");
9450 gi = cred->group_info;
9451 for (g = 0; g < gi->ngroups; g++) {
9452 seq_put_decimal_ull(m, g ? " " : "",
9453 from_kgid_munged(uns, gi->gid[g]));
9454 }
9455 seq_puts(m, "\n\tCapEff:\t");
9456 cap = cred->cap_effective;
9457 CAP_FOR_EACH_U32(__capi)
9458 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9459 seq_putc(m, '\n');
9460 return 0;
9461}
9462
9463static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9464{
dbbe9c64 9465 struct io_sq_data *sq = NULL;
fad8e0de 9466 bool has_lock;
87ce955b
JA
9467 int i;
9468
fad8e0de
JA
9469 /*
9470 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9471 * since fdinfo case grabs it in the opposite direction of normal use
9472 * cases. If we fail to get the lock, we just don't iterate any
9473 * structures that could be going away outside the io_uring mutex.
9474 */
9475 has_lock = mutex_trylock(&ctx->uring_lock);
9476
5f3f26f9 9477 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 9478 sq = ctx->sq_data;
5f3f26f9
JA
9479 if (!sq->thread)
9480 sq = NULL;
9481 }
dbbe9c64
JQ
9482
9483 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9484 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 9485 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 9486 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 9487 struct file *f = io_file_from_index(ctx, i);
87ce955b 9488
87ce955b
JA
9489 if (f)
9490 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9491 else
9492 seq_printf(m, "%5u: <none>\n", i);
9493 }
9494 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 9495 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 9496 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 9497 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 9498
4751f53d 9499 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 9500 }
61cf9370
MWO
9501 if (has_lock && !xa_empty(&ctx->personalities)) {
9502 unsigned long index;
9503 const struct cred *cred;
9504
87ce955b 9505 seq_printf(m, "Personalities:\n");
61cf9370
MWO
9506 xa_for_each(&ctx->personalities, index, cred)
9507 io_uring_show_cred(m, index, cred);
87ce955b 9508 }
d7718a9d
JA
9509 seq_printf(m, "PollList:\n");
9510 spin_lock_irq(&ctx->completion_lock);
9511 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9512 struct hlist_head *list = &ctx->cancel_hash[i];
9513 struct io_kiocb *req;
9514
9515 hlist_for_each_entry(req, list, hash_node)
9516 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
9517 req->task->task_works != NULL);
9518 }
9519 spin_unlock_irq(&ctx->completion_lock);
fad8e0de
JA
9520 if (has_lock)
9521 mutex_unlock(&ctx->uring_lock);
87ce955b
JA
9522}
9523
9524static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9525{
9526 struct io_ring_ctx *ctx = f->private_data;
9527
9528 if (percpu_ref_tryget(&ctx->refs)) {
9529 __io_uring_show_fdinfo(ctx, m);
9530 percpu_ref_put(&ctx->refs);
9531 }
9532}
bebdb65e 9533#endif
87ce955b 9534
2b188cc1
JA
9535static const struct file_operations io_uring_fops = {
9536 .release = io_uring_release,
9537 .mmap = io_uring_mmap,
6c5c240e
RP
9538#ifndef CONFIG_MMU
9539 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9540 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9541#endif
2b188cc1
JA
9542 .poll = io_uring_poll,
9543 .fasync = io_uring_fasync,
bebdb65e 9544#ifdef CONFIG_PROC_FS
87ce955b 9545 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 9546#endif
2b188cc1
JA
9547};
9548
9549static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9550 struct io_uring_params *p)
9551{
75b28aff
HV
9552 struct io_rings *rings;
9553 size_t size, sq_array_offset;
2b188cc1 9554
bd740481
JA
9555 /* make sure these are sane, as we already accounted them */
9556 ctx->sq_entries = p->sq_entries;
9557 ctx->cq_entries = p->cq_entries;
9558
75b28aff
HV
9559 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9560 if (size == SIZE_MAX)
9561 return -EOVERFLOW;
9562
9563 rings = io_mem_alloc(size);
9564 if (!rings)
2b188cc1
JA
9565 return -ENOMEM;
9566
75b28aff
HV
9567 ctx->rings = rings;
9568 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9569 rings->sq_ring_mask = p->sq_entries - 1;
9570 rings->cq_ring_mask = p->cq_entries - 1;
9571 rings->sq_ring_entries = p->sq_entries;
9572 rings->cq_ring_entries = p->cq_entries;
2b188cc1
JA
9573
9574 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
9575 if (size == SIZE_MAX) {
9576 io_mem_free(ctx->rings);
9577 ctx->rings = NULL;
2b188cc1 9578 return -EOVERFLOW;
eb065d30 9579 }
2b188cc1
JA
9580
9581 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
9582 if (!ctx->sq_sqes) {
9583 io_mem_free(ctx->rings);
9584 ctx->rings = NULL;
2b188cc1 9585 return -ENOMEM;
eb065d30 9586 }
2b188cc1 9587
2b188cc1
JA
9588 return 0;
9589}
9590
9faadcc8
PB
9591static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9592{
9593 int ret, fd;
9594
9595 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9596 if (fd < 0)
9597 return fd;
9598
eef51daa 9599 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
9600 if (ret) {
9601 put_unused_fd(fd);
9602 return ret;
9603 }
9604 fd_install(fd, file);
9605 return fd;
9606}
9607
2b188cc1
JA
9608/*
9609 * Allocate an anonymous fd; this is what constitutes the application
9610 * visible backing of an io_uring instance. The application mmaps this
9611 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9612 * we have to tie this fd to a socket for file garbage collection purposes.
9613 */
9faadcc8 9614static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
9615{
9616 struct file *file;
9faadcc8 9617#if defined(CONFIG_UNIX)
2b188cc1
JA
9618 int ret;
9619
2b188cc1
JA
9620 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9621 &ctx->ring_sock);
9622 if (ret)
9faadcc8 9623 return ERR_PTR(ret);
2b188cc1
JA
9624#endif
9625
2b188cc1
JA
9626 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9627 O_RDWR | O_CLOEXEC);
2b188cc1 9628#if defined(CONFIG_UNIX)
9faadcc8
PB
9629 if (IS_ERR(file)) {
9630 sock_release(ctx->ring_sock);
9631 ctx->ring_sock = NULL;
9632 } else {
9633 ctx->ring_sock->file = file;
0f212204 9634 }
2b188cc1 9635#endif
9faadcc8 9636 return file;
2b188cc1
JA
9637}
9638
7f13657d
XW
9639static int io_uring_create(unsigned entries, struct io_uring_params *p,
9640 struct io_uring_params __user *params)
2b188cc1 9641{
2b188cc1 9642 struct io_ring_ctx *ctx;
9faadcc8 9643 struct file *file;
2b188cc1
JA
9644 int ret;
9645
8110c1a6 9646 if (!entries)
2b188cc1 9647 return -EINVAL;
8110c1a6
JA
9648 if (entries > IORING_MAX_ENTRIES) {
9649 if (!(p->flags & IORING_SETUP_CLAMP))
9650 return -EINVAL;
9651 entries = IORING_MAX_ENTRIES;
9652 }
2b188cc1
JA
9653
9654 /*
9655 * Use twice as many entries for the CQ ring. It's possible for the
9656 * application to drive a higher depth than the size of the SQ ring,
9657 * since the sqes are only used at submission time. This allows for
33a107f0
JA
9658 * some flexibility in overcommitting a bit. If the application has
9659 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9660 * of CQ ring entries manually.
2b188cc1
JA
9661 */
9662 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
9663 if (p->flags & IORING_SETUP_CQSIZE) {
9664 /*
9665 * If IORING_SETUP_CQSIZE is set, we do the same roundup
9666 * to a power-of-two, if it isn't already. We do NOT impose
9667 * any cq vs sq ring sizing.
9668 */
eb2667b3 9669 if (!p->cq_entries)
33a107f0 9670 return -EINVAL;
8110c1a6
JA
9671 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9672 if (!(p->flags & IORING_SETUP_CLAMP))
9673 return -EINVAL;
9674 p->cq_entries = IORING_MAX_CQ_ENTRIES;
9675 }
eb2667b3
JQ
9676 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9677 if (p->cq_entries < p->sq_entries)
9678 return -EINVAL;
33a107f0
JA
9679 } else {
9680 p->cq_entries = 2 * p->sq_entries;
9681 }
2b188cc1 9682
2b188cc1 9683 ctx = io_ring_ctx_alloc(p);
62e398be 9684 if (!ctx)
2b188cc1 9685 return -ENOMEM;
2b188cc1 9686 ctx->compat = in_compat_syscall();
62e398be
JA
9687 if (!capable(CAP_IPC_LOCK))
9688 ctx->user = get_uid(current_user());
2aede0e4
JA
9689
9690 /*
9691 * This is just grabbed for accounting purposes. When a process exits,
9692 * the mm is exited and dropped before the files, hence we need to hang
9693 * on to this mm purely for the purposes of being able to unaccount
9694 * memory (locked/pinned vm). It's not used for anything else.
9695 */
6b7898eb 9696 mmgrab(current->mm);
2aede0e4 9697 ctx->mm_account = current->mm;
6b7898eb 9698
2b188cc1
JA
9699 ret = io_allocate_scq_urings(ctx, p);
9700 if (ret)
9701 goto err;
9702
7e84e1c7 9703 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
9704 if (ret)
9705 goto err;
eae071c9 9706 /* always set a rsrc node */
47b228ce
PB
9707 ret = io_rsrc_node_switch_start(ctx);
9708 if (ret)
9709 goto err;
eae071c9 9710 io_rsrc_node_switch(ctx, NULL);
2b188cc1 9711
2b188cc1 9712 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
9713 p->sq_off.head = offsetof(struct io_rings, sq.head);
9714 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9715 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9716 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9717 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9718 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9719 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
9720
9721 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
9722 p->cq_off.head = offsetof(struct io_rings, cq.head);
9723 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9724 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9725 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9726 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9727 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 9728 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 9729
7f13657d
XW
9730 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9731 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 9732 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 9733 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e
PB
9734 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
9735 IORING_FEAT_RSRC_TAGS;
7f13657d
XW
9736
9737 if (copy_to_user(params, p, sizeof(*p))) {
9738 ret = -EFAULT;
9739 goto err;
9740 }
d1719f70 9741
9faadcc8
PB
9742 file = io_uring_get_file(ctx);
9743 if (IS_ERR(file)) {
9744 ret = PTR_ERR(file);
9745 goto err;
9746 }
9747
044c1ab3
JA
9748 /*
9749 * Install ring fd as the very last thing, so we don't risk someone
9750 * having closed it before we finish setup
9751 */
9faadcc8
PB
9752 ret = io_uring_install_fd(ctx, file);
9753 if (ret < 0) {
9754 /* fput will clean it up */
9755 fput(file);
9756 return ret;
9757 }
044c1ab3 9758
c826bd7a 9759 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
9760 return ret;
9761err:
9762 io_ring_ctx_wait_and_kill(ctx);
9763 return ret;
9764}
9765
9766/*
9767 * Sets up an io_uring context and returns the fd. The application asks for a
9768 * ring size; we return the actual sq/cq ring sizes (among other things) in the
9769 * params structure passed in.
9770 */
9771static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9772{
9773 struct io_uring_params p;
2b188cc1
JA
9774 int i;
9775
9776 if (copy_from_user(&p, params, sizeof(p)))
9777 return -EFAULT;
9778 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9779 if (p.resv[i])
9780 return -EINVAL;
9781 }
9782
6c271ce2 9783 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 9784 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7
SG
9785 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9786 IORING_SETUP_R_DISABLED))
2b188cc1
JA
9787 return -EINVAL;
9788
7f13657d 9789 return io_uring_create(entries, &p, params);
2b188cc1
JA
9790}
9791
9792SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9793 struct io_uring_params __user *, params)
9794{
9795 return io_uring_setup(entries, params);
9796}
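/*
 * A userspace sketch of the sizing rules in io_uring_create(): asking for 100
 * SQ entries with an explicit CQ size. The helper name and the particular
 * sizes are illustrative assumptions.
 */
#if 0
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_ring(struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_CQSIZE;
	p->cq_entries = 4096;

	/*
	 * entries is rounded up to 128; cq_entries is already a power of two.
	 * The kernel reports the final sizes and offsets back in *p.
	 */
	return syscall(__NR_io_uring_setup, 100, p);
}
#endif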
9797
66f4af93
JA
9798static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9799{
9800 struct io_uring_probe *p;
9801 size_t size;
9802 int i, ret;
9803
9804 size = struct_size(p, ops, nr_args);
9805 if (size == SIZE_MAX)
9806 return -EOVERFLOW;
9807 p = kzalloc(size, GFP_KERNEL);
9808 if (!p)
9809 return -ENOMEM;
9810
9811 ret = -EFAULT;
9812 if (copy_from_user(p, arg, size))
9813 goto out;
9814 ret = -EINVAL;
9815 if (memchr_inv(p, 0, size))
9816 goto out;
9817
9818 p->last_op = IORING_OP_LAST - 1;
9819 if (nr_args > IORING_OP_LAST)
9820 nr_args = IORING_OP_LAST;
9821
9822 for (i = 0; i < nr_args; i++) {
9823 p->ops[i].op = i;
9824 if (!io_op_defs[i].not_supported)
9825 p->ops[i].flags = IO_URING_OP_SUPPORTED;
9826 }
9827 p->ops_len = i;
9828
9829 ret = 0;
9830 if (copy_to_user(arg, p, size))
9831 ret = -EFAULT;
9832out:
9833 kfree(p);
9834 return ret;
9835}
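/*
 * A userspace sketch of the probe interface above: asking the kernel which
 * opcodes it supports. The helper name and ring_fd are assumptions; the
 * probe structure must be zero-filled before the call.
 */
#if 0
#include <linux/io_uring.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static void probe_ops(int ring_fd)
{
	size_t len = sizeof(struct io_uring_probe) +
		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *p = calloc(1, len);
	int i;

	if (!p)
		return;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    p, IORING_OP_LAST) >= 0) {
		for (i = 0; i < p->ops_len; i++)
			printf("op %u %ssupported\n", p->ops[i].op,
			       (p->ops[i].flags & IO_URING_OP_SUPPORTED) ?
			       "" : "not ");
	}
	free(p);
}
#endif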
9836
071698e1
JA
9837static int io_register_personality(struct io_ring_ctx *ctx)
9838{
4379bf8b 9839 const struct cred *creds;
61cf9370 9840 u32 id;
1e6fa521 9841 int ret;
071698e1 9842
4379bf8b 9843 creds = get_current_cred();
1e6fa521 9844
61cf9370
MWO
9845 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
9846 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
9847 if (ret < 0) {
9848 put_cred(creds);
9849 return ret;
9850 }
9851 return id;
071698e1
JA
9852}
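/*
 * A userspace sketch of personality registration: the returned id snapshots
 * the caller's credentials and can later be placed in sqe->personality. The
 * helper name and ring_fd are assumptions.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_personality(int ring_fd)
{
	/* arg must be NULL and nr_args 0; the return value is the id */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}
#endif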
9853
21b55dbc
SG
9854static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9855 unsigned int nr_args)
9856{
9857 struct io_uring_restriction *res;
9858 size_t size;
9859 int i, ret;
9860
7e84e1c7
SG
9861 /* Restrictions allowed only if rings started disabled */
9862 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9863 return -EBADFD;
9864
21b55dbc 9865 /* We allow only a single restrictions registration */
7e84e1c7 9866 if (ctx->restrictions.registered)
21b55dbc
SG
9867 return -EBUSY;
9868
9869 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9870 return -EINVAL;
9871
9872 size = array_size(nr_args, sizeof(*res));
9873 if (size == SIZE_MAX)
9874 return -EOVERFLOW;
9875
9876 res = memdup_user(arg, size);
9877 if (IS_ERR(res))
9878 return PTR_ERR(res);
9879
9880 ret = 0;
9881
9882 for (i = 0; i < nr_args; i++) {
9883 switch (res[i].opcode) {
9884 case IORING_RESTRICTION_REGISTER_OP:
9885 if (res[i].register_op >= IORING_REGISTER_LAST) {
9886 ret = -EINVAL;
9887 goto out;
9888 }
9889
9890 __set_bit(res[i].register_op,
9891 ctx->restrictions.register_op);
9892 break;
9893 case IORING_RESTRICTION_SQE_OP:
9894 if (res[i].sqe_op >= IORING_OP_LAST) {
9895 ret = -EINVAL;
9896 goto out;
9897 }
9898
9899 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9900 break;
9901 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9902 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9903 break;
9904 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9905 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9906 break;
9907 default:
9908 ret = -EINVAL;
9909 goto out;
9910 }
9911 }
9912
9913out:
9914 /* Reset all restrictions if an error happened */
9915 if (ret != 0)
9916 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9917 else
7e84e1c7 9918 ctx->restrictions.registered = true;
21b55dbc
SG
9919
9920 kfree(res);
9921 return ret;
9922}
9923
7e84e1c7
SG
9924static int io_register_enable_rings(struct io_ring_ctx *ctx)
9925{
9926 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9927 return -EBADFD;
9928
9929 if (ctx->restrictions.registered)
9930 ctx->restricted = 1;
9931
0298ef96
PB
9932 ctx->flags &= ~IORING_SETUP_R_DISABLED;
9933 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
9934 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
9935 return 0;
9936}
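/*
 * A userspace sketch of the two registration calls above: restricting a ring
 * that was created with IORING_SETUP_R_DISABLED to readv/writev, then
 * enabling submissions. The helper name, ring_fd and the chosen opcodes are
 * assumptions.
 */
#if 0
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int restrict_and_enable(int ring_fd)
{
	struct io_uring_restriction res[2];

	memset(res, 0, sizeof(res));
	res[0].opcode = IORING_RESTRICTION_SQE_OP;
	res[0].sqe_op = IORING_OP_READV;
	res[1].opcode = IORING_RESTRICTION_SQE_OP;
	res[1].sqe_op = IORING_OP_WRITEV;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}
#endif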
9937
fdecb662 9938static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 9939 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
9940 unsigned nr_args)
9941{
9942 __u32 tmp;
9943 int err;
9944
c3bdad02
PB
9945 if (up->resv)
9946 return -EINVAL;
98f0b3b4
PB
9947 if (check_add_overflow(up->offset, nr_args, &tmp))
9948 return -EOVERFLOW;
9949 err = io_rsrc_node_switch_start(ctx);
9950 if (err)
9951 return err;
9952
fdecb662
PB
9953 switch (type) {
9954 case IORING_RSRC_FILE:
98f0b3b4 9955 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
9956 case IORING_RSRC_BUFFER:
9957 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
9958 }
9959 return -EINVAL;
9960}
9961
c3bdad02
PB
9962static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
9963 unsigned nr_args)
98f0b3b4 9964{
c3bdad02 9965 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
9966
9967 if (!nr_args)
9968 return -EINVAL;
c3bdad02
PB
9969 memset(&up, 0, sizeof(up));
9970 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
9971 return -EFAULT;
9972 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
9973}
9974
9975static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 9976 unsigned size, unsigned type)
c3bdad02
PB
9977{
9978 struct io_uring_rsrc_update2 up;
9979
9980 if (size != sizeof(up))
9981 return -EINVAL;
98f0b3b4
PB
9982 if (copy_from_user(&up, arg, sizeof(up)))
9983 return -EFAULT;
992da01a 9984 if (!up.nr || up.resv)
98f0b3b4 9985 return -EINVAL;
992da01a 9986 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
9987}
9988
792e3582 9989static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 9990 unsigned int size, unsigned int type)
792e3582
PB
9991{
9992 struct io_uring_rsrc_register rr;
9993
9994 /* keep it extendible */
9995 if (size != sizeof(rr))
9996 return -EINVAL;
9997
9998 memset(&rr, 0, sizeof(rr));
9999 if (copy_from_user(&rr, arg, size))
10000 return -EFAULT;
992da01a 10001 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
10002 return -EINVAL;
10003
992da01a 10004 switch (type) {
792e3582
PB
10005 case IORING_RSRC_FILE:
10006 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10007 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
10008 case IORING_RSRC_BUFFER:
10009 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10010 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
10011 }
10012 return -EINVAL;
10013}
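/*
 * A userspace sketch of the tagged resource registration handled above:
 * registering one fixed buffer through IORING_REGISTER_BUFFERS2 with a tag,
 * so a CQE carrying the tag is posted once the buffer is released. The
 * helper name, ring_fd, buf and the tag value are assumptions.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

static int register_tagged_buffer(int ring_fd, void *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	__u64 tag = 0xbeef;
	struct io_uring_rsrc_register rr = {
		.nr	= 1,
		.data	= (__u64)(unsigned long)&iov,
		.tags	= (__u64)(unsigned long)&tag,
	};

	/* nr_args carries sizeof(rr) to keep the ABI extendible */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS2, &rr, sizeof(rr));
}
#endif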
10014
fe76421d
JA
10015static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
10016 unsigned len)
10017{
10018 struct io_uring_task *tctx = current->io_uring;
10019 cpumask_var_t new_mask;
10020 int ret;
10021
10022 if (!tctx || !tctx->io_wq)
10023 return -EINVAL;
10024
10025 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10026 return -ENOMEM;
10027
10028 cpumask_clear(new_mask);
10029 if (len > cpumask_size())
10030 len = cpumask_size();
10031
10032 if (copy_from_user(new_mask, arg, len)) {
10033 free_cpumask_var(new_mask);
10034 return -EFAULT;
10035 }
10036
10037 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10038 free_cpumask_var(new_mask);
10039 return ret;
10040}
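/*
 * A userspace sketch of the affinity call above: pinning this task's io-wq
 * workers to CPUs 0 and 1. The helper name, ring_fd and the CPU choice are
 * assumptions; nr_args is the mask size in bytes and is capped to
 * cpumask_size() by the kernel.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pin_iowq_workers(int ring_fd)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	CPU_SET(1, &mask);
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}
#endif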
10041
10042static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10043{
10044 struct io_uring_task *tctx = current->io_uring;
10045
10046 if (!tctx || !tctx->io_wq)
10047 return -EINVAL;
10048
10049 return io_wq_cpu_affinity(tctx->io_wq, NULL);
10050}
10051
071698e1
JA
10052static bool io_register_op_must_quiesce(int op)
10053{
10054 switch (op) {
bd54b6fe
BM
10055 case IORING_REGISTER_BUFFERS:
10056 case IORING_UNREGISTER_BUFFERS:
f4f7d21c 10057 case IORING_REGISTER_FILES:
071698e1
JA
10058 case IORING_UNREGISTER_FILES:
10059 case IORING_REGISTER_FILES_UPDATE:
10060 case IORING_REGISTER_PROBE:
10061 case IORING_REGISTER_PERSONALITY:
10062 case IORING_UNREGISTER_PERSONALITY:
992da01a
PB
10063 case IORING_REGISTER_FILES2:
10064 case IORING_REGISTER_FILES_UPDATE2:
10065 case IORING_REGISTER_BUFFERS2:
10066 case IORING_REGISTER_BUFFERS_UPDATE:
fe76421d
JA
10067 case IORING_REGISTER_IOWQ_AFF:
10068 case IORING_UNREGISTER_IOWQ_AFF:
071698e1
JA
10069 return false;
10070 default:
10071 return true;
10072 }
10073}
10074
e73c5c7c
PB
10075static int io_ctx_quiesce(struct io_ring_ctx *ctx)
10076{
10077 long ret;
10078
10079 percpu_ref_kill(&ctx->refs);
10080
10081 /*
10082 * Drop uring mutex before waiting for references to exit. If another
10083 * thread is currently inside io_uring_enter() it might need to grab the
10084 * uring_lock to make progress. If we hold it here across the drain
10085 * wait, then we can deadlock. It's safe to drop the mutex here, since
10086 * no new references will come in after we've killed the percpu ref.
10087 */
10088 mutex_unlock(&ctx->uring_lock);
10089 do {
10090 ret = wait_for_completion_interruptible(&ctx->ref_comp);
10091 if (!ret)
10092 break;
10093 ret = io_run_task_work_sig();
10094 } while (ret >= 0);
10095 mutex_lock(&ctx->uring_lock);
10096
10097 if (ret)
10098 io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10099 return ret;
10100}
10101
edafccee
JA
10102static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10103 void __user *arg, unsigned nr_args)
b19062a5
JA
10104 __releases(ctx->uring_lock)
10105 __acquires(ctx->uring_lock)
edafccee
JA
10106{
10107 int ret;
10108
35fa71a0
JA
10109 /*
10110 * We're inside the ring mutex; if the ref is already dying, then
10111 * someone else killed the ctx or is already going through
10112 * io_uring_register().
10113 */
10114 if (percpu_ref_is_dying(&ctx->refs))
10115 return -ENXIO;
10116
75c4021a
PB
10117 if (ctx->restricted) {
10118 if (opcode >= IORING_REGISTER_LAST)
10119 return -EINVAL;
10120 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10121 if (!test_bit(opcode, ctx->restrictions.register_op))
10122 return -EACCES;
10123 }
10124
071698e1 10125 if (io_register_op_must_quiesce(opcode)) {
e73c5c7c
PB
10126 ret = io_ctx_quiesce(ctx);
10127 if (ret)
f70865db 10128 return ret;
05f3fb3c 10129 }
edafccee
JA
10130
10131 switch (opcode) {
10132 case IORING_REGISTER_BUFFERS:
634d00df 10133 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
10134 break;
10135 case IORING_UNREGISTER_BUFFERS:
10136 ret = -EINVAL;
10137 if (arg || nr_args)
10138 break;
0a96bbe4 10139 ret = io_sqe_buffers_unregister(ctx);
edafccee 10140 break;
6b06314c 10141 case IORING_REGISTER_FILES:
792e3582 10142 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
10143 break;
10144 case IORING_UNREGISTER_FILES:
10145 ret = -EINVAL;
10146 if (arg || nr_args)
10147 break;
10148 ret = io_sqe_files_unregister(ctx);
10149 break;
c3a31e60 10150 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 10151 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 10152 break;
9b402849 10153 case IORING_REGISTER_EVENTFD:
f2842ab5 10154 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
10155 ret = -EINVAL;
10156 if (nr_args != 1)
10157 break;
10158 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
10159 if (ret)
10160 break;
10161 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10162 ctx->eventfd_async = 1;
10163 else
10164 ctx->eventfd_async = 0;
9b402849
JA
10165 break;
10166 case IORING_UNREGISTER_EVENTFD:
10167 ret = -EINVAL;
10168 if (arg || nr_args)
10169 break;
10170 ret = io_eventfd_unregister(ctx);
10171 break;
66f4af93
JA
10172 case IORING_REGISTER_PROBE:
10173 ret = -EINVAL;
10174 if (!arg || nr_args > 256)
10175 break;
10176 ret = io_probe(ctx, arg, nr_args);
10177 break;
071698e1
JA
10178 case IORING_REGISTER_PERSONALITY:
10179 ret = -EINVAL;
10180 if (arg || nr_args)
10181 break;
10182 ret = io_register_personality(ctx);
10183 break;
10184 case IORING_UNREGISTER_PERSONALITY:
10185 ret = -EINVAL;
10186 if (arg)
10187 break;
10188 ret = io_unregister_personality(ctx, nr_args);
10189 break;
7e84e1c7
SG
10190 case IORING_REGISTER_ENABLE_RINGS:
10191 ret = -EINVAL;
10192 if (arg || nr_args)
10193 break;
10194 ret = io_register_enable_rings(ctx);
10195 break;
21b55dbc
SG
10196 case IORING_REGISTER_RESTRICTIONS:
10197 ret = io_register_restrictions(ctx, arg, nr_args);
10198 break;
992da01a
PB
10199 case IORING_REGISTER_FILES2:
10200 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10201 break;
10202 case IORING_REGISTER_FILES_UPDATE2:
10203 ret = io_register_rsrc_update(ctx, arg, nr_args,
10204 IORING_RSRC_FILE);
10205 break;
10206 case IORING_REGISTER_BUFFERS2:
10207 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 10208 break;
992da01a
PB
10209 case IORING_REGISTER_BUFFERS_UPDATE:
10210 ret = io_register_rsrc_update(ctx, arg, nr_args,
10211 IORING_RSRC_BUFFER);
c3bdad02 10212 break;
fe76421d
JA
10213 case IORING_REGISTER_IOWQ_AFF:
10214 ret = -EINVAL;
10215 if (!arg || !nr_args)
10216 break;
10217 ret = io_register_iowq_aff(ctx, arg, nr_args);
10218 break;
10219 case IORING_UNREGISTER_IOWQ_AFF:
10220 ret = -EINVAL;
10221 if (arg || nr_args)
10222 break;
10223 ret = io_unregister_iowq_aff(ctx);
10224 break;
edafccee
JA
10225 default:
10226 ret = -EINVAL;
10227 break;
10228 }
10229
071698e1 10230 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 10231 /* bring the ctx back to life */
05f3fb3c 10232 percpu_ref_reinit(&ctx->refs);
0f158b4c 10233 reinit_completion(&ctx->ref_comp);
05f3fb3c 10234 }
edafccee
JA
10235 return ret;
10236}
10237
10238SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10239 void __user *, arg, unsigned int, nr_args)
10240{
10241 struct io_ring_ctx *ctx;
10242 long ret = -EBADF;
10243 struct fd f;
10244
10245 f = fdget(fd);
10246 if (!f.file)
10247 return -EBADF;
10248
10249 ret = -EOPNOTSUPP;
10250 if (f.file->f_op != &io_uring_fops)
10251 goto out_fput;
10252
10253 ctx = f.file->private_data;
10254
b6c23dd5
PB
10255 io_run_task_work();
10256
edafccee
JA
10257 mutex_lock(&ctx->uring_lock);
10258 ret = __io_uring_register(ctx, opcode, arg, nr_args);
10259 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
10260 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10261 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
10262out_fput:
10263 fdput(f);
10264 return ret;
10265}
10266
2b188cc1
JA
10267static int __init io_uring_init(void)
10268{
d7f62e82
SM
10269#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10270 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10271 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10272} while (0)
10273
10274#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10275 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10276 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10277 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
10278 BUILD_BUG_SQE_ELEM(1, __u8, flags);
10279 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
10280 BUILD_BUG_SQE_ELEM(4, __s32, fd);
10281 BUILD_BUG_SQE_ELEM(8, __u64, off);
10282 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
10283 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 10284 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
10285 BUILD_BUG_SQE_ELEM(24, __u32, len);
10286 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
10287 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
10288 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10289 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
10290 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
10291 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
10292 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
10293 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
10294 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
10295 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
10296 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
10297 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
10298 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
10299 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 10300 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
10301 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
10302 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 10303 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 10304 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 10305 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
d7f62e82 10306
b0d658ec
PB
10307 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
10308 sizeof(struct io_uring_rsrc_update));
10309 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
10310 sizeof(struct io_uring_rsrc_update2));
10311 /* should fit into one byte */
10312 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
10313
d3656344 10314 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
84557871 10315 BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
16340eab 10316
91f245d5
JA
10317 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10318 SLAB_ACCOUNT);
2b188cc1
JA
10319 return 0;
10320};
10321__initcall(io_uring_init);