git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - fs/io_uring.c
io_uring: cancel pending async work if task exits
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
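/*
 * A minimal userspace sketch of the CQ reaping rules described above
 * (illustrative only; assumes the rings were mmap'ed, the cq_head, cq_tail,
 * cq_mask and cqes pointers were resolved via io_cqring_offsets, and
 * load_acquire()/store_release() stand for C11 acquire/release atomics):
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = load_acquire(cq_tail);	// pairs with the kernel's tail store
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *		consume_cqe(cqe);		// application-defined placeholder
 *		head++;
 *	}
 *	store_release(cq_head, head);		// orders entry loads before the head store
 */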
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <linux/refcount.h>
48#include <linux/uio.h>
6b47ee6e 49#include <linux/bits.h>
2b188cc1
JA
50
51#include <linux/sched/signal.h>
52#include <linux/fs.h>
53#include <linux/file.h>
54#include <linux/fdtable.h>
55#include <linux/mm.h>
56#include <linux/mman.h>
57#include <linux/mmu_context.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
6c271ce2 60#include <linux/kthread.h>
2b188cc1 61#include <linux/blkdev.h>
edafccee 62#include <linux/bvec.h>
2b188cc1
JA
63#include <linux/net.h>
64#include <net/sock.h>
65#include <net/af_unix.h>
6b06314c 66#include <net/scm.h>
2b188cc1
JA
67#include <linux/anon_inodes.h>
68#include <linux/sched/mm.h>
69#include <linux/uaccess.h>
70#include <linux/nospec.h>
edafccee
JA
71#include <linux/sizes.h>
72#include <linux/hugetlb.h>
aa4c3967 73#include <linux/highmem.h>
15b71abe
JA
74#include <linux/namei.h>
75#include <linux/fsnotify.h>
4840e418 76#include <linux/fadvise.h>
3e4827b0 77#include <linux/eventpoll.h>
ff002b30 78#include <linux/fs_struct.h>
2b188cc1 79
c826bd7a
DD
80#define CREATE_TRACE_POINTS
81#include <trace/events/io_uring.h>
82
2b188cc1
JA
83#include <uapi/linux/io_uring.h>
84
85#include "internal.h"
561fb04a 86#include "io-wq.h"
2b188cc1 87
5277deaa 88#define IORING_MAX_ENTRIES 32768
33a107f0 89#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
65e19f54
JA
90
91/*
92 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
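 * (i.e. 512 'struct file *' pointers * 8 bytes each = 4096 bytes, one 4K page)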
93 */
94#define IORING_FILE_TABLE_SHIFT 9
95#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
96#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
97#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
2b188cc1
JA
98
99struct io_uring {
100 u32 head ____cacheline_aligned_in_smp;
101 u32 tail ____cacheline_aligned_in_smp;
102};
103
1e84b97b 104/*
75b28aff
HV
105 * This data is shared with the application through the mmap at offsets
106 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
107 *
108 * The offsets to the member fields are published through struct
109 * io_sqring_offsets when calling io_uring_setup.
110 */
75b28aff 111struct io_rings {
1e84b97b
SB
112 /*
113 * Head and tail offsets into the ring; the offsets need to be
114 * masked to get valid indices.
115 *
75b28aff
HV
116 * The kernel controls head of the sq ring and the tail of the cq ring,
117 * and the application controls tail of the sq ring and the head of the
118 * cq ring.
1e84b97b 119 */
75b28aff 120 struct io_uring sq, cq;
1e84b97b 121 /*
75b28aff 122 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
123 * ring_entries - 1)
124 */
75b28aff
HV
125 u32 sq_ring_mask, cq_ring_mask;
126 /* Ring sizes (constant, power of 2) */
127 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
128 /*
129 * Number of invalid entries dropped by the kernel due to
130 * invalid index stored in array
131 *
132 * Written by the kernel, shouldn't be modified by the
133 * application (i.e. get number of "new events" by comparing to
134 * cached value).
135 *
136 * After a new SQ head value was read by the application this
137 * counter includes all submissions that were dropped reaching
138 * the new SQ head (and possibly more).
139 */
75b28aff 140 u32 sq_dropped;
1e84b97b
SB
141 /*
142 * Runtime flags
143 *
144 * Written by the kernel, shouldn't be modified by the
145 * application.
146 *
147 * The application needs a full memory barrier before checking
148 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
149 */
75b28aff 150 u32 sq_flags;
1e84b97b
SB
151 /*
152 * Number of completion events lost because the queue was full;
153 * this should be avoided by the application by making sure
0b4295b5 154 * there are not more requests pending than there is space in
1e84b97b
SB
155 * the completion queue.
156 *
157 * Written by the kernel, shouldn't be modified by the
158 * application (i.e. get number of "new events" by comparing to
159 * cached value).
160 *
161 * As completion events come in out of order this counter is not
162 * ordered with any other data.
163 */
75b28aff 164 u32 cq_overflow;
1e84b97b
SB
165 /*
166 * Ring buffer of completion events.
167 *
168 * The kernel writes completion events fresh every time they are
169 * produced, so the application is allowed to modify pending
170 * entries.
171 */
75b28aff 172 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
173};
174
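/*
 * Illustrative indexing sketch (simplified): offsets into either ring are
 * masked rather than reduced modulo the size, which works because
 * *_ring_entries is always a power of 2 and the mask is (entries - 1):
 *
 *	unsigned idx = tail & rings->cq_ring_mask;
 *	struct io_uring_cqe *cqe = &rings->cqes[idx];
 */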
edafccee
JA
175struct io_mapped_ubuf {
176 u64 ubuf;
177 size_t len;
178 struct bio_vec *bvec;
179 unsigned int nr_bvecs;
180};
181
65e19f54
JA
182struct fixed_file_table {
183 struct file **files;
31b51510
JA
184};
185
05f3fb3c
JA
186enum {
187 FFD_F_ATOMIC,
188};
189
190struct fixed_file_data {
191 struct fixed_file_table *table;
192 struct io_ring_ctx *ctx;
193
194 struct percpu_ref refs;
195 struct llist_head put_llist;
196 unsigned long state;
197 struct work_struct ref_work;
198 struct completion done;
199};
200
2b188cc1
JA
201struct io_ring_ctx {
202 struct {
203 struct percpu_ref refs;
204 } ____cacheline_aligned_in_smp;
205
206 struct {
207 unsigned int flags;
e1d85334
RD
208 unsigned int compat: 1;
209 unsigned int account_mem: 1;
210 unsigned int cq_overflow_flushed: 1;
211 unsigned int drain_next: 1;
212 unsigned int eventfd_async: 1;
2b188cc1 213
75b28aff
HV
214 /*
215 * Ring buffer of indices into array of io_uring_sqe, which is
216 * mmapped by the application using the IORING_OFF_SQES offset.
217 *
218 * This indirection could e.g. be used to assign fixed
219 * io_uring_sqe entries to operations and only submit them to
220 * the queue when needed.
221 *
222 * The kernel modifies neither the indices array nor the entries
223 * array.
224 */
225 u32 *sq_array;
2b188cc1
JA
226 unsigned cached_sq_head;
227 unsigned sq_entries;
228 unsigned sq_mask;
6c271ce2 229 unsigned sq_thread_idle;
498ccd9e 230 unsigned cached_sq_dropped;
206aefde 231 atomic_t cached_cq_overflow;
ad3eb2c8 232 unsigned long sq_check_overflow;
de0617e4
JA
233
234 struct list_head defer_list;
5262f567 235 struct list_head timeout_list;
1d7bb1d5 236 struct list_head cq_overflow_list;
fcb323cc
JA
237
238 wait_queue_head_t inflight_wait;
ad3eb2c8 239 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
240 } ____cacheline_aligned_in_smp;
241
206aefde
JA
242 struct io_rings *rings;
243
2b188cc1 244 /* IO offload */
561fb04a 245 struct io_wq *io_wq;
6c271ce2 246 struct task_struct *sqo_thread; /* if using sq thread polling */
2b188cc1 247 struct mm_struct *sqo_mm;
6c271ce2 248 wait_queue_head_t sqo_wait;
75b28aff 249
6b06314c
JA
250 /*
251 * If used, fixed file set. Writers must ensure that ->refs is dead,
252 * readers must ensure that ->refs is alive as long as the file* is
253 * used. Only updated through io_uring_register(2).
254 */
05f3fb3c 255 struct fixed_file_data *file_data;
6b06314c 256 unsigned nr_user_files;
b14cca0c
PB
257 int ring_fd;
258 struct file *ring_file;
6b06314c 259
edafccee
JA
260 /* if used, fixed mapped user buffers */
261 unsigned nr_user_bufs;
262 struct io_mapped_ubuf *user_bufs;
263
2b188cc1
JA
264 struct user_struct *user;
265
0b8c0ec7 266 const struct cred *creds;
181e448d 267
206aefde
JA
268 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
269 struct completion *completions;
270
0ddf92e8
JA
271 /* if all else fails... */
272 struct io_kiocb *fallback_req;
273
206aefde
JA
274#if defined(CONFIG_UNIX)
275 struct socket *ring_sock;
276#endif
277
071698e1
JA
278 struct idr personality_idr;
279
206aefde
JA
280 struct {
281 unsigned cached_cq_tail;
282 unsigned cq_entries;
283 unsigned cq_mask;
284 atomic_t cq_timeouts;
ad3eb2c8 285 unsigned long cq_check_overflow;
206aefde
JA
286 struct wait_queue_head cq_wait;
287 struct fasync_struct *cq_fasync;
288 struct eventfd_ctx *cq_ev_fd;
289 } ____cacheline_aligned_in_smp;
2b188cc1
JA
290
291 struct {
292 struct mutex uring_lock;
293 wait_queue_head_t wait;
294 } ____cacheline_aligned_in_smp;
295
296 struct {
297 spinlock_t completion_lock;
e94f141b
JA
298 struct llist_head poll_llist;
299
def596e9
JA
300 /*
301 * ->poll_list is protected by the ctx->uring_lock for
302 * io_uring instances that don't use IORING_SETUP_SQPOLL.
303 * For SQPOLL, only the single threaded io_sq_thread() will
304 * manipulate the list, hence no extra locking is needed there.
305 */
306 struct list_head poll_list;
78076bb6
JA
307 struct hlist_head *cancel_hash;
308 unsigned cancel_hash_bits;
e94f141b 309 bool poll_multi_file;
31b51510 310
fcb323cc
JA
311 spinlock_t inflight_lock;
312 struct list_head inflight_list;
2b188cc1 313 } ____cacheline_aligned_in_smp;
2b188cc1
JA
314};
315
09bb8394
JA
316/*
317 * First field must be the file pointer in all the
318 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
319 */
221c5eb2
JA
320struct io_poll_iocb {
321 struct file *file;
0969e783
JA
322 union {
323 struct wait_queue_head *head;
324 u64 addr;
325 };
221c5eb2 326 __poll_t events;
8c838788 327 bool done;
221c5eb2 328 bool canceled;
392edb45 329 struct wait_queue_entry wait;
221c5eb2
JA
330};
331
b5dba59e
JA
332struct io_close {
333 struct file *file;
334 struct file *put_file;
335 int fd;
336};
337
ad8a48ac
JA
338struct io_timeout_data {
339 struct io_kiocb *req;
340 struct hrtimer timer;
341 struct timespec64 ts;
342 enum hrtimer_mode mode;
cc42e0ac 343 u32 seq_offset;
ad8a48ac
JA
344};
345
8ed8d3c3
JA
346struct io_accept {
347 struct file *file;
348 struct sockaddr __user *addr;
349 int __user *addr_len;
350 int flags;
351};
352
353struct io_sync {
354 struct file *file;
355 loff_t len;
356 loff_t off;
357 int flags;
d63d1b5e 358 int mode;
8ed8d3c3
JA
359};
360
fbf23849
JA
361struct io_cancel {
362 struct file *file;
363 u64 addr;
364};
365
b29472ee
JA
366struct io_timeout {
367 struct file *file;
368 u64 addr;
369 int flags;
26a61679 370 unsigned count;
b29472ee
JA
371};
372
9adbd45d
JA
373struct io_rw {
374 /* NOTE: kiocb has the file as the first member, so don't do it here */
375 struct kiocb kiocb;
376 u64 addr;
377 u64 len;
378};
379
3fbb51c1
JA
380struct io_connect {
381 struct file *file;
382 struct sockaddr __user *addr;
383 int addr_len;
384};
385
e47293fd
JA
386struct io_sr_msg {
387 struct file *file;
fddaface
JA
388 union {
389 struct user_msghdr __user *msg;
390 void __user *buf;
391 };
e47293fd 392 int msg_flags;
fddaface 393 size_t len;
e47293fd
JA
394};
395
15b71abe
JA
396struct io_open {
397 struct file *file;
398 int dfd;
eddc7ef5 399 union {
eddc7ef5
JA
400 unsigned mask;
401 };
15b71abe 402 struct filename *filename;
eddc7ef5 403 struct statx __user *buffer;
c12cedf2 404 struct open_how how;
15b71abe
JA
405};
406
05f3fb3c
JA
407struct io_files_update {
408 struct file *file;
409 u64 arg;
410 u32 nr_args;
411 u32 offset;
412};
413
4840e418
JA
414struct io_fadvise {
415 struct file *file;
416 u64 offset;
417 u32 len;
418 u32 advice;
419};
420
c1ca757b
JA
421struct io_madvise {
422 struct file *file;
423 u64 addr;
424 u32 len;
425 u32 advice;
426};
427
3e4827b0
JA
428struct io_epoll {
429 struct file *file;
430 int epfd;
431 int op;
432 int fd;
433 struct epoll_event event;
e47293fd
JA
434};
435
f499a021
JA
436struct io_async_connect {
437 struct sockaddr_storage address;
438};
439
03b1230c
JA
440struct io_async_msghdr {
441 struct iovec fast_iov[UIO_FASTIOV];
442 struct iovec *iov;
443 struct sockaddr __user *uaddr;
444 struct msghdr msg;
445};
446
f67676d1
JA
447struct io_async_rw {
448 struct iovec fast_iov[UIO_FASTIOV];
449 struct iovec *iov;
450 ssize_t nr_segs;
451 ssize_t size;
452};
453
1a6b74fc 454struct io_async_ctx {
f67676d1
JA
455 union {
456 struct io_async_rw rw;
03b1230c 457 struct io_async_msghdr msg;
f499a021 458 struct io_async_connect connect;
2d28390a 459 struct io_timeout_data timeout;
f67676d1 460 };
1a6b74fc
JA
461};
462
6b47ee6e
PB
463enum {
464 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
465 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
466 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
467 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
468 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
469
470 REQ_F_LINK_NEXT_BIT,
471 REQ_F_FAIL_LINK_BIT,
472 REQ_F_INFLIGHT_BIT,
473 REQ_F_CUR_POS_BIT,
474 REQ_F_NOWAIT_BIT,
475 REQ_F_IOPOLL_COMPLETED_BIT,
476 REQ_F_LINK_TIMEOUT_BIT,
477 REQ_F_TIMEOUT_BIT,
478 REQ_F_ISREG_BIT,
479 REQ_F_MUST_PUNT_BIT,
480 REQ_F_TIMEOUT_NOSEQ_BIT,
481 REQ_F_COMP_LOCKED_BIT,
99bc4c38 482 REQ_F_NEED_CLEANUP_BIT,
6b47ee6e
PB
483};
484
485enum {
486 /* ctx owns file */
487 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
488 /* drain existing IO first */
489 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
490 /* linked sqes */
491 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
492 /* doesn't sever on completion < 0 */
493 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
494 /* IOSQE_ASYNC */
495 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
496
497 /* already grabbed next link */
498 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
499 /* fail rest of links */
500 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
501 /* on inflight list */
502 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
503 /* read/write uses file position */
504 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
505 /* must not punt to workers */
506 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
507 /* polled IO has completed */
508 REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
509 /* has linked timeout */
510 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
511 /* timeout request */
512 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
513 /* regular file */
514 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
515 /* must be punted even for NONBLOCK */
516 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
517 /* no timeout sequence */
518 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
519 /* completion under lock */
520 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
99bc4c38
PB
521 /* needs cleanup */
522 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
6b47ee6e
PB
523};
524
09bb8394
JA
525/*
526 * NOTE! Each of the iocb union members has the file pointer
527 * as the first entry in their struct definition. So you can
528 * access the file pointer through any of the sub-structs,
 529 * or directly as just 'file' in this struct.
530 */
2b188cc1 531struct io_kiocb {
221c5eb2 532 union {
09bb8394 533 struct file *file;
9adbd45d 534 struct io_rw rw;
221c5eb2 535 struct io_poll_iocb poll;
8ed8d3c3
JA
536 struct io_accept accept;
537 struct io_sync sync;
fbf23849 538 struct io_cancel cancel;
b29472ee 539 struct io_timeout timeout;
3fbb51c1 540 struct io_connect connect;
e47293fd 541 struct io_sr_msg sr_msg;
15b71abe 542 struct io_open open;
b5dba59e 543 struct io_close close;
05f3fb3c 544 struct io_files_update files_update;
4840e418 545 struct io_fadvise fadvise;
c1ca757b 546 struct io_madvise madvise;
3e4827b0 547 struct io_epoll epoll;
221c5eb2 548 };
2b188cc1 549
1a6b74fc 550 struct io_async_ctx *io;
b14cca0c
PB
551 /*
552 * llist_node is only used for poll deferred completions
553 */
554 struct llist_node llist_node;
cf6fd4bd
PB
555 bool in_async;
556 bool needs_fixed_file;
d625c6ee 557 u8 opcode;
2b188cc1
JA
558
559 struct io_ring_ctx *ctx;
eac406c6
JA
560 union {
561 struct list_head list;
78076bb6 562 struct hlist_node hash_node;
eac406c6 563 };
9e645e11 564 struct list_head link_list;
2b188cc1 565 unsigned int flags;
c16361c1 566 refcount_t refs;
2b188cc1 567 u64 user_data;
9e645e11 568 u32 result;
de0617e4 569 u32 sequence;
2b188cc1 570
fcb323cc
JA
571 struct list_head inflight_entry;
572
561fb04a 573 struct io_wq_work work;
2b188cc1
JA
574};
575
576#define IO_PLUG_THRESHOLD 2
def596e9 577#define IO_IOPOLL_BATCH 8
2b188cc1 578
9a56a232
JA
579struct io_submit_state {
580 struct blk_plug plug;
581
2579f913
JA
582 /*
583 * io_kiocb alloc cache
584 */
585 void *reqs[IO_IOPOLL_BATCH];
6c8a3134 586 unsigned int free_reqs;
2579f913 587
9a56a232
JA
588 /*
589 * File reference cache
590 */
591 struct file *file;
592 unsigned int fd;
593 unsigned int has_refs;
594 unsigned int used_refs;
595 unsigned int ios_left;
596};
597
d3656344
JA
598struct io_op_def {
599 /* needs req->io allocated for deferral/async */
600 unsigned async_ctx : 1;
601 /* needs current->mm setup, does mm access */
602 unsigned needs_mm : 1;
603 /* needs req->file assigned */
604 unsigned needs_file : 1;
605 /* needs req->file assigned IFF fd is >= 0 */
606 unsigned fd_non_neg : 1;
607 /* hash wq insertion if file is a regular file */
608 unsigned hash_reg_file : 1;
609 /* unbound wq insertion if file is a non-regular file */
610 unsigned unbound_nonreg_file : 1;
66f4af93
JA
611 /* opcode is not supported by this kernel */
612 unsigned not_supported : 1;
f86cd20c
JA
613 /* needs file table */
614 unsigned file_table : 1;
ff002b30
JA
615 /* needs ->fs */
616 unsigned needs_fs : 1;
d3656344
JA
617};
618
619static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
620 [IORING_OP_NOP] = {},
621 [IORING_OP_READV] = {
d3656344
JA
622 .async_ctx = 1,
623 .needs_mm = 1,
624 .needs_file = 1,
625 .unbound_nonreg_file = 1,
626 },
0463b6c5 627 [IORING_OP_WRITEV] = {
d3656344
JA
628 .async_ctx = 1,
629 .needs_mm = 1,
630 .needs_file = 1,
631 .hash_reg_file = 1,
632 .unbound_nonreg_file = 1,
633 },
0463b6c5 634 [IORING_OP_FSYNC] = {
d3656344
JA
635 .needs_file = 1,
636 },
0463b6c5 637 [IORING_OP_READ_FIXED] = {
d3656344
JA
638 .needs_file = 1,
639 .unbound_nonreg_file = 1,
640 },
0463b6c5 641 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
642 .needs_file = 1,
643 .hash_reg_file = 1,
644 .unbound_nonreg_file = 1,
645 },
0463b6c5 646 [IORING_OP_POLL_ADD] = {
d3656344
JA
647 .needs_file = 1,
648 .unbound_nonreg_file = 1,
649 },
0463b6c5
PB
650 [IORING_OP_POLL_REMOVE] = {},
651 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
652 .needs_file = 1,
653 },
0463b6c5 654 [IORING_OP_SENDMSG] = {
d3656344
JA
655 .async_ctx = 1,
656 .needs_mm = 1,
657 .needs_file = 1,
658 .unbound_nonreg_file = 1,
ff002b30 659 .needs_fs = 1,
d3656344 660 },
0463b6c5 661 [IORING_OP_RECVMSG] = {
d3656344
JA
662 .async_ctx = 1,
663 .needs_mm = 1,
664 .needs_file = 1,
665 .unbound_nonreg_file = 1,
ff002b30 666 .needs_fs = 1,
d3656344 667 },
0463b6c5 668 [IORING_OP_TIMEOUT] = {
d3656344
JA
669 .async_ctx = 1,
670 .needs_mm = 1,
671 },
0463b6c5
PB
672 [IORING_OP_TIMEOUT_REMOVE] = {},
673 [IORING_OP_ACCEPT] = {
d3656344
JA
674 .needs_mm = 1,
675 .needs_file = 1,
676 .unbound_nonreg_file = 1,
f86cd20c 677 .file_table = 1,
d3656344 678 },
0463b6c5
PB
679 [IORING_OP_ASYNC_CANCEL] = {},
680 [IORING_OP_LINK_TIMEOUT] = {
d3656344
JA
681 .async_ctx = 1,
682 .needs_mm = 1,
683 },
0463b6c5 684 [IORING_OP_CONNECT] = {
d3656344
JA
685 .async_ctx = 1,
686 .needs_mm = 1,
687 .needs_file = 1,
688 .unbound_nonreg_file = 1,
689 },
0463b6c5 690 [IORING_OP_FALLOCATE] = {
d3656344
JA
691 .needs_file = 1,
692 },
0463b6c5 693 [IORING_OP_OPENAT] = {
d3656344
JA
694 .needs_file = 1,
695 .fd_non_neg = 1,
f86cd20c 696 .file_table = 1,
ff002b30 697 .needs_fs = 1,
d3656344 698 },
0463b6c5 699 [IORING_OP_CLOSE] = {
d3656344 700 .needs_file = 1,
f86cd20c 701 .file_table = 1,
d3656344 702 },
0463b6c5 703 [IORING_OP_FILES_UPDATE] = {
d3656344 704 .needs_mm = 1,
f86cd20c 705 .file_table = 1,
d3656344 706 },
0463b6c5 707 [IORING_OP_STATX] = {
d3656344
JA
708 .needs_mm = 1,
709 .needs_file = 1,
710 .fd_non_neg = 1,
ff002b30 711 .needs_fs = 1,
d3656344 712 },
0463b6c5 713 [IORING_OP_READ] = {
3a6820f2
JA
714 .needs_mm = 1,
715 .needs_file = 1,
716 .unbound_nonreg_file = 1,
717 },
0463b6c5 718 [IORING_OP_WRITE] = {
3a6820f2
JA
719 .needs_mm = 1,
720 .needs_file = 1,
721 .unbound_nonreg_file = 1,
722 },
0463b6c5 723 [IORING_OP_FADVISE] = {
4840e418
JA
724 .needs_file = 1,
725 },
0463b6c5 726 [IORING_OP_MADVISE] = {
c1ca757b
JA
727 .needs_mm = 1,
728 },
0463b6c5 729 [IORING_OP_SEND] = {
fddaface
JA
730 .needs_mm = 1,
731 .needs_file = 1,
732 .unbound_nonreg_file = 1,
733 },
0463b6c5 734 [IORING_OP_RECV] = {
fddaface
JA
735 .needs_mm = 1,
736 .needs_file = 1,
737 .unbound_nonreg_file = 1,
738 },
0463b6c5 739 [IORING_OP_OPENAT2] = {
cebdb986
JA
740 .needs_file = 1,
741 .fd_non_neg = 1,
f86cd20c 742 .file_table = 1,
ff002b30 743 .needs_fs = 1,
cebdb986 744 },
3e4827b0
JA
745 [IORING_OP_EPOLL_CTL] = {
746 .unbound_nonreg_file = 1,
747 .file_table = 1,
748 },
d3656344
JA
749};
750
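/*
 * A short sketch of how this table is consulted; the async-work setup
 * below (io_prep_async_work()/io_req_work_grab_env()) does essentially:
 *
 *	const struct io_op_def *def = &io_op_defs[req->opcode];
 *
 *	if (def->needs_mm && !req->work.mm) {
 *		mmgrab(current->mm);
 *		req->work.mm = current->mm;
 *	}
 *	if (def->unbound_nonreg_file && !(req->flags & REQ_F_ISREG))
 *		req->work.flags |= IO_WQ_WORK_UNBOUND;
 */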
561fb04a 751static void io_wq_submit_work(struct io_wq_work **workptr);
78e19bbe 752static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 753static void io_put_req(struct io_kiocb *req);
978db57e 754static void __io_double_put_req(struct io_kiocb *req);
94ae5e77
JA
755static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
756static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c
JA
757static int __io_sqe_files_update(struct io_ring_ctx *ctx,
758 struct io_uring_files_update *ip,
759 unsigned nr_args);
f86cd20c 760static int io_grab_files(struct io_kiocb *req);
2faf852d 761static void io_ring_file_ref_flush(struct fixed_file_data *data);
99bc4c38 762static void io_cleanup_req(struct io_kiocb *req);
de0617e4 763
2b188cc1
JA
764static struct kmem_cache *req_cachep;
765
766static const struct file_operations io_uring_fops;
767
768struct sock *io_uring_get_socket(struct file *file)
769{
770#if defined(CONFIG_UNIX)
771 if (file->f_op == &io_uring_fops) {
772 struct io_ring_ctx *ctx = file->private_data;
773
774 return ctx->ring_sock->sk;
775 }
776#endif
777 return NULL;
778}
779EXPORT_SYMBOL(io_uring_get_socket);
780
781static void io_ring_ctx_ref_free(struct percpu_ref *ref)
782{
783 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
784
206aefde 785 complete(&ctx->completions[0]);
2b188cc1
JA
786}
787
788static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
789{
790 struct io_ring_ctx *ctx;
78076bb6 791 int hash_bits;
2b188cc1
JA
792
793 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
794 if (!ctx)
795 return NULL;
796
0ddf92e8
JA
797 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
798 if (!ctx->fallback_req)
799 goto err;
800
206aefde
JA
801 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
802 if (!ctx->completions)
803 goto err;
804
78076bb6
JA
805 /*
806 * Use 5 bits less than the max cq entries, that should give us around
807 * 32 entries per hash list if totally full and uniformly spread.
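 * E.g. with p->cq_entries == 4096: ilog2(4096) == 12, minus 5 gives 7 bits,
 * i.e. 128 hash buckets and 4096 / 128 == 32 entries per bucket.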
808 */
809 hash_bits = ilog2(p->cq_entries);
810 hash_bits -= 5;
811 if (hash_bits <= 0)
812 hash_bits = 1;
813 ctx->cancel_hash_bits = hash_bits;
814 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
815 GFP_KERNEL);
816 if (!ctx->cancel_hash)
817 goto err;
818 __hash_init(ctx->cancel_hash, 1U << hash_bits);
819
21482896 820 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
821 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
822 goto err;
2b188cc1
JA
823
824 ctx->flags = p->flags;
825 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 826 INIT_LIST_HEAD(&ctx->cq_overflow_list);
206aefde
JA
827 init_completion(&ctx->completions[0]);
828 init_completion(&ctx->completions[1]);
071698e1 829 idr_init(&ctx->personality_idr);
2b188cc1
JA
830 mutex_init(&ctx->uring_lock);
831 init_waitqueue_head(&ctx->wait);
832 spin_lock_init(&ctx->completion_lock);
e94f141b 833 init_llist_head(&ctx->poll_llist);
def596e9 834 INIT_LIST_HEAD(&ctx->poll_list);
de0617e4 835 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 836 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
837 init_waitqueue_head(&ctx->inflight_wait);
838 spin_lock_init(&ctx->inflight_lock);
839 INIT_LIST_HEAD(&ctx->inflight_list);
2b188cc1 840 return ctx;
206aefde 841err:
0ddf92e8
JA
842 if (ctx->fallback_req)
843 kmem_cache_free(req_cachep, ctx->fallback_req);
206aefde 844 kfree(ctx->completions);
78076bb6 845 kfree(ctx->cancel_hash);
206aefde
JA
846 kfree(ctx);
847 return NULL;
2b188cc1
JA
848}
849
9d858b21 850static inline bool __req_need_defer(struct io_kiocb *req)
7adf4eaf 851{
a197f664
JL
852 struct io_ring_ctx *ctx = req->ctx;
853
498ccd9e
JA
854 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
855 + atomic_read(&ctx->cached_cq_overflow);
7adf4eaf
JA
856}
857
9d858b21 858static inline bool req_need_defer(struct io_kiocb *req)
de0617e4 859{
87987898 860 if (unlikely(req->flags & REQ_F_IO_DRAIN))
9d858b21 861 return __req_need_defer(req);
de0617e4 862
9d858b21 863 return false;
de0617e4
JA
864}
865
7adf4eaf 866static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
de0617e4
JA
867{
868 struct io_kiocb *req;
869
7adf4eaf 870 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
9d858b21 871 if (req && !req_need_defer(req)) {
de0617e4
JA
872 list_del_init(&req->list);
873 return req;
874 }
875
876 return NULL;
877}
878
5262f567
JA
879static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
880{
7adf4eaf
JA
881 struct io_kiocb *req;
882
883 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
93bd25bb
JA
884 if (req) {
885 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
886 return NULL;
fb4b3d3f 887 if (!__req_need_defer(req)) {
93bd25bb
JA
888 list_del_init(&req->list);
889 return req;
890 }
7adf4eaf
JA
891 }
892
893 return NULL;
5262f567
JA
894}
895
de0617e4 896static void __io_commit_cqring(struct io_ring_ctx *ctx)
2b188cc1 897{
75b28aff 898 struct io_rings *rings = ctx->rings;
2b188cc1 899
07910158
PB
900 /* order cqe stores with ring update */
901 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
2b188cc1 902
07910158
PB
903 if (wq_has_sleeper(&ctx->cq_wait)) {
904 wake_up_interruptible(&ctx->cq_wait);
905 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
2b188cc1
JA
906 }
907}
908
cccf0ee8
JA
909static inline void io_req_work_grab_env(struct io_kiocb *req,
910 const struct io_op_def *def)
911{
912 if (!req->work.mm && def->needs_mm) {
913 mmgrab(current->mm);
914 req->work.mm = current->mm;
2b188cc1 915 }
cccf0ee8
JA
916 if (!req->work.creds)
917 req->work.creds = get_current_cred();
ff002b30
JA
918 if (!req->work.fs && def->needs_fs) {
919 spin_lock(&current->fs->lock);
920 if (!current->fs->in_exec) {
921 req->work.fs = current->fs;
922 req->work.fs->users++;
923 } else {
924 req->work.flags |= IO_WQ_WORK_CANCEL;
925 }
926 spin_unlock(&current->fs->lock);
927 }
6ab23144
JA
928 if (!req->work.task_pid)
929 req->work.task_pid = task_pid_vnr(current);
2b188cc1
JA
930}
931
cccf0ee8 932static inline void io_req_work_drop_env(struct io_kiocb *req)
18d9be1a 933{
cccf0ee8
JA
934 if (req->work.mm) {
935 mmdrop(req->work.mm);
936 req->work.mm = NULL;
937 }
938 if (req->work.creds) {
939 put_cred(req->work.creds);
940 req->work.creds = NULL;
941 }
ff002b30
JA
942 if (req->work.fs) {
943 struct fs_struct *fs = req->work.fs;
944
945 spin_lock(&req->work.fs->lock);
946 if (--fs->users)
947 fs = NULL;
948 spin_unlock(&req->work.fs->lock);
949 if (fs)
950 free_fs_struct(fs);
951 }
561fb04a
JA
952}
953
94ae5e77
JA
954static inline bool io_prep_async_work(struct io_kiocb *req,
955 struct io_kiocb **link)
18d9be1a 956{
d3656344 957 const struct io_op_def *def = &io_op_defs[req->opcode];
561fb04a 958 bool do_hashed = false;
54a91f3b 959
d3656344
JA
960 if (req->flags & REQ_F_ISREG) {
961 if (def->hash_reg_file)
3529d8c2 962 do_hashed = true;
d3656344
JA
963 } else {
964 if (def->unbound_nonreg_file)
3529d8c2 965 req->work.flags |= IO_WQ_WORK_UNBOUND;
54a91f3b 966 }
cccf0ee8
JA
967
968 io_req_work_grab_env(req, def);
54a91f3b 969
94ae5e77 970 *link = io_prep_linked_timeout(req);
561fb04a
JA
971 return do_hashed;
972}
973
a197f664 974static inline void io_queue_async_work(struct io_kiocb *req)
561fb04a 975{
a197f664 976 struct io_ring_ctx *ctx = req->ctx;
94ae5e77
JA
977 struct io_kiocb *link;
978 bool do_hashed;
979
980 do_hashed = io_prep_async_work(req, &link);
561fb04a
JA
981
982 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
983 req->flags);
984 if (!do_hashed) {
985 io_wq_enqueue(ctx->io_wq, &req->work);
986 } else {
987 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
988 file_inode(req->file));
989 }
94ae5e77
JA
990
991 if (link)
992 io_queue_linked_timeout(link);
18d9be1a
JA
993}
994
5262f567
JA
995static void io_kill_timeout(struct io_kiocb *req)
996{
997 int ret;
998
2d28390a 999 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262f567
JA
1000 if (ret != -1) {
1001 atomic_inc(&req->ctx->cq_timeouts);
842f9612 1002 list_del_init(&req->list);
78e19bbe 1003 io_cqring_fill_event(req, 0);
ec9c02ad 1004 io_put_req(req);
5262f567
JA
1005 }
1006}
1007
1008static void io_kill_timeouts(struct io_ring_ctx *ctx)
1009{
1010 struct io_kiocb *req, *tmp;
1011
1012 spin_lock_irq(&ctx->completion_lock);
1013 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1014 io_kill_timeout(req);
1015 spin_unlock_irq(&ctx->completion_lock);
1016}
1017
de0617e4
JA
1018static void io_commit_cqring(struct io_ring_ctx *ctx)
1019{
1020 struct io_kiocb *req;
1021
5262f567
JA
1022 while ((req = io_get_timeout_req(ctx)) != NULL)
1023 io_kill_timeout(req);
1024
de0617e4
JA
1025 __io_commit_cqring(ctx);
1026
87987898 1027 while ((req = io_get_deferred_req(ctx)) != NULL)
a197f664 1028 io_queue_async_work(req);
de0617e4
JA
1029}
1030
2b188cc1
JA
1031static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1032{
75b28aff 1033 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1034 unsigned tail;
1035
1036 tail = ctx->cached_cq_tail;
115e12e5
SB
1037 /*
1038 * writes to the cq entry need to come after reading head; the
1039 * control dependency is enough as we're using WRITE_ONCE to
1040 * fill the cq entry
1041 */
75b28aff 1042 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
2b188cc1
JA
1043 return NULL;
1044
1045 ctx->cached_cq_tail++;
75b28aff 1046 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1047}
1048
f2842ab5
JA
1049static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1050{
f0b493e6
JA
1051 if (!ctx->cq_ev_fd)
1052 return false;
f2842ab5
JA
1053 if (!ctx->eventfd_async)
1054 return true;
1055 return io_wq_current_is_worker() || in_interrupt();
1056}
1057
f0b493e6 1058static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
1d7bb1d5
JA
1059{
1060 if (waitqueue_active(&ctx->wait))
1061 wake_up(&ctx->wait);
1062 if (waitqueue_active(&ctx->sqo_wait))
1063 wake_up(&ctx->sqo_wait);
f0b493e6 1064 if (trigger_ev)
1d7bb1d5
JA
1065 eventfd_signal(ctx->cq_ev_fd, 1);
1066}
1067
f0b493e6
JA
1068static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1069{
1070 __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
1071}
1072
c4a2ed72
JA
1073/* Returns true if there are no backlogged entries after the flush */
1074static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5
JA
1075{
1076 struct io_rings *rings = ctx->rings;
1077 struct io_uring_cqe *cqe;
1078 struct io_kiocb *req;
1079 unsigned long flags;
1080 LIST_HEAD(list);
1081
1082 if (!force) {
1083 if (list_empty_careful(&ctx->cq_overflow_list))
c4a2ed72 1084 return true;
1d7bb1d5
JA
1085 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1086 rings->cq_ring_entries))
c4a2ed72 1087 return false;
1d7bb1d5
JA
1088 }
1089
1090 spin_lock_irqsave(&ctx->completion_lock, flags);
1091
1092 /* if force is set, the ring is going away. always drop after that */
1093 if (force)
69b3e546 1094 ctx->cq_overflow_flushed = 1;
1d7bb1d5 1095
c4a2ed72 1096 cqe = NULL;
1d7bb1d5
JA
1097 while (!list_empty(&ctx->cq_overflow_list)) {
1098 cqe = io_get_cqring(ctx);
1099 if (!cqe && !force)
1100 break;
1101
1102 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1103 list);
1104 list_move(&req->list, &list);
1105 if (cqe) {
1106 WRITE_ONCE(cqe->user_data, req->user_data);
1107 WRITE_ONCE(cqe->res, req->result);
1108 WRITE_ONCE(cqe->flags, 0);
1109 } else {
1110 WRITE_ONCE(ctx->rings->cq_overflow,
1111 atomic_inc_return(&ctx->cached_cq_overflow));
1112 }
1113 }
1114
1115 io_commit_cqring(ctx);
ad3eb2c8
JA
1116 if (cqe) {
1117 clear_bit(0, &ctx->sq_check_overflow);
1118 clear_bit(0, &ctx->cq_check_overflow);
1119 }
1d7bb1d5
JA
1120 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1121 io_cqring_ev_posted(ctx);
1122
1123 while (!list_empty(&list)) {
1124 req = list_first_entry(&list, struct io_kiocb, list);
1125 list_del(&req->list);
ec9c02ad 1126 io_put_req(req);
1d7bb1d5 1127 }
c4a2ed72
JA
1128
1129 return cqe != NULL;
1d7bb1d5
JA
1130}
1131
78e19bbe 1132static void io_cqring_fill_event(struct io_kiocb *req, long res)
2b188cc1 1133{
78e19bbe 1134 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1135 struct io_uring_cqe *cqe;
1136
78e19bbe 1137 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1138
2b188cc1
JA
1139 /*
1140 * If we can't get a cq entry, userspace overflowed the
1141 * submission (by quite a lot). Increment the overflow count in
1142 * the ring.
1143 */
1144 cqe = io_get_cqring(ctx);
1d7bb1d5 1145 if (likely(cqe)) {
78e19bbe 1146 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1147 WRITE_ONCE(cqe->res, res);
c71ffb67 1148 WRITE_ONCE(cqe->flags, 0);
1d7bb1d5 1149 } else if (ctx->cq_overflow_flushed) {
498ccd9e
JA
1150 WRITE_ONCE(ctx->rings->cq_overflow,
1151 atomic_inc_return(&ctx->cached_cq_overflow));
1d7bb1d5 1152 } else {
ad3eb2c8
JA
1153 if (list_empty(&ctx->cq_overflow_list)) {
1154 set_bit(0, &ctx->sq_check_overflow);
1155 set_bit(0, &ctx->cq_check_overflow);
1156 }
1d7bb1d5
JA
1157 refcount_inc(&req->refs);
1158 req->result = res;
1159 list_add_tail(&req->list, &ctx->cq_overflow_list);
2b188cc1
JA
1160 }
1161}
1162
78e19bbe 1163static void io_cqring_add_event(struct io_kiocb *req, long res)
2b188cc1 1164{
78e19bbe 1165 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1166 unsigned long flags;
1167
1168 spin_lock_irqsave(&ctx->completion_lock, flags);
78e19bbe 1169 io_cqring_fill_event(req, res);
2b188cc1
JA
1170 io_commit_cqring(ctx);
1171 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1172
8c838788 1173 io_cqring_ev_posted(ctx);
2b188cc1
JA
1174}
1175
0ddf92e8
JA
1176static inline bool io_is_fallback_req(struct io_kiocb *req)
1177{
1178 return req == (struct io_kiocb *)
1179 ((unsigned long) req->ctx->fallback_req & ~1UL);
1180}
1181
1182static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1183{
1184 struct io_kiocb *req;
1185
1186 req = ctx->fallback_req;
1187 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1188 return req;
1189
1190 return NULL;
1191}
1192
2579f913
JA
1193static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1194 struct io_submit_state *state)
2b188cc1 1195{
fd6fab2c 1196 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2b188cc1
JA
1197 struct io_kiocb *req;
1198
2579f913 1199 if (!state) {
fd6fab2c 1200 req = kmem_cache_alloc(req_cachep, gfp);
2579f913 1201 if (unlikely(!req))
0ddf92e8 1202 goto fallback;
2579f913
JA
1203 } else if (!state->free_reqs) {
1204 size_t sz;
1205 int ret;
1206
1207 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
fd6fab2c
JA
1208 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1209
1210 /*
1211 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1212 * retry single alloc to be on the safe side.
1213 */
1214 if (unlikely(ret <= 0)) {
1215 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1216 if (!state->reqs[0])
0ddf92e8 1217 goto fallback;
fd6fab2c
JA
1218 ret = 1;
1219 }
2579f913 1220 state->free_reqs = ret - 1;
6c8a3134 1221 req = state->reqs[ret - 1];
2579f913 1222 } else {
2579f913 1223 state->free_reqs--;
6c8a3134 1224 req = state->reqs[state->free_reqs];
2b188cc1
JA
1225 }
1226
0ddf92e8 1227got_it:
1a6b74fc 1228 req->io = NULL;
60c112b0 1229 req->file = NULL;
2579f913
JA
1230 req->ctx = ctx;
1231 req->flags = 0;
e65ef56d
JA
1232 /* one is dropped after submission, the other at completion */
1233 refcount_set(&req->refs, 2);
9e645e11 1234 req->result = 0;
561fb04a 1235 INIT_IO_WORK(&req->work, io_wq_submit_work);
2579f913 1236 return req;
0ddf92e8
JA
1237fallback:
1238 req = io_get_fallback_req(ctx);
1239 if (req)
1240 goto got_it;
6805b32e 1241 percpu_ref_put(&ctx->refs);
2b188cc1
JA
1242 return NULL;
1243}
1244
2b85edfc 1245static void __io_req_do_free(struct io_kiocb *req)
def596e9 1246{
2b85edfc
PB
1247 if (likely(!io_is_fallback_req(req)))
1248 kmem_cache_free(req_cachep, req);
1249 else
1250 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1251}
1252
c6ca97b3 1253static void __io_req_aux_free(struct io_kiocb *req)
2b188cc1 1254{
fcb323cc
JA
1255 struct io_ring_ctx *ctx = req->ctx;
1256
96fd84d8 1257 kfree(req->io);
05f3fb3c
JA
1258 if (req->file) {
1259 if (req->flags & REQ_F_FIXED_FILE)
1260 percpu_ref_put(&ctx->file_data->refs);
1261 else
1262 fput(req->file);
def596e9 1263 }
cccf0ee8
JA
1264
1265 io_req_work_drop_env(req);
def596e9
JA
1266}
1267
9e645e11 1268static void __io_free_req(struct io_kiocb *req)
2b188cc1 1269{
c6ca97b3 1270 __io_req_aux_free(req);
fcb323cc 1271
99bc4c38
PB
1272 if (req->flags & REQ_F_NEED_CLEANUP)
1273 io_cleanup_req(req);
1274
fcb323cc 1275 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3 1276 struct io_ring_ctx *ctx = req->ctx;
fcb323cc
JA
1277 unsigned long flags;
1278
1279 spin_lock_irqsave(&ctx->inflight_lock, flags);
1280 list_del(&req->inflight_entry);
1281 if (waitqueue_active(&ctx->inflight_wait))
1282 wake_up(&ctx->inflight_wait);
1283 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1284 }
2b85edfc
PB
1285
1286 percpu_ref_put(&req->ctx->refs);
1287 __io_req_do_free(req);
e65ef56d
JA
1288}
1289
c6ca97b3
JA
1290struct req_batch {
1291 void *reqs[IO_IOPOLL_BATCH];
1292 int to_free;
1293 int need_iter;
1294};
1295
1296static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1297{
10fef4be
JA
1298 int fixed_refs = rb->to_free;
1299
c6ca97b3
JA
1300 if (!rb->to_free)
1301 return;
1302 if (rb->need_iter) {
1303 int i, inflight = 0;
1304 unsigned long flags;
1305
10fef4be 1306 fixed_refs = 0;
c6ca97b3
JA
1307 for (i = 0; i < rb->to_free; i++) {
1308 struct io_kiocb *req = rb->reqs[i];
1309
10fef4be 1310 if (req->flags & REQ_F_FIXED_FILE) {
c6ca97b3 1311 req->file = NULL;
10fef4be
JA
1312 fixed_refs++;
1313 }
c6ca97b3
JA
1314 if (req->flags & REQ_F_INFLIGHT)
1315 inflight++;
c6ca97b3
JA
1316 __io_req_aux_free(req);
1317 }
1318 if (!inflight)
1319 goto do_free;
1320
1321 spin_lock_irqsave(&ctx->inflight_lock, flags);
1322 for (i = 0; i < rb->to_free; i++) {
1323 struct io_kiocb *req = rb->reqs[i];
1324
10fef4be 1325 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3
JA
1326 list_del(&req->inflight_entry);
1327 if (!--inflight)
1328 break;
1329 }
1330 }
1331 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1332
1333 if (waitqueue_active(&ctx->inflight_wait))
1334 wake_up(&ctx->inflight_wait);
1335 }
1336do_free:
1337 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
10fef4be
JA
1338 if (fixed_refs)
1339 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
c6ca97b3 1340 percpu_ref_put_many(&ctx->refs, rb->to_free);
c6ca97b3 1341 rb->to_free = rb->need_iter = 0;
e65ef56d
JA
1342}
1343
a197f664 1344static bool io_link_cancel_timeout(struct io_kiocb *req)
2665abfd 1345{
a197f664 1346 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1347 int ret;
1348
2d28390a 1349 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2665abfd 1350 if (ret != -1) {
78e19bbe 1351 io_cqring_fill_event(req, -ECANCELED);
2665abfd
JA
1352 io_commit_cqring(ctx);
1353 req->flags &= ~REQ_F_LINK;
ec9c02ad 1354 io_put_req(req);
2665abfd
JA
1355 return true;
1356 }
1357
1358 return false;
e65ef56d
JA
1359}
1360
ba816ad6 1361static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
9e645e11 1362{
2665abfd 1363 struct io_ring_ctx *ctx = req->ctx;
2665abfd 1364 bool wake_ev = false;
9e645e11 1365
4d7dd462
JA
1366 /* Already got next link */
1367 if (req->flags & REQ_F_LINK_NEXT)
1368 return;
1369
9e645e11
JA
1370 /*
 1371 * The list should never be empty when we are called here. But it could
 1372 * potentially happen if the chain is messed up; check to be on the
1373 * safe side.
1374 */
4493233e
PB
1375 while (!list_empty(&req->link_list)) {
1376 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1377 struct io_kiocb, link_list);
94ae5e77 1378
4493233e
PB
1379 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1380 (nxt->flags & REQ_F_TIMEOUT))) {
1381 list_del_init(&nxt->link_list);
94ae5e77 1382 wake_ev |= io_link_cancel_timeout(nxt);
94ae5e77
JA
1383 req->flags &= ~REQ_F_LINK_TIMEOUT;
1384 continue;
1385 }
9e645e11 1386
4493233e
PB
1387 list_del_init(&req->link_list);
1388 if (!list_empty(&nxt->link_list))
1389 nxt->flags |= REQ_F_LINK;
b18fdf71 1390 *nxtptr = nxt;
94ae5e77 1391 break;
9e645e11 1392 }
2665abfd 1393
4d7dd462 1394 req->flags |= REQ_F_LINK_NEXT;
2665abfd
JA
1395 if (wake_ev)
1396 io_cqring_ev_posted(ctx);
9e645e11
JA
1397}
1398
1399/*
1400 * Called if REQ_F_LINK is set, and we fail the head request
1401 */
1402static void io_fail_links(struct io_kiocb *req)
1403{
2665abfd 1404 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1405 unsigned long flags;
1406
1407 spin_lock_irqsave(&ctx->completion_lock, flags);
9e645e11
JA
1408
1409 while (!list_empty(&req->link_list)) {
4493233e
PB
1410 struct io_kiocb *link = list_first_entry(&req->link_list,
1411 struct io_kiocb, link_list);
9e645e11 1412
4493233e 1413 list_del_init(&link->link_list);
c826bd7a 1414 trace_io_uring_fail_link(req, link);
2665abfd
JA
1415
1416 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
d625c6ee 1417 link->opcode == IORING_OP_LINK_TIMEOUT) {
a197f664 1418 io_link_cancel_timeout(link);
2665abfd 1419 } else {
78e19bbe 1420 io_cqring_fill_event(link, -ECANCELED);
978db57e 1421 __io_double_put_req(link);
2665abfd 1422 }
5d960724 1423 req->flags &= ~REQ_F_LINK_TIMEOUT;
9e645e11 1424 }
2665abfd
JA
1425
1426 io_commit_cqring(ctx);
1427 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1428 io_cqring_ev_posted(ctx);
9e645e11
JA
1429}
1430
4d7dd462 1431static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
9e645e11 1432{
4d7dd462 1433 if (likely(!(req->flags & REQ_F_LINK)))
2665abfd 1434 return;
2665abfd 1435
9e645e11
JA
1436 /*
1437 * If LINK is set, we have dependent requests in this chain. If we
1438 * didn't fail this request, queue the first one up, moving any other
1439 * dependencies to the next request. In case of failure, fail the rest
1440 * of the chain.
1441 */
2665abfd
JA
1442 if (req->flags & REQ_F_FAIL_LINK) {
1443 io_fail_links(req);
7c9e7f0f
JA
1444 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1445 REQ_F_LINK_TIMEOUT) {
2665abfd
JA
1446 struct io_ring_ctx *ctx = req->ctx;
1447 unsigned long flags;
1448
1449 /*
1450 * If this is a timeout link, we could be racing with the
1451 * timeout timer. Grab the completion lock for this case to
7c9e7f0f 1452 * protect against that.
2665abfd
JA
1453 */
1454 spin_lock_irqsave(&ctx->completion_lock, flags);
1455 io_req_link_next(req, nxt);
1456 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1457 } else {
1458 io_req_link_next(req, nxt);
9e645e11 1459 }
4d7dd462 1460}
9e645e11 1461
c69f8dbe
JL
1462static void io_free_req(struct io_kiocb *req)
1463{
944e58bf
PB
1464 struct io_kiocb *nxt = NULL;
1465
1466 io_req_find_next(req, &nxt);
70cf9f32 1467 __io_free_req(req);
944e58bf
PB
1468
1469 if (nxt)
1470 io_queue_async_work(nxt);
c69f8dbe
JL
1471}
1472
ba816ad6
JA
1473/*
1474 * Drop reference to request, return next in chain (if there is one) if this
1475 * was the last reference to this request.
1476 */
f9bd67f6 1477__attribute__((nonnull))
ec9c02ad 1478static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
e65ef56d 1479{
f9bd67f6 1480 io_req_find_next(req, nxtptr);
4d7dd462 1481
e65ef56d 1482 if (refcount_dec_and_test(&req->refs))
4d7dd462 1483 __io_free_req(req);
2b188cc1
JA
1484}
1485
e65ef56d
JA
1486static void io_put_req(struct io_kiocb *req)
1487{
1488 if (refcount_dec_and_test(&req->refs))
1489 io_free_req(req);
2b188cc1
JA
1490}
1491
978db57e
JA
1492/*
1493 * Must only be used if we don't need to care about links, usually from
1494 * within the completion handling itself.
1495 */
1496static void __io_double_put_req(struct io_kiocb *req)
78e19bbe
JA
1497{
1498 /* drop both submit and complete references */
1499 if (refcount_sub_and_test(2, &req->refs))
1500 __io_free_req(req);
1501}
1502
978db57e
JA
1503static void io_double_put_req(struct io_kiocb *req)
1504{
1505 /* drop both submit and complete references */
1506 if (refcount_sub_and_test(2, &req->refs))
1507 io_free_req(req);
1508}
1509
1d7bb1d5 1510static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
a3a0e43f 1511{
84f97dc2
JA
1512 struct io_rings *rings = ctx->rings;
1513
ad3eb2c8
JA
1514 if (test_bit(0, &ctx->cq_check_overflow)) {
1515 /*
1516 * noflush == true is from the waitqueue handler, just ensure
1517 * we wake up the task, and the next invocation will flush the
 1518 * entries. We cannot safely do it from here.
1519 */
1520 if (noflush && !list_empty(&ctx->cq_overflow_list))
1521 return -1U;
1d7bb1d5 1522
ad3eb2c8
JA
1523 io_cqring_overflow_flush(ctx, false);
1524 }
1d7bb1d5 1525
a3a0e43f
JA
1526 /* See comment at the top of this file */
1527 smp_rmb();
ad3eb2c8 1528 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
a3a0e43f
JA
1529}
1530
fb5ccc98
PB
1531static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1532{
1533 struct io_rings *rings = ctx->rings;
1534
1535 /* make sure SQ entry isn't read before tail */
1536 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1537}
1538
8237e045 1539static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
e94f141b 1540{
c6ca97b3
JA
1541 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1542 return false;
e94f141b 1543
c6ca97b3
JA
1544 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1545 rb->need_iter++;
1546
1547 rb->reqs[rb->to_free++] = req;
1548 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1549 io_free_req_many(req->ctx, rb);
1550 return true;
e94f141b
JA
1551}
1552
def596e9
JA
1553/*
1554 * Find and free completed poll iocbs
1555 */
1556static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1557 struct list_head *done)
1558{
8237e045 1559 struct req_batch rb;
def596e9 1560 struct io_kiocb *req;
def596e9 1561
c6ca97b3 1562 rb.to_free = rb.need_iter = 0;
def596e9
JA
1563 while (!list_empty(done)) {
1564 req = list_first_entry(done, struct io_kiocb, list);
1565 list_del(&req->list);
1566
78e19bbe 1567 io_cqring_fill_event(req, req->result);
def596e9
JA
1568 (*nr_events)++;
1569
8237e045
JA
1570 if (refcount_dec_and_test(&req->refs) &&
1571 !io_req_multi_free(&rb, req))
1572 io_free_req(req);
def596e9 1573 }
def596e9 1574
09bb8394 1575 io_commit_cqring(ctx);
8237e045 1576 io_free_req_many(ctx, &rb);
def596e9
JA
1577}
1578
1579static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1580 long min)
1581{
1582 struct io_kiocb *req, *tmp;
1583 LIST_HEAD(done);
1584 bool spin;
1585 int ret;
1586
1587 /*
1588 * Only spin for completions if we don't have multiple devices hanging
1589 * off our complete list, and we're under the requested amount.
1590 */
1591 spin = !ctx->poll_multi_file && *nr_events < min;
1592
1593 ret = 0;
1594 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
9adbd45d 1595 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
1596
1597 /*
1598 * Move completed entries to our local list. If we find a
1599 * request that requires polling, break out and complete
1600 * the done list first, if we have entries there.
1601 */
1602 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1603 list_move_tail(&req->list, &done);
1604 continue;
1605 }
1606 if (!list_empty(&done))
1607 break;
1608
1609 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1610 if (ret < 0)
1611 break;
1612
1613 if (ret && spin)
1614 spin = false;
1615 ret = 0;
1616 }
1617
1618 if (!list_empty(&done))
1619 io_iopoll_complete(ctx, nr_events, &done);
1620
1621 return ret;
1622}
1623
1624/*
d195a66e 1625 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
1626 * non-spinning poll check - we'll still enter the driver poll loop, but only
1627 * as a non-spinning completion check.
1628 */
1629static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1630 long min)
1631{
08f5439f 1632 while (!list_empty(&ctx->poll_list) && !need_resched()) {
def596e9
JA
1633 int ret;
1634
1635 ret = io_do_iopoll(ctx, nr_events, min);
1636 if (ret < 0)
1637 return ret;
1638 if (!min || *nr_events >= min)
1639 return 0;
1640 }
1641
1642 return 1;
1643}
1644
1645/*
1646 * We can't just wait for polled events to come to us, we have to actively
1647 * find and complete them.
1648 */
1649static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1650{
1651 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1652 return;
1653
1654 mutex_lock(&ctx->uring_lock);
1655 while (!list_empty(&ctx->poll_list)) {
1656 unsigned int nr_events = 0;
1657
1658 io_iopoll_getevents(ctx, &nr_events, 1);
08f5439f
JA
1659
1660 /*
1661 * Ensure we allow local-to-the-cpu processing to take place,
1662 * in this case we need to ensure that we reap all events.
1663 */
1664 cond_resched();
def596e9
JA
1665 }
1666 mutex_unlock(&ctx->uring_lock);
1667}
1668
2b2ed975
JA
1669static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1670 long min)
def596e9 1671{
2b2ed975 1672 int iters = 0, ret = 0;
500f9fba 1673
def596e9
JA
1674 do {
1675 int tmin = 0;
1676
a3a0e43f
JA
1677 /*
1678 * Don't enter poll loop if we already have events pending.
1679 * If we do, we can potentially be spinning for commands that
1680 * already triggered a CQE (eg in error).
1681 */
1d7bb1d5 1682 if (io_cqring_events(ctx, false))
a3a0e43f
JA
1683 break;
1684
500f9fba
JA
1685 /*
1686 * If a submit got punted to a workqueue, we can have the
1687 * application entering polling for a command before it gets
1688 * issued. That app will hold the uring_lock for the duration
1689 * of the poll right here, so we need to take a breather every
1690 * now and then to ensure that the issue has a chance to add
1691 * the poll to the issued list. Otherwise we can spin here
1692 * forever, while the workqueue is stuck trying to acquire the
1693 * very same mutex.
1694 */
1695 if (!(++iters & 7)) {
1696 mutex_unlock(&ctx->uring_lock);
1697 mutex_lock(&ctx->uring_lock);
1698 }
1699
def596e9
JA
1700 if (*nr_events < min)
1701 tmin = min - *nr_events;
1702
1703 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1704 if (ret <= 0)
1705 break;
1706 ret = 0;
1707 } while (min && !*nr_events && !need_resched());
1708
2b2ed975
JA
1709 return ret;
1710}
1711
1712static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1713 long min)
1714{
1715 int ret;
1716
1717 /*
1718 * We disallow the app entering submit/complete with polling, but we
1719 * still need to lock the ring to prevent racing with polled issue
1720 * that got punted to a workqueue.
1721 */
1722 mutex_lock(&ctx->uring_lock);
1723 ret = __io_iopoll_check(ctx, nr_events, min);
500f9fba 1724 mutex_unlock(&ctx->uring_lock);
def596e9
JA
1725 return ret;
1726}
1727
491381ce 1728static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 1729{
491381ce
JA
1730 /*
1731 * Tell lockdep we inherited freeze protection from submission
1732 * thread.
1733 */
1734 if (req->flags & REQ_F_ISREG) {
1735 struct inode *inode = file_inode(req->file);
2b188cc1 1736
491381ce 1737 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 1738 }
491381ce 1739 file_end_write(req->file);
2b188cc1
JA
1740}
1741
4e88d6e7
JA
1742static inline void req_set_fail_links(struct io_kiocb *req)
1743{
1744 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1745 req->flags |= REQ_F_FAIL_LINK;
1746}
1747
ba816ad6 1748static void io_complete_rw_common(struct kiocb *kiocb, long res)
2b188cc1 1749{
9adbd45d 1750 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2b188cc1 1751
491381ce
JA
1752 if (kiocb->ki_flags & IOCB_WRITE)
1753 kiocb_end_write(req);
2b188cc1 1754
4e88d6e7
JA
1755 if (res != req->result)
1756 req_set_fail_links(req);
78e19bbe 1757 io_cqring_add_event(req, res);
ba816ad6
JA
1758}
1759
1760static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1761{
9adbd45d 1762 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6
JA
1763
1764 io_complete_rw_common(kiocb, res);
e65ef56d 1765 io_put_req(req);
2b188cc1
JA
1766}
1767
ba816ad6
JA
1768static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1769{
9adbd45d 1770 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ec9c02ad 1771 struct io_kiocb *nxt = NULL;
ba816ad6
JA
1772
1773 io_complete_rw_common(kiocb, res);
ec9c02ad
JL
1774 io_put_req_find_next(req, &nxt);
1775
1776 return nxt;
2b188cc1
JA
1777}
1778
def596e9
JA
1779static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1780{
9adbd45d 1781 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 1782
491381ce
JA
1783 if (kiocb->ki_flags & IOCB_WRITE)
1784 kiocb_end_write(req);
def596e9 1785
4e88d6e7
JA
1786 if (res != req->result)
1787 req_set_fail_links(req);
9e645e11 1788 req->result = res;
def596e9
JA
1789 if (res != -EAGAIN)
1790 req->flags |= REQ_F_IOPOLL_COMPLETED;
1791}
1792
1793/*
1794 * After the iocb has been issued, it's safe to be found on the poll list.
1795 * Adding the kiocb to the list AFTER submission ensures that we don't
 1796 * find it from an io_iopoll_getevents() thread before the issuer is done
1797 * accessing the kiocb cookie.
1798 */
1799static void io_iopoll_req_issued(struct io_kiocb *req)
1800{
1801 struct io_ring_ctx *ctx = req->ctx;
1802
1803 /*
1804 * Track whether we have multiple files in our lists. This will impact
1805 * how we do polling eventually, not spinning if we're on potentially
1806 * different devices.
1807 */
1808 if (list_empty(&ctx->poll_list)) {
1809 ctx->poll_multi_file = false;
1810 } else if (!ctx->poll_multi_file) {
1811 struct io_kiocb *list_req;
1812
1813 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1814 list);
9adbd45d 1815 if (list_req->file != req->file)
def596e9
JA
1816 ctx->poll_multi_file = true;
1817 }
1818
1819 /*
1820 * For fast devices, IO may have already completed. If it has, add
1821 * it to the front so we find it first.
1822 */
1823 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1824 list_add(&req->list, &ctx->poll_list);
1825 else
1826 list_add_tail(&req->list, &ctx->poll_list);
1827}
1828
3d6770fb 1829static void io_file_put(struct io_submit_state *state)
9a56a232 1830{
3d6770fb 1831 if (state->file) {
9a56a232
JA
1832 int diff = state->has_refs - state->used_refs;
1833
1834 if (diff)
1835 fput_many(state->file, diff);
1836 state->file = NULL;
1837 }
1838}
1839
1840/*
1841 * Get as many references to a file as we have IOs left in this submission,
1842 * assuming most submissions are for one file, or at least that each file
1843 * has more than one submission.
1844 */
1845static struct file *io_file_get(struct io_submit_state *state, int fd)
1846{
1847 if (!state)
1848 return fget(fd);
1849
1850 if (state->file) {
1851 if (state->fd == fd) {
1852 state->used_refs++;
1853 state->ios_left--;
1854 return state->file;
1855 }
3d6770fb 1856 io_file_put(state);
9a56a232
JA
1857 }
1858 state->file = fget_many(fd, state->ios_left);
1859 if (!state->file)
1860 return NULL;
1861
1862 state->fd = fd;
1863 state->has_refs = state->ios_left;
1864 state->used_refs = 1;
1865 state->ios_left--;
1866 return state->file;
1867}
1868
2b188cc1
JA
1869/*
1870 * If we tracked the file through the SCM inflight mechanism, we could support
1871 * any file. For now, just ensure that anything potentially problematic is done
1872 * inline.
1873 */
1874static bool io_file_supports_async(struct file *file)
1875{
1876 umode_t mode = file_inode(file)->i_mode;
1877
10d59345 1878 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1
JA
1879 return true;
1880 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1881 return true;
1882
1883 return false;
1884}
1885
3529d8c2
JA
1886static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1887 bool force_nonblock)
2b188cc1 1888{
def596e9 1889 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 1890 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
1891 unsigned ioprio;
1892 int ret;
2b188cc1 1893
491381ce
JA
1894 if (S_ISREG(file_inode(req->file)->i_mode))
1895 req->flags |= REQ_F_ISREG;
1896
2b188cc1 1897 kiocb->ki_pos = READ_ONCE(sqe->off);
ba04291e
JA
1898 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1899 req->flags |= REQ_F_CUR_POS;
1900 kiocb->ki_pos = req->file->f_pos;
1901 }
2b188cc1 1902 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
1903 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1904 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1905 if (unlikely(ret))
1906 return ret;
2b188cc1
JA
1907
1908 ioprio = READ_ONCE(sqe->ioprio);
1909 if (ioprio) {
1910 ret = ioprio_check_cap(ioprio);
1911 if (ret)
09bb8394 1912 return ret;
2b188cc1
JA
1913
1914 kiocb->ki_ioprio = ioprio;
1915 } else
1916 kiocb->ki_ioprio = get_current_ioprio();
1917
8449eeda 1918 /* don't allow async punt if RWF_NOWAIT was requested */
491381ce
JA
1919 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1920 (req->file->f_flags & O_NONBLOCK))
8449eeda
SB
1921 req->flags |= REQ_F_NOWAIT;
1922
1923 if (force_nonblock)
2b188cc1 1924 kiocb->ki_flags |= IOCB_NOWAIT;
8449eeda 1925
def596e9 1926 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
1927 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1928 !kiocb->ki_filp->f_op->iopoll)
09bb8394 1929 return -EOPNOTSUPP;
2b188cc1 1930
def596e9
JA
1931 kiocb->ki_flags |= IOCB_HIPRI;
1932 kiocb->ki_complete = io_complete_rw_iopoll;
6873e0bd 1933 req->result = 0;
def596e9 1934 } else {
09bb8394
JA
1935 if (kiocb->ki_flags & IOCB_HIPRI)
1936 return -EINVAL;
def596e9
JA
1937 kiocb->ki_complete = io_complete_rw;
1938 }
9adbd45d 1939
3529d8c2
JA
1940 req->rw.addr = READ_ONCE(sqe->addr);
1941 req->rw.len = READ_ONCE(sqe->len);
9adbd45d
JA
1942 /* we own ->private, reuse it for the buffer index */
1943 req->rw.kiocb.private = (void *) (unsigned long)
3529d8c2 1944 READ_ONCE(sqe->buf_index);
2b188cc1 1945 return 0;
2b188cc1
JA
1946}
1947
1948static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1949{
1950 switch (ret) {
1951 case -EIOCBQUEUED:
1952 break;
1953 case -ERESTARTSYS:
1954 case -ERESTARTNOINTR:
1955 case -ERESTARTNOHAND:
1956 case -ERESTART_RESTARTBLOCK:
1957 /*
1958 * We can't just restart the syscall, since previously
1959 * submitted sqes may already be in progress. Just fail this
1960 * IO with EINTR.
1961 */
1962 ret = -EINTR;
1963 /* fall through */
1964 default:
1965 kiocb->ki_complete(kiocb, ret, 0);
1966 }
1967}
1968
ba816ad6
JA
1969static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1970 bool in_async)
1971{
ba04291e
JA
1972 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1973
1974 if (req->flags & REQ_F_CUR_POS)
1975 req->file->f_pos = kiocb->ki_pos;
f9bd67f6 1976 if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
ba816ad6
JA
1977 *nxt = __io_complete_rw(kiocb, ret);
1978 else
1979 io_rw_done(kiocb, ret);
1980}
1981
9adbd45d 1982static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
7d009165 1983 struct iov_iter *iter)
edafccee 1984{
9adbd45d
JA
1985 struct io_ring_ctx *ctx = req->ctx;
1986 size_t len = req->rw.len;
edafccee
JA
1987 struct io_mapped_ubuf *imu;
1988 unsigned index, buf_index;
1989 size_t offset;
1990 u64 buf_addr;
1991
1992 /* attempt to use fixed buffers without having provided iovecs */
1993 if (unlikely(!ctx->user_bufs))
1994 return -EFAULT;
1995
9adbd45d 1996 buf_index = (unsigned long) req->rw.kiocb.private;
edafccee
JA
1997 if (unlikely(buf_index >= ctx->nr_user_bufs))
1998 return -EFAULT;
1999
2000 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2001 imu = &ctx->user_bufs[index];
9adbd45d 2002 buf_addr = req->rw.addr;
edafccee
JA
2003
2004 /* overflow */
2005 if (buf_addr + len < buf_addr)
2006 return -EFAULT;
2007 /* not inside the mapped region */
2008 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2009 return -EFAULT;
2010
2011 /*
2012 * May not be the start of the buffer, set size appropriately
2013 * and advance us to the beginning.
2014 */
2015 offset = buf_addr - imu->ubuf;
2016 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2017
2018 if (offset) {
2019 /*
2020 * Don't use iov_iter_advance() here, as it's really slow for
2021 * using the latter parts of a big fixed buffer - it iterates
2022 * over each segment manually. We can cheat a bit here, because
2023 * we know that:
2024 *
2025 * 1) it's a BVEC iter, we set it up
2026 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2027 * first and last bvec
2028 *
2029 * So just find our index, and adjust the iterator afterwards.
2030 * If the offset is within the first bvec (or the whole first
2031 * bvec, just use iov_iter_advance(). This makes it easier
2032 * since we can just skip the first segment, which may not
2033 * be PAGE_SIZE aligned.
2034 */
2035 const struct bio_vec *bvec = imu->bvec;
2036
2037 if (offset <= bvec->bv_len) {
2038 iov_iter_advance(iter, offset);
2039 } else {
2040 unsigned long seg_skip;
2041
2042 /* skip first vec */
2043 offset -= bvec->bv_len;
2044 seg_skip = 1 + (offset >> PAGE_SHIFT);
2045
2046 iter->bvec = bvec + seg_skip;
2047 iter->nr_segs -= seg_skip;
99c79f66 2048 iter->count -= bvec->bv_len + offset;
bd11b3a3 2049 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2050 }
2051 }
2052
5e559561 2053 return len;
edafccee
JA
2054}
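
The fixed-buffer import above is only reached for IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED, and only after the application has registered buffers. A hedged liburing sketch (io_uring_register_buffers() and io_uring_prep_read_fixed() are liburing helpers; error handling is trimmed and the helper name is ours):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <liburing.h>

static int read_fixed_example(const char *path)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct iovec iov;
        int fd, ret;

        if (io_uring_queue_init(4, &ring, 0))
                return -1;

        iov.iov_len = 4096;
        iov.iov_base = malloc(iov.iov_len);
        if (!iov.iov_base)
                return -1;
        /* pins the pages and populates ctx->user_bufs on the kernel side */
        if (io_uring_register_buffers(&ring, &iov, 1))
                return -1;

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;

        sqe = io_uring_get_sqe(&ring);
        /* last argument is buf_index: which registered buffer to use */
        io_uring_prep_read_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);
        io_uring_submit(&ring);

        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret)
                ret = cqe->res;         /* bytes read, or -errno */
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return ret;
}
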
2055
cf6fd4bd
PB
2056static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2057 struct iovec **iovec, struct iov_iter *iter)
2b188cc1 2058{
9adbd45d
JA
2059 void __user *buf = u64_to_user_ptr(req->rw.addr);
2060 size_t sqe_len = req->rw.len;
edafccee
JA
2061 u8 opcode;
2062
d625c6ee 2063 opcode = req->opcode;
7d009165 2064 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2065 *iovec = NULL;
9adbd45d 2066 return io_import_fixed(req, rw, iter);
edafccee 2067 }
2b188cc1 2068
9adbd45d
JA
2069 /* buffer index only valid with fixed read/write */
2070 if (req->rw.kiocb.private)
2071 return -EINVAL;
2072
3a6820f2
JA
2073 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2074 ssize_t ret;
2075 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2076 *iovec = NULL;
2077 return ret;
2078 }
2079
f67676d1
JA
2080 if (req->io) {
2081 struct io_async_rw *iorw = &req->io->rw;
2082
2083 *iovec = iorw->iov;
2084 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2085 if (iorw->iov == iorw->fast_iov)
2086 *iovec = NULL;
2087 return iorw->size;
2088 }
2089
2b188cc1 2090#ifdef CONFIG_COMPAT
cf6fd4bd 2091 if (req->ctx->compat)
2b188cc1
JA
2092 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2093 iovec, iter);
2094#endif
2095
2096 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2097}
2098
31b51510 2099/*
32960613
JA
2100 * For files that don't have ->read_iter() and ->write_iter(), handle them
2101 * by looping over ->read() or ->write() manually.
31b51510 2102 */
32960613
JA
2103static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2104 struct iov_iter *iter)
2105{
2106 ssize_t ret = 0;
2107
2108 /*
2109 * Don't support polled IO through this interface, and we can't
2110 * support non-blocking either. For the latter, this just causes
2111 * the kiocb to be handled from an async context.
2112 */
2113 if (kiocb->ki_flags & IOCB_HIPRI)
2114 return -EOPNOTSUPP;
2115 if (kiocb->ki_flags & IOCB_NOWAIT)
2116 return -EAGAIN;
2117
2118 while (iov_iter_count(iter)) {
311ae9e1 2119 struct iovec iovec;
32960613
JA
2120 ssize_t nr;
2121
311ae9e1
PB
2122 if (!iov_iter_is_bvec(iter)) {
2123 iovec = iov_iter_iovec(iter);
2124 } else {
2125 /* fixed buffers import bvec */
2126 iovec.iov_base = kmap(iter->bvec->bv_page)
2127 + iter->iov_offset;
2128 iovec.iov_len = min(iter->count,
2129 iter->bvec->bv_len - iter->iov_offset);
2130 }
2131
32960613
JA
2132 if (rw == READ) {
2133 nr = file->f_op->read(file, iovec.iov_base,
2134 iovec.iov_len, &kiocb->ki_pos);
2135 } else {
2136 nr = file->f_op->write(file, iovec.iov_base,
2137 iovec.iov_len, &kiocb->ki_pos);
2138 }
2139
311ae9e1
PB
2140 if (iov_iter_is_bvec(iter))
2141 kunmap(iter->bvec->bv_page);
2142
32960613
JA
2143 if (nr < 0) {
2144 if (!ret)
2145 ret = nr;
2146 break;
2147 }
2148 ret += nr;
2149 if (nr != iovec.iov_len)
2150 break;
2151 iov_iter_advance(iter, nr);
2152 }
2153
2154 return ret;
2155}
2156
b7bb4f7d 2157static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
f67676d1
JA
2158 struct iovec *iovec, struct iovec *fast_iov,
2159 struct iov_iter *iter)
2160{
2161 req->io->rw.nr_segs = iter->nr_segs;
2162 req->io->rw.size = io_size;
2163 req->io->rw.iov = iovec;
2164 if (!req->io->rw.iov) {
2165 req->io->rw.iov = req->io->rw.fast_iov;
2166 memcpy(req->io->rw.iov, fast_iov,
2167 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
2168 } else {
2169 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
2170 }
2171}
2172
b7bb4f7d 2173static int io_alloc_async_ctx(struct io_kiocb *req)
f67676d1 2174{
d3656344
JA
2175 if (!io_op_defs[req->opcode].async_ctx)
2176 return 0;
f67676d1 2177 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
06b76d44 2178 return req->io == NULL;
b7bb4f7d
JA
2179}
2180
b7bb4f7d
JA
2181static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2182 struct iovec *iovec, struct iovec *fast_iov,
2183 struct iov_iter *iter)
2184{
980ad263 2185 if (!io_op_defs[req->opcode].async_ctx)
74566df3 2186 return 0;
5d204bcf
JA
2187 if (!req->io) {
2188 if (io_alloc_async_ctx(req))
2189 return -ENOMEM;
b7bb4f7d 2190
5d204bcf
JA
2191 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2192 }
b7bb4f7d 2193 return 0;
f67676d1
JA
2194}
2195
3529d8c2
JA
2196static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2197 bool force_nonblock)
f67676d1 2198{
3529d8c2
JA
2199 struct io_async_ctx *io;
2200 struct iov_iter iter;
f67676d1
JA
2201 ssize_t ret;
2202
3529d8c2
JA
2203 ret = io_prep_rw(req, sqe, force_nonblock);
2204 if (ret)
2205 return ret;
f67676d1 2206
3529d8c2
JA
2207 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2208 return -EBADF;
f67676d1 2209
5f798bea
PB
2210 /* either don't need iovec imported or already have it */
2211 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2212 return 0;
2213
2214 io = req->io;
2215 io->rw.iov = io->rw.fast_iov;
2216 req->io = NULL;
2217 ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2218 req->io = io;
2219 if (ret < 0)
2220 return ret;
2221
2222 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2223 return 0;
f67676d1
JA
2224}
2225
267bc904 2226static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2227 bool force_nonblock)
2b188cc1
JA
2228{
2229 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2230 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2231 struct iov_iter iter;
31b51510 2232 size_t iov_count;
f67676d1 2233 ssize_t io_size, ret;
2b188cc1 2234
3529d8c2 2235 ret = io_import_iovec(READ, req, &iovec, &iter);
06b76d44
JA
2236 if (ret < 0)
2237 return ret;
2b188cc1 2238
fd6c2e4c
JA
2239 /* Ensure we clear previously set non-block flag */
2240 if (!force_nonblock)
9adbd45d 2241 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2242
797f3f53 2243 req->result = 0;
f67676d1 2244 io_size = ret;
9e645e11 2245 if (req->flags & REQ_F_LINK)
f67676d1
JA
2246 req->result = io_size;
2247
2248 /*
2249 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2250 * we know to async punt it even if it was opened O_NONBLOCK
2251 */
9adbd45d 2252 if (force_nonblock && !io_file_supports_async(req->file)) {
f67676d1
JA
2253 req->flags |= REQ_F_MUST_PUNT;
2254 goto copy_iov;
2255 }
9e645e11 2256
31b51510 2257 iov_count = iov_iter_count(&iter);
9adbd45d 2258 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2b188cc1
JA
2259 if (!ret) {
2260 ssize_t ret2;
2261
9adbd45d
JA
2262 if (req->file->f_op->read_iter)
2263 ret2 = call_read_iter(req->file, kiocb, &iter);
32960613 2264 else
9adbd45d 2265 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
32960613 2266
9d93a3f5 2267 /* Catch -EAGAIN return for forced non-blocking submission */
f67676d1 2268 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2269 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2270 } else {
2271copy_iov:
b7bb4f7d 2272 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2273 inline_vecs, &iter);
2274 if (ret)
2275 goto out_free;
2276 return -EAGAIN;
2277 }
2b188cc1 2278 }
f67676d1 2279out_free:
1e95081c 2280 kfree(iovec);
99bc4c38 2281 req->flags &= ~REQ_F_NEED_CLEANUP;
2b188cc1
JA
2282 return ret;
2283}
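
From userspace, everything io_read() does above, including the copy_iov/-EAGAIN punt to the async workers, is invisible: the application submits a READV and waits for the CQE. A small sketch, again assuming liburing (io_uring_prep_readv() maps to IORING_OP_READV; the wrapper name is ours):

#include <errno.h>
#include <sys/uio.h>
#include <liburing.h>

/* Submit one vectored read and return cqe->res (bytes read or -errno). */
static int submit_readv(struct io_uring *ring, int fd, struct iovec *iov,
                        int nr_vecs, off_t offset)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        sqe = io_uring_get_sqe(ring);
        if (!sqe)
                return -EBUSY;
        io_uring_prep_readv(sqe, fd, iov, nr_vecs, offset);
        /* user_data is echoed back in the CQE so completions can be matched */
        io_uring_sqe_set_data(sqe, iov);

        ret = io_uring_submit(ring);
        if (ret < 0)
                return ret;
        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
                return ret;
        ret = cqe->res;
        io_uring_cqe_seen(ring, cqe);
        return ret;
}
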
2284
3529d8c2
JA
2285static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2286 bool force_nonblock)
f67676d1 2287{
3529d8c2
JA
2288 struct io_async_ctx *io;
2289 struct iov_iter iter;
f67676d1
JA
2290 ssize_t ret;
2291
3529d8c2
JA
2292 ret = io_prep_rw(req, sqe, force_nonblock);
2293 if (ret)
2294 return ret;
f67676d1 2295
3529d8c2
JA
2296 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2297 return -EBADF;
f67676d1 2298
5f798bea
PB
2299 /* either don't need iovec imported or already have it */
2300 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2301 return 0;
2302
2303 io = req->io;
2304 io->rw.iov = io->rw.fast_iov;
2305 req->io = NULL;
2306 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2307 req->io = io;
2308 if (ret < 0)
2309 return ret;
2310
2311 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2312 return 0;
f67676d1
JA
2313}
2314
267bc904 2315static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2316 bool force_nonblock)
2b188cc1
JA
2317{
2318 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2319 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2320 struct iov_iter iter;
31b51510 2321 size_t iov_count;
f67676d1 2322 ssize_t ret, io_size;
2b188cc1 2323
3529d8c2 2324 ret = io_import_iovec(WRITE, req, &iovec, &iter);
06b76d44
JA
2325 if (ret < 0)
2326 return ret;
2b188cc1 2327
fd6c2e4c
JA
2328 /* Ensure we clear previously set non-block flag */
2329 if (!force_nonblock)
9adbd45d 2330 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2331
797f3f53 2332 req->result = 0;
f67676d1 2333 io_size = ret;
9e645e11 2334 if (req->flags & REQ_F_LINK)
f67676d1 2335 req->result = io_size;
9e645e11 2336
f67676d1
JA
2337 /*
2338 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2339 * we know to async punt it even if it was opened O_NONBLOCK
2340 */
2341 if (force_nonblock && !io_file_supports_async(req->file)) {
2342 req->flags |= REQ_F_MUST_PUNT;
2343 goto copy_iov;
2344 }
31b51510 2345
10d59345
JA
2346 /* file path doesn't support NOWAIT for non-direct IO */
2347 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2348 (req->flags & REQ_F_ISREG))
f67676d1 2349 goto copy_iov;
31b51510 2350
f67676d1 2351 iov_count = iov_iter_count(&iter);
9adbd45d 2352 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2b188cc1 2353 if (!ret) {
9bf7933f
RP
2354 ssize_t ret2;
2355
2b188cc1
JA
2356 /*
2357 * Open-code file_start_write here to grab freeze protection,
2358 * which will be released by another thread in
2359 * io_complete_rw(). Fool lockdep by telling it the lock got
2360 * released so that it doesn't complain about the held lock when
2361 * we return to userspace.
2362 */
491381ce 2363 if (req->flags & REQ_F_ISREG) {
9adbd45d 2364 __sb_start_write(file_inode(req->file)->i_sb,
2b188cc1 2365 SB_FREEZE_WRITE, true);
9adbd45d 2366 __sb_writers_release(file_inode(req->file)->i_sb,
2b188cc1
JA
2367 SB_FREEZE_WRITE);
2368 }
2369 kiocb->ki_flags |= IOCB_WRITE;
9bf7933f 2370
9adbd45d
JA
2371 if (req->file->f_op->write_iter)
2372 ret2 = call_write_iter(req->file, kiocb, &iter);
32960613 2373 else
9adbd45d 2374 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
faac996c
JA
2375 /*
2376 * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
2377 * retry them without IOCB_NOWAIT.
2378 */
2379 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2380 ret2 = -EAGAIN;
f67676d1 2381 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2382 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2383 } else {
2384copy_iov:
b7bb4f7d 2385 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2386 inline_vecs, &iter);
2387 if (ret)
2388 goto out_free;
2389 return -EAGAIN;
2390 }
2b188cc1 2391 }
31b51510 2392out_free:
99bc4c38 2393 req->flags &= ~REQ_F_NEED_CLEANUP;
1e95081c 2394 kfree(iovec);
2b188cc1
JA
2395 return ret;
2396}
2397
2398/*
2399 * IORING_OP_NOP just posts a completion event, nothing else.
2400 */
78e19bbe 2401static int io_nop(struct io_kiocb *req)
2b188cc1
JA
2402{
2403 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2404
def596e9
JA
2405 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2406 return -EINVAL;
2407
78e19bbe 2408 io_cqring_add_event(req, 0);
e65ef56d 2409 io_put_req(req);
2b188cc1
JA
2410 return 0;
2411}
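
IORING_OP_NOP makes a convenient smoke test that the rings are wired up, since io_nop() does nothing but post a CQE with res == 0. A minimal sketch, assuming liburing:

#include <stdio.h>
#include <liburing.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(2, &ring, 0))
                return 1;
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe)) {
                /* io_nop() always completes with res == 0 */
                printf("nop res=%d\n", cqe->res);
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
}
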
2412
3529d8c2 2413static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 2414{
6b06314c 2415 struct io_ring_ctx *ctx = req->ctx;
c992fe29 2416
09bb8394
JA
2417 if (!req->file)
2418 return -EBADF;
c992fe29 2419
6b06314c 2420 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 2421 return -EINVAL;
edafccee 2422 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
2423 return -EINVAL;
2424
8ed8d3c3
JA
2425 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2426 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2427 return -EINVAL;
2428
2429 req->sync.off = READ_ONCE(sqe->off);
2430 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
2431 return 0;
2432}
2433
8ed8d3c3
JA
2434static bool io_req_cancelled(struct io_kiocb *req)
2435{
2436 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2437 req_set_fail_links(req);
2438 io_cqring_add_event(req, -ECANCELED);
2439 io_put_req(req);
2440 return true;
2441 }
2442
2443 return false;
2444}
2445
78912934
JA
2446static void io_link_work_cb(struct io_wq_work **workptr)
2447{
2448 struct io_wq_work *work = *workptr;
2449 struct io_kiocb *link = work->data;
2450
2451 io_queue_linked_timeout(link);
2452 work->func = io_wq_submit_work;
2453}
2454
2455static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
2456{
2457 struct io_kiocb *link;
2458
2459 io_prep_async_work(nxt, &link);
2460 *workptr = &nxt->work;
2461 if (link) {
2462 nxt->work.flags |= IO_WQ_WORK_CB;
2463 nxt->work.func = io_link_work_cb;
2464 nxt->work.data = link;
2465 }
2466}
2467
8ed8d3c3
JA
2468static void io_fsync_finish(struct io_wq_work **workptr)
2469{
2470 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2471 loff_t end = req->sync.off + req->sync.len;
2472 struct io_kiocb *nxt = NULL;
2473 int ret;
2474
2475 if (io_req_cancelled(req))
2476 return;
2477
9adbd45d 2478 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
2479 end > 0 ? end : LLONG_MAX,
2480 req->sync.flags & IORING_FSYNC_DATASYNC);
2481 if (ret < 0)
2482 req_set_fail_links(req);
2483 io_cqring_add_event(req, ret);
2484 io_put_req_find_next(req, &nxt);
2485 if (nxt)
78912934 2486 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
2487}
2488
fc4df999
JA
2489static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2490 bool force_nonblock)
c992fe29 2491{
8ed8d3c3 2492 struct io_wq_work *work, *old_work;
c992fe29
CH
2493
2494 /* fsync always requires a blocking context */
8ed8d3c3
JA
2495 if (force_nonblock) {
2496 io_put_req(req);
2497 req->work.func = io_fsync_finish;
c992fe29 2498 return -EAGAIN;
8ed8d3c3 2499 }
c992fe29 2500
8ed8d3c3
JA
2501 work = old_work = &req->work;
2502 io_fsync_finish(&work);
2503 if (work && work != old_work)
2504 *nxt = container_of(work, struct io_kiocb, work);
c992fe29
CH
2505 return 0;
2506}
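
Since fsync always needs a blocking context anyway, it pairs naturally with request linking: queue a write, link an fsync behind it, and the req_set_fail_links() machinery above fails the fsync with -ECANCELED if the write errors or comes up short. A hedged liburing sketch (io_uring_prep_write() and io_uring_prep_fsync() are liburing helpers; the wrapper name and parameters are ours):

#include <liburing.h>

/* Queue "write then fsync" as a linked chain; if the write fails or is
 * short, the linked fsync is completed with -ECANCELED instead of running. */
static int write_and_fsync(struct io_uring *ring, int fd,
                           const void *buf, unsigned len, off_t off)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, fd, buf, len, off);
        sqe->flags |= IOSQE_IO_LINK;    /* next sqe runs only on success */

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, 0);

        return io_uring_submit(ring);   /* two CQEs will be posted */
}
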
2507
d63d1b5e 2508static void io_fallocate_finish(struct io_wq_work **workptr)
8ed8d3c3
JA
2509{
2510 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2511 struct io_kiocb *nxt = NULL;
2512 int ret;
2513
d63d1b5e
JA
2514 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2515 req->sync.len);
8ed8d3c3
JA
2516 if (ret < 0)
2517 req_set_fail_links(req);
2518 io_cqring_add_event(req, ret);
2519 io_put_req_find_next(req, &nxt);
2520 if (nxt)
78912934 2521 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
2522}
2523
d63d1b5e
JA
2524static int io_fallocate_prep(struct io_kiocb *req,
2525 const struct io_uring_sqe *sqe)
2526{
2527 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2528 return -EINVAL;
2529
2530 req->sync.off = READ_ONCE(sqe->off);
2531 req->sync.len = READ_ONCE(sqe->addr);
2532 req->sync.mode = READ_ONCE(sqe->len);
2533 return 0;
2534}
2535
2536static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
2537 bool force_nonblock)
5d17b4a4 2538{
8ed8d3c3 2539 struct io_wq_work *work, *old_work;
5d17b4a4 2540
d63d1b5e 2541 /* fallocate always requires a blocking context */
8ed8d3c3
JA
2542 if (force_nonblock) {
2543 io_put_req(req);
d63d1b5e 2544 req->work.func = io_fallocate_finish;
5d17b4a4 2545 return -EAGAIN;
8ed8d3c3 2546 }
5d17b4a4 2547
8ed8d3c3 2548 work = old_work = &req->work;
d63d1b5e 2549 io_fallocate_finish(&work);
8ed8d3c3
JA
2550 if (work && work != old_work)
2551 *nxt = container_of(work, struct io_kiocb, work);
d63d1b5e 2552
5d17b4a4
JA
2553 return 0;
2554}
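
Note the SQE field mapping io_fallocate_prep() decodes: the length travels in sqe->addr and the mode in sqe->len, with the offset in sqe->off. A raw-SQE sketch, purely to illustrate that mapping (a real application would normally use a liburing prep helper instead; the function name here is ours):

#include <string.h>
#include <sys/types.h>
#include <linux/io_uring.h>

/* Illustrative only: mirror io_fallocate_prep()'s field mapping by hand. */
static void prep_fallocate_sqe(struct io_uring_sqe *sqe, int fd, int mode,
                               off_t offset, off_t len)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FALLOCATE;
        sqe->fd = fd;
        sqe->off = offset;      /* read back as req->sync.off */
        sqe->addr = len;        /* read back as req->sync.len */
        sqe->len = mode;        /* read back as req->sync.mode */
}
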
2555
15b71abe 2556static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 2557{
f8748881 2558 const char __user *fname;
15b71abe 2559 int ret;
b7bb4f7d 2560
15b71abe
JA
2561 if (sqe->ioprio || sqe->buf_index)
2562 return -EINVAL;
cf3040ca
JA
2563 if (sqe->flags & IOSQE_FIXED_FILE)
2564 return -EBADF;
0bdbdd08
PB
2565 if (req->flags & REQ_F_NEED_CLEANUP)
2566 return 0;
03b1230c 2567
15b71abe 2568 req->open.dfd = READ_ONCE(sqe->fd);
c12cedf2 2569 req->open.how.mode = READ_ONCE(sqe->len);
f8748881 2570 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
c12cedf2 2571 req->open.how.flags = READ_ONCE(sqe->open_flags);
3529d8c2 2572
f8748881 2573 req->open.filename = getname(fname);
15b71abe
JA
2574 if (IS_ERR(req->open.filename)) {
2575 ret = PTR_ERR(req->open.filename);
2576 req->open.filename = NULL;
2577 return ret;
2578 }
3529d8c2 2579
8fef80bf 2580 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 2581 return 0;
03b1230c
JA
2582}
2583
cebdb986 2584static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 2585{
cebdb986
JA
2586 struct open_how __user *how;
2587 const char __user *fname;
2588 size_t len;
0fa03c62
JA
2589 int ret;
2590
cebdb986 2591 if (sqe->ioprio || sqe->buf_index)
0fa03c62 2592 return -EINVAL;
cf3040ca
JA
2593 if (sqe->flags & IOSQE_FIXED_FILE)
2594 return -EBADF;
0bdbdd08
PB
2595 if (req->flags & REQ_F_NEED_CLEANUP)
2596 return 0;
0fa03c62 2597
cebdb986
JA
2598 req->open.dfd = READ_ONCE(sqe->fd);
2599 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2600 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2601 len = READ_ONCE(sqe->len);
0fa03c62 2602
cebdb986
JA
2603 if (len < OPEN_HOW_SIZE_VER0)
2604 return -EINVAL;
3529d8c2 2605
cebdb986
JA
2606 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2607 len);
2608 if (ret)
2609 return ret;
3529d8c2 2610
cebdb986
JA
2611 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2612 req->open.how.flags |= O_LARGEFILE;
0fa03c62 2613
cebdb986
JA
2614 req->open.filename = getname(fname);
2615 if (IS_ERR(req->open.filename)) {
2616 ret = PTR_ERR(req->open.filename);
2617 req->open.filename = NULL;
2618 return ret;
2619 }
2620
8fef80bf 2621 req->flags |= REQ_F_NEED_CLEANUP;
cebdb986
JA
2622 return 0;
2623}
2624
2625static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
2626 bool force_nonblock)
15b71abe
JA
2627{
2628 struct open_flags op;
15b71abe
JA
2629 struct file *file;
2630 int ret;
2631
f86cd20c 2632 if (force_nonblock)
15b71abe 2633 return -EAGAIN;
15b71abe 2634
cebdb986 2635 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
2636 if (ret)
2637 goto err;
2638
cebdb986 2639 ret = get_unused_fd_flags(req->open.how.flags);
15b71abe
JA
2640 if (ret < 0)
2641 goto err;
2642
2643 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2644 if (IS_ERR(file)) {
2645 put_unused_fd(ret);
2646 ret = PTR_ERR(file);
2647 } else {
2648 fsnotify_open(file);
2649 fd_install(ret, file);
2650 }
2651err:
2652 putname(req->open.filename);
8fef80bf 2653 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe
JA
2654 if (ret < 0)
2655 req_set_fail_links(req);
2656 io_cqring_add_event(req, ret);
2657 io_put_req_find_next(req, nxt);
2658 return 0;
2659}
2660
cebdb986
JA
2661static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
2662 bool force_nonblock)
2663{
2664 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
2665 return io_openat2(req, nxt, force_nonblock);
2666}
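
Opens are always punted to the async context (the force_nonblock case above just returns -EAGAIN), but the submitter never sees that; the CQE carries the new file descriptor or a negative errno. A sketch assuming liburing's io_uring_prep_openat(); the wrapper name and path handling are ours:

#include <fcntl.h>
#include <liburing.h>

/* Open a file via IORING_OP_OPENAT; returns the new fd from cqe->res. */
static int uring_openat(struct io_uring *ring, const char *path)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int fd;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
        io_uring_submit(ring);

        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        fd = cqe->res;          /* new descriptor, or -errno */
        io_uring_cqe_seen(ring, cqe);
        return fd;
}
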
2667
3e4827b0
JA
2668static int io_epoll_ctl_prep(struct io_kiocb *req,
2669 const struct io_uring_sqe *sqe)
2670{
2671#if defined(CONFIG_EPOLL)
2672 if (sqe->ioprio || sqe->buf_index)
2673 return -EINVAL;
2674
2675 req->epoll.epfd = READ_ONCE(sqe->fd);
2676 req->epoll.op = READ_ONCE(sqe->len);
2677 req->epoll.fd = READ_ONCE(sqe->off);
2678
2679 if (ep_op_has_event(req->epoll.op)) {
2680 struct epoll_event __user *ev;
2681
2682 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
2683 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
2684 return -EFAULT;
2685 }
2686
2687 return 0;
2688#else
2689 return -EOPNOTSUPP;
2690#endif
2691}
2692
2693static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
2694 bool force_nonblock)
2695{
2696#if defined(CONFIG_EPOLL)
2697 struct io_epoll *ie = &req->epoll;
2698 int ret;
2699
2700 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
2701 if (force_nonblock && ret == -EAGAIN)
2702 return -EAGAIN;
2703
2704 if (ret < 0)
2705 req_set_fail_links(req);
2706 io_cqring_add_event(req, ret);
2707 io_put_req_find_next(req, nxt);
2708 return 0;
2709#else
2710 return -EOPNOTSUPP;
2711#endif
2712}
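
io_epoll_ctl() lets a ring manage an epoll set without a separate syscall; do_epoll_ctl() is first tried non-blocking, so most modifications complete inline. A sketch, assuming a liburing recent enough to provide io_uring_prep_epoll_ctl() (the wrapper name is ours):

#include <sys/epoll.h>
#include <liburing.h>

/* Add 'watch_fd' to 'epfd' through the ring instead of epoll_ctl(2).
 * The kernel copies the event during submission (io_epoll_ctl_prep above),
 * so a stack epoll_event is fine here. */
static int uring_epoll_add(struct io_uring *ring, int epfd, int watch_fd)
{
        struct io_uring_sqe *sqe;
        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.fd = watch_fd,
        };

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_epoll_ctl(sqe, epfd, watch_fd, EPOLL_CTL_ADD, &ev);
        return io_uring_submit(ring);
}
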
2713
c1ca757b
JA
2714static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2715{
2716#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2717 if (sqe->ioprio || sqe->buf_index || sqe->off)
2718 return -EINVAL;
2719
2720 req->madvise.addr = READ_ONCE(sqe->addr);
2721 req->madvise.len = READ_ONCE(sqe->len);
2722 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2723 return 0;
2724#else
2725 return -EOPNOTSUPP;
2726#endif
2727}
2728
2729static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
2730 bool force_nonblock)
2731{
2732#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2733 struct io_madvise *ma = &req->madvise;
2734 int ret;
2735
2736 if (force_nonblock)
2737 return -EAGAIN;
2738
2739 ret = do_madvise(ma->addr, ma->len, ma->advice);
2740 if (ret < 0)
2741 req_set_fail_links(req);
2742 io_cqring_add_event(req, ret);
2743 io_put_req_find_next(req, nxt);
2744 return 0;
2745#else
2746 return -EOPNOTSUPP;
2747#endif
2748}
2749
4840e418
JA
2750static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2751{
2752 if (sqe->ioprio || sqe->buf_index || sqe->addr)
2753 return -EINVAL;
2754
2755 req->fadvise.offset = READ_ONCE(sqe->off);
2756 req->fadvise.len = READ_ONCE(sqe->len);
2757 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
2758 return 0;
2759}
2760
2761static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
2762 bool force_nonblock)
2763{
2764 struct io_fadvise *fa = &req->fadvise;
2765 int ret;
2766
3e69426d
JA
2767 if (force_nonblock) {
2768 switch (fa->advice) {
2769 case POSIX_FADV_NORMAL:
2770 case POSIX_FADV_RANDOM:
2771 case POSIX_FADV_SEQUENTIAL:
2772 break;
2773 default:
2774 return -EAGAIN;
2775 }
2776 }
4840e418
JA
2777
2778 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
2779 if (ret < 0)
2780 req_set_fail_links(req);
2781 io_cqring_add_event(req, ret);
2782 io_put_req_find_next(req, nxt);
2783 return 0;
2784}
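
io_fadvise() completes POSIX_FADV_NORMAL, RANDOM and SEQUENTIAL inline even on the non-blocking submission path, presumably because they only adjust readahead hints; other advice values (e.g. POSIX_FADV_DONTNEED) are punted to the worker. A small sketch, assuming liburing's io_uring_prep_fadvise() (the wrapper name is ours):

#include <fcntl.h>
#include <liburing.h>

/* Hint that 'fd' will be read sequentially; this advice value normally
 * completes inline per the whitelist in io_fadvise() above. */
static int uring_fadvise_sequential(struct io_uring *ring, int fd)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fadvise(sqe, fd, 0, 0, POSIX_FADV_SEQUENTIAL);
        return io_uring_submit(ring);
}
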
2785
eddc7ef5
JA
2786static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2787{
f8748881 2788 const char __user *fname;
eddc7ef5
JA
2789 unsigned lookup_flags;
2790 int ret;
2791
2792 if (sqe->ioprio || sqe->buf_index)
2793 return -EINVAL;
cf3040ca
JA
2794 if (sqe->flags & IOSQE_FIXED_FILE)
2795 return -EBADF;
0bdbdd08
PB
2796 if (req->flags & REQ_F_NEED_CLEANUP)
2797 return 0;
eddc7ef5
JA
2798
2799 req->open.dfd = READ_ONCE(sqe->fd);
2800 req->open.mask = READ_ONCE(sqe->len);
f8748881 2801 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
eddc7ef5 2802 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
c12cedf2 2803 req->open.how.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 2804
c12cedf2 2805 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
eddc7ef5
JA
2806 return -EINVAL;
2807
f8748881 2808 req->open.filename = getname_flags(fname, lookup_flags, NULL);
eddc7ef5
JA
2809 if (IS_ERR(req->open.filename)) {
2810 ret = PTR_ERR(req->open.filename);
2811 req->open.filename = NULL;
2812 return ret;
2813 }
2814
8fef80bf 2815 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
2816 return 0;
2817}
2818
2819static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
2820 bool force_nonblock)
2821{
2822 struct io_open *ctx = &req->open;
2823 unsigned lookup_flags;
2824 struct path path;
2825 struct kstat stat;
2826 int ret;
2827
2828 if (force_nonblock)
2829 return -EAGAIN;
2830
c12cedf2 2831 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
eddc7ef5
JA
2832 return -EINVAL;
2833
2834retry:
2835 /* filename_lookup() drops it, keep a reference */
2836 ctx->filename->refcnt++;
2837
2838 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
2839 NULL);
2840 if (ret)
2841 goto err;
2842
c12cedf2 2843 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
eddc7ef5
JA
2844 path_put(&path);
2845 if (retry_estale(ret, lookup_flags)) {
2846 lookup_flags |= LOOKUP_REVAL;
2847 goto retry;
2848 }
2849 if (!ret)
2850 ret = cp_statx(&stat, ctx->buffer);
2851err:
2852 putname(ctx->filename);
8fef80bf 2853 req->flags &= ~REQ_F_NEED_CLEANUP;
eddc7ef5
JA
2854 if (ret < 0)
2855 req_set_fail_links(req);
2856 io_cqring_add_event(req, ret);
2857 io_put_req_find_next(req, nxt);
2858 return 0;
2859}
2860
b5dba59e
JA
2861static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2862{
2863 /*
2864 * If we queue this for async, it must not be cancellable. That would
2865 * leave the 'file' in an indeterminate state.
2866 */
2867 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
2868
2869 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
2870 sqe->rw_flags || sqe->buf_index)
2871 return -EINVAL;
2872 if (sqe->flags & IOSQE_FIXED_FILE)
cf3040ca 2873 return -EBADF;
b5dba59e
JA
2874
2875 req->close.fd = READ_ONCE(sqe->fd);
2876 if (req->file->f_op == &io_uring_fops ||
b14cca0c 2877 req->close.fd == req->ctx->ring_fd)
b5dba59e
JA
2878 return -EBADF;
2879
2880 return 0;
2881}
2882
a93b3331
PB
2883/* only called when __close_fd_get_file() is done */
2884static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
2885{
2886 int ret;
2887
2888 ret = filp_close(req->close.put_file, req->work.files);
2889 if (ret < 0)
2890 req_set_fail_links(req);
2891 io_cqring_add_event(req, ret);
2892 fput(req->close.put_file);
2893 io_put_req_find_next(req, nxt);
2894}
2895
b5dba59e
JA
2896static void io_close_finish(struct io_wq_work **workptr)
2897{
2898 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2899 struct io_kiocb *nxt = NULL;
2900
a93b3331 2901 __io_close_finish(req, &nxt);
b5dba59e
JA
2902 if (nxt)
2903 io_wq_assign_next(workptr, nxt);
2904}
2905
2906static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
2907 bool force_nonblock)
2908{
2909 int ret;
2910
2911 req->close.put_file = NULL;
2912 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
2913 if (ret < 0)
2914 return ret;
2915
2916 /* if the file has a flush method, be safe and punt to async */
f86cd20c 2917 if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
b5dba59e 2918 goto eagain;
b5dba59e
JA
2919
2920 /*
2921 * No ->flush(), safely close from here and just punt the
2922 * fput() to async context.
2923 */
a93b3331
PB
2924 __io_close_finish(req, nxt);
2925 return 0;
b5dba59e
JA
2926eagain:
2927 req->work.func = io_close_finish;
1a417f4e
JA
2928 /*
2929 * Do manual async queue here to avoid grabbing files - we don't
2930 * need the files, and it'll cause io_close_finish() to close
2931 * the file again and cause a double CQE entry for this request
2932 */
2933 io_queue_async_work(req);
2934 return 0;
b5dba59e
JA
2935}
2936
3529d8c2 2937static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
2938{
2939 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4
JA
2940
2941 if (!req->file)
2942 return -EBADF;
5d17b4a4
JA
2943
2944 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2945 return -EINVAL;
2946 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2947 return -EINVAL;
2948
8ed8d3c3
JA
2949 req->sync.off = READ_ONCE(sqe->off);
2950 req->sync.len = READ_ONCE(sqe->len);
2951 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
2952 return 0;
2953}
2954
2955static void io_sync_file_range_finish(struct io_wq_work **workptr)
2956{
2957 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2958 struct io_kiocb *nxt = NULL;
2959 int ret;
2960
2961 if (io_req_cancelled(req))
2962 return;
2963
9adbd45d 2964 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
2965 req->sync.flags);
2966 if (ret < 0)
2967 req_set_fail_links(req);
2968 io_cqring_add_event(req, ret);
2969 io_put_req_find_next(req, &nxt);
2970 if (nxt)
78912934 2971 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
2972}
2973
fc4df999 2974static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
5d17b4a4
JA
2975 bool force_nonblock)
2976{
8ed8d3c3 2977 struct io_wq_work *work, *old_work;
5d17b4a4
JA
2978
2979 /* sync_file_range always requires a blocking context */
8ed8d3c3
JA
2980 if (force_nonblock) {
2981 io_put_req(req);
2982 req->work.func = io_sync_file_range_finish;
5d17b4a4 2983 return -EAGAIN;
8ed8d3c3 2984 }
5d17b4a4 2985
8ed8d3c3
JA
2986 work = old_work = &req->work;
2987 io_sync_file_range_finish(&work);
2988 if (work && work != old_work)
2989 *nxt = container_of(work, struct io_kiocb, work);
5d17b4a4
JA
2990 return 0;
2991}
2992
3529d8c2 2993static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 2994{
0fa03c62 2995#if defined(CONFIG_NET)
e47293fd 2996 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 2997 struct io_async_ctx *io = req->io;
99bc4c38 2998 int ret;
03b1230c 2999
e47293fd
JA
3000 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3001 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 3002 sr->len = READ_ONCE(sqe->len);
3529d8c2 3003
fddaface 3004 if (!io || req->opcode == IORING_OP_SEND)
3529d8c2 3005 return 0;
5f798bea
PB
3006 /* iovec is already imported */
3007 if (req->flags & REQ_F_NEED_CLEANUP)
3008 return 0;
3529d8c2 3009
d9688565 3010 io->msg.iov = io->msg.fast_iov;
99bc4c38 3011 ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3012 &io->msg.iov);
99bc4c38
PB
3013 if (!ret)
3014 req->flags |= REQ_F_NEED_CLEANUP;
3015 return ret;
03b1230c 3016#else
e47293fd 3017 return -EOPNOTSUPP;
03b1230c
JA
3018#endif
3019}
3020
fc4df999
JA
3021static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
3022 bool force_nonblock)
aa1fa28f 3023{
03b1230c 3024#if defined(CONFIG_NET)
0b416c3e 3025 struct io_async_msghdr *kmsg = NULL;
0fa03c62
JA
3026 struct socket *sock;
3027 int ret;
3028
3029 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3030 return -EINVAL;
3031
3032 sock = sock_from_file(req->file, &ret);
3033 if (sock) {
b7bb4f7d 3034 struct io_async_ctx io;
03b1230c 3035 struct sockaddr_storage addr;
0fa03c62
JA
3036 unsigned flags;
3037
03b1230c 3038 if (req->io) {
0b416c3e
JA
3039 kmsg = &req->io->msg;
3040 kmsg->msg.msg_name = &addr;
3041 /* if iov is set, it's allocated already */
3042 if (!kmsg->iov)
3043 kmsg->iov = kmsg->fast_iov;
3044 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3045 } else {
3529d8c2
JA
3046 struct io_sr_msg *sr = &req->sr_msg;
3047
0b416c3e
JA
3048 kmsg = &io.msg;
3049 kmsg->msg.msg_name = &addr;
3529d8c2
JA
3050
3051 io.msg.iov = io.msg.fast_iov;
3052 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3053 sr->msg_flags, &io.msg.iov);
03b1230c 3054 if (ret)
3529d8c2 3055 return ret;
03b1230c 3056 }
0fa03c62 3057
e47293fd
JA
3058 flags = req->sr_msg.msg_flags;
3059 if (flags & MSG_DONTWAIT)
3060 req->flags |= REQ_F_NOWAIT;
3061 else if (force_nonblock)
3062 flags |= MSG_DONTWAIT;
3063
0b416c3e 3064 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
03b1230c 3065 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
3066 if (req->io)
3067 return -EAGAIN;
1e95081c
PB
3068 if (io_alloc_async_ctx(req)) {
3069 if (kmsg && kmsg->iov != kmsg->fast_iov)
3070 kfree(kmsg->iov);
b7bb4f7d 3071 return -ENOMEM;
1e95081c 3072 }
99bc4c38 3073 req->flags |= REQ_F_NEED_CLEANUP;
b7bb4f7d 3074 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
0b416c3e 3075 return -EAGAIN;
03b1230c 3076 }
441cdbd5
JA
3077 if (ret == -ERESTARTSYS)
3078 ret = -EINTR;
0fa03c62
JA
3079 }
3080
1e95081c 3081 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3082 kfree(kmsg->iov);
99bc4c38 3083 req->flags &= ~REQ_F_NEED_CLEANUP;
78e19bbe 3084 io_cqring_add_event(req, ret);
4e88d6e7
JA
3085 if (ret < 0)
3086 req_set_fail_links(req);
ec9c02ad 3087 io_put_req_find_next(req, nxt);
5d17b4a4 3088 return 0;
03b1230c
JA
3089#else
3090 return -EOPNOTSUPP;
aa1fa28f 3091#endif
03b1230c 3092}
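
On the wire this is just sendmsg(2) funnelled through the ring; the msghdr and iovec are copied during submission (that is what the async context above is for), so only the payload buffer has to outlive the submit call. A liburing sketch (io_uring_prep_sendmsg() is a liburing helper; the wrapper name is ours):

#include <sys/socket.h>
#include <sys/uio.h>
#include <liburing.h>

/* Send one buffer over a connected socket via IORING_OP_SENDMSG. */
static int uring_sendmsg(struct io_uring *ring, int sockfd,
                         const void *buf, size_t len)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
        int ret;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_sendmsg(sqe, sockfd, &msg, 0);
        io_uring_submit(ring);

        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
                return ret;
        ret = cqe->res;                 /* bytes sent, or -errno */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}
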
aa1fa28f 3093
fddaface
JA
3094static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
3095 bool force_nonblock)
3096{
3097#if defined(CONFIG_NET)
3098 struct socket *sock;
3099 int ret;
3100
3101 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3102 return -EINVAL;
3103
3104 sock = sock_from_file(req->file, &ret);
3105 if (sock) {
3106 struct io_sr_msg *sr = &req->sr_msg;
3107 struct msghdr msg;
3108 struct iovec iov;
3109 unsigned flags;
3110
3111 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3112 &msg.msg_iter);
3113 if (ret)
3114 return ret;
3115
3116 msg.msg_name = NULL;
3117 msg.msg_control = NULL;
3118 msg.msg_controllen = 0;
3119 msg.msg_namelen = 0;
3120
3121 flags = req->sr_msg.msg_flags;
3122 if (flags & MSG_DONTWAIT)
3123 req->flags |= REQ_F_NOWAIT;
3124 else if (force_nonblock)
3125 flags |= MSG_DONTWAIT;
3126
0b7b21e4
JA
3127 msg.msg_flags = flags;
3128 ret = sock_sendmsg(sock, &msg);
fddaface
JA
3129 if (force_nonblock && ret == -EAGAIN)
3130 return -EAGAIN;
3131 if (ret == -ERESTARTSYS)
3132 ret = -EINTR;
3133 }
3134
3135 io_cqring_add_event(req, ret);
3136 if (ret < 0)
3137 req_set_fail_links(req);
3138 io_put_req_find_next(req, nxt);
3139 return 0;
3140#else
3141 return -EOPNOTSUPP;
3142#endif
3143}
3144
3529d8c2
JA
3145static int io_recvmsg_prep(struct io_kiocb *req,
3146 const struct io_uring_sqe *sqe)
aa1fa28f
JA
3147{
3148#if defined(CONFIG_NET)
e47293fd 3149 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 3150 struct io_async_ctx *io = req->io;
99bc4c38 3151 int ret;
3529d8c2
JA
3152
3153 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3154 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 3155 sr->len = READ_ONCE(sqe->len);
06b76d44 3156
fddaface 3157 if (!io || req->opcode == IORING_OP_RECV)
06b76d44 3158 return 0;
5f798bea
PB
3159 /* iovec is already imported */
3160 if (req->flags & REQ_F_NEED_CLEANUP)
3161 return 0;
03b1230c 3162
d9688565 3163 io->msg.iov = io->msg.fast_iov;
99bc4c38 3164 ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3165 &io->msg.uaddr, &io->msg.iov);
99bc4c38
PB
3166 if (!ret)
3167 req->flags |= REQ_F_NEED_CLEANUP;
3168 return ret;
aa1fa28f 3169#else
e47293fd 3170 return -EOPNOTSUPP;
aa1fa28f
JA
3171#endif
3172}
3173
fc4df999
JA
3174static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
3175 bool force_nonblock)
aa1fa28f
JA
3176{
3177#if defined(CONFIG_NET)
0b416c3e 3178 struct io_async_msghdr *kmsg = NULL;
03b1230c
JA
3179 struct socket *sock;
3180 int ret;
3181
3182 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3183 return -EINVAL;
3184
3185 sock = sock_from_file(req->file, &ret);
3186 if (sock) {
b7bb4f7d 3187 struct io_async_ctx io;
03b1230c 3188 struct sockaddr_storage addr;
03b1230c
JA
3189 unsigned flags;
3190
03b1230c 3191 if (req->io) {
0b416c3e
JA
3192 kmsg = &req->io->msg;
3193 kmsg->msg.msg_name = &addr;
3194 /* if iov is set, it's allocated already */
3195 if (!kmsg->iov)
3196 kmsg->iov = kmsg->fast_iov;
3197 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3198 } else {
3529d8c2
JA
3199 struct io_sr_msg *sr = &req->sr_msg;
3200
0b416c3e
JA
3201 kmsg = &io.msg;
3202 kmsg->msg.msg_name = &addr;
3529d8c2
JA
3203
3204 io.msg.iov = io.msg.fast_iov;
3205 ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
3206 sr->msg_flags, &io.msg.uaddr,
3207 &io.msg.iov);
03b1230c 3208 if (ret)
3529d8c2 3209 return ret;
03b1230c
JA
3210 }
3211
e47293fd
JA
3212 flags = req->sr_msg.msg_flags;
3213 if (flags & MSG_DONTWAIT)
3214 req->flags |= REQ_F_NOWAIT;
3215 else if (force_nonblock)
3216 flags |= MSG_DONTWAIT;
3217
3218 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3219 kmsg->uaddr, flags);
03b1230c 3220 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
3221 if (req->io)
3222 return -EAGAIN;
1e95081c
PB
3223 if (io_alloc_async_ctx(req)) {
3224 if (kmsg && kmsg->iov != kmsg->fast_iov)
3225 kfree(kmsg->iov);
b7bb4f7d 3226 return -ENOMEM;
1e95081c 3227 }
b7bb4f7d 3228 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
99bc4c38 3229 req->flags |= REQ_F_NEED_CLEANUP;
0b416c3e 3230 return -EAGAIN;
03b1230c
JA
3231 }
3232 if (ret == -ERESTARTSYS)
3233 ret = -EINTR;
3234 }
3235
1e95081c 3236 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3237 kfree(kmsg->iov);
99bc4c38 3238 req->flags &= ~REQ_F_NEED_CLEANUP;
03b1230c 3239 io_cqring_add_event(req, ret);
4e88d6e7
JA
3240 if (ret < 0)
3241 req_set_fail_links(req);
03b1230c
JA
3242 io_put_req_find_next(req, nxt);
3243 return 0;
0fa03c62
JA
3244#else
3245 return -EOPNOTSUPP;
3246#endif
3247}
5d17b4a4 3248
fddaface
JA
3249static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
3250 bool force_nonblock)
3251{
3252#if defined(CONFIG_NET)
3253 struct socket *sock;
3254 int ret;
3255
3256 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3257 return -EINVAL;
3258
3259 sock = sock_from_file(req->file, &ret);
3260 if (sock) {
3261 struct io_sr_msg *sr = &req->sr_msg;
3262 struct msghdr msg;
3263 struct iovec iov;
3264 unsigned flags;
3265
3266 ret = import_single_range(READ, sr->buf, sr->len, &iov,
3267 &msg.msg_iter);
3268 if (ret)
3269 return ret;
3270
3271 msg.msg_name = NULL;
3272 msg.msg_control = NULL;
3273 msg.msg_controllen = 0;
3274 msg.msg_namelen = 0;
3275 msg.msg_iocb = NULL;
3276 msg.msg_flags = 0;
3277
3278 flags = req->sr_msg.msg_flags;
3279 if (flags & MSG_DONTWAIT)
3280 req->flags |= REQ_F_NOWAIT;
3281 else if (force_nonblock)
3282 flags |= MSG_DONTWAIT;
3283
0b7b21e4 3284 ret = sock_recvmsg(sock, &msg, flags);
fddaface
JA
3285 if (force_nonblock && ret == -EAGAIN)
3286 return -EAGAIN;
3287 if (ret == -ERESTARTSYS)
3288 ret = -EINTR;
3289 }
3290
3291 io_cqring_add_event(req, ret);
3292 if (ret < 0)
3293 req_set_fail_links(req);
3294 io_put_req_find_next(req, nxt);
3295 return 0;
3296#else
3297 return -EOPNOTSUPP;
3298#endif
3299}
3300
3301
3529d8c2 3302static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35
JA
3303{
3304#if defined(CONFIG_NET)
8ed8d3c3
JA
3305 struct io_accept *accept = &req->accept;
3306
17f2fe35
JA
3307 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3308 return -EINVAL;
8042d6ce 3309 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
3310 return -EINVAL;
3311
d55e5f5b
JA
3312 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3313 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 3314 accept->flags = READ_ONCE(sqe->accept_flags);
8ed8d3c3
JA
3315 return 0;
3316#else
3317 return -EOPNOTSUPP;
3318#endif
3319}
17f2fe35 3320
8ed8d3c3
JA
3321#if defined(CONFIG_NET)
3322static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3323 bool force_nonblock)
3324{
3325 struct io_accept *accept = &req->accept;
3326 unsigned file_flags;
3327 int ret;
3328
3329 file_flags = force_nonblock ? O_NONBLOCK : 0;
3330 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3331 accept->addr_len, accept->flags);
3332 if (ret == -EAGAIN && force_nonblock)
17f2fe35 3333 return -EAGAIN;
8e3cca12
JA
3334 if (ret == -ERESTARTSYS)
3335 ret = -EINTR;
4e88d6e7
JA
3336 if (ret < 0)
3337 req_set_fail_links(req);
78e19bbe 3338 io_cqring_add_event(req, ret);
ec9c02ad 3339 io_put_req_find_next(req, nxt);
17f2fe35 3340 return 0;
8ed8d3c3
JA
3341}
3342
3343static void io_accept_finish(struct io_wq_work **workptr)
3344{
3345 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3346 struct io_kiocb *nxt = NULL;
3347
3348 if (io_req_cancelled(req))
3349 return;
3350 __io_accept(req, &nxt, false);
3351 if (nxt)
78912934 3352 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
3353}
3354#endif
3355
3356static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3357 bool force_nonblock)
3358{
3359#if defined(CONFIG_NET)
3360 int ret;
3361
8ed8d3c3
JA
3362 ret = __io_accept(req, nxt, force_nonblock);
3363 if (ret == -EAGAIN && force_nonblock) {
3364 req->work.func = io_accept_finish;
8ed8d3c3
JA
3365 io_put_req(req);
3366 return -EAGAIN;
3367 }
3368 return 0;
0fa03c62
JA
3369#else
3370 return -EOPNOTSUPP;
3371#endif
3372}
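
Accept shows the common punt pattern in miniature: try a non-blocking accept inline, and on -EAGAIN hand the request to io-wq via io_accept_finish(). From userspace it is simply an asynchronous accept4(2). A sketch using liburing's io_uring_prep_accept(); the wrapper name is ours:

#include <sys/socket.h>
#include <liburing.h>

/* Accept one connection on 'listen_fd'; returns the new socket fd. */
static int uring_accept(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct sockaddr_storage addr;
        socklen_t addrlen = sizeof(addr);
        int fd;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_accept(sqe, listen_fd, (struct sockaddr *)&addr,
                             &addrlen, 0);
        io_uring_submit(ring);

        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        fd = cqe->res;                  /* accepted socket, or -errno */
        io_uring_cqe_seen(ring, cqe);
        return fd;
}
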
5d17b4a4 3373
3529d8c2 3374static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021
JA
3375{
3376#if defined(CONFIG_NET)
3529d8c2
JA
3377 struct io_connect *conn = &req->connect;
3378 struct io_async_ctx *io = req->io;
f499a021 3379
3fbb51c1
JA
3380 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3381 return -EINVAL;
3382 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3383 return -EINVAL;
3384
3529d8c2
JA
3385 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3386 conn->addr_len = READ_ONCE(sqe->addr2);
3387
3388 if (!io)
3389 return 0;
3390
3391 return move_addr_to_kernel(conn->addr, conn->addr_len,
3fbb51c1 3392 &io->connect.address);
f499a021 3393#else
3fbb51c1 3394 return -EOPNOTSUPP;
f499a021
JA
3395#endif
3396}
3397
fc4df999
JA
3398static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
3399 bool force_nonblock)
f8e85cf2
JA
3400{
3401#if defined(CONFIG_NET)
f499a021 3402 struct io_async_ctx __io, *io;
f8e85cf2 3403 unsigned file_flags;
3fbb51c1 3404 int ret;
f8e85cf2 3405
f499a021
JA
3406 if (req->io) {
3407 io = req->io;
3408 } else {
3529d8c2
JA
3409 ret = move_addr_to_kernel(req->connect.addr,
3410 req->connect.addr_len,
3411 &__io.connect.address);
f499a021
JA
3412 if (ret)
3413 goto out;
3414 io = &__io;
3415 }
3416
3fbb51c1
JA
3417 file_flags = force_nonblock ? O_NONBLOCK : 0;
3418
3419 ret = __sys_connect_file(req->file, &io->connect.address,
3420 req->connect.addr_len, file_flags);
87f80d62 3421 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
b7bb4f7d
JA
3422 if (req->io)
3423 return -EAGAIN;
3424 if (io_alloc_async_ctx(req)) {
f499a021
JA
3425 ret = -ENOMEM;
3426 goto out;
3427 }
b7bb4f7d 3428 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
f8e85cf2 3429 return -EAGAIN;
f499a021 3430 }
f8e85cf2
JA
3431 if (ret == -ERESTARTSYS)
3432 ret = -EINTR;
f499a021 3433out:
4e88d6e7
JA
3434 if (ret < 0)
3435 req_set_fail_links(req);
f8e85cf2
JA
3436 io_cqring_add_event(req, ret);
3437 io_put_req_find_next(req, nxt);
3438 return 0;
3439#else
3440 return -EOPNOTSUPP;
3441#endif
3442}
3443
221c5eb2
JA
3444static void io_poll_remove_one(struct io_kiocb *req)
3445{
3446 struct io_poll_iocb *poll = &req->poll;
3447
3448 spin_lock(&poll->head->lock);
3449 WRITE_ONCE(poll->canceled, true);
392edb45
JA
3450 if (!list_empty(&poll->wait.entry)) {
3451 list_del_init(&poll->wait.entry);
a197f664 3452 io_queue_async_work(req);
221c5eb2
JA
3453 }
3454 spin_unlock(&poll->head->lock);
78076bb6 3455 hash_del(&req->hash_node);
221c5eb2
JA
3456}
3457
3458static void io_poll_remove_all(struct io_ring_ctx *ctx)
3459{
78076bb6 3460 struct hlist_node *tmp;
221c5eb2 3461 struct io_kiocb *req;
78076bb6 3462 int i;
221c5eb2
JA
3463
3464 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
3465 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3466 struct hlist_head *list;
3467
3468 list = &ctx->cancel_hash[i];
3469 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3470 io_poll_remove_one(req);
221c5eb2
JA
3471 }
3472 spin_unlock_irq(&ctx->completion_lock);
3473}
3474
47f46768
JA
3475static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3476{
78076bb6 3477 struct hlist_head *list;
47f46768
JA
3478 struct io_kiocb *req;
3479
78076bb6
JA
3480 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3481 hlist_for_each_entry(req, list, hash_node) {
3482 if (sqe_addr == req->user_data) {
eac406c6
JA
3483 io_poll_remove_one(req);
3484 return 0;
3485 }
47f46768
JA
3486 }
3487
3488 return -ENOENT;
3489}
3490
3529d8c2
JA
3491static int io_poll_remove_prep(struct io_kiocb *req,
3492 const struct io_uring_sqe *sqe)
0969e783 3493{
0969e783
JA
3494 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3495 return -EINVAL;
3496 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3497 sqe->poll_events)
3498 return -EINVAL;
3499
3500 req->poll.addr = READ_ONCE(sqe->addr);
0969e783
JA
3501 return 0;
3502}
3503
221c5eb2
JA
3504/*
3505 * Find a running poll command that matches one specified in sqe->addr,
3506 * and remove it if found.
3507 */
fc4df999 3508static int io_poll_remove(struct io_kiocb *req)
221c5eb2
JA
3509{
3510 struct io_ring_ctx *ctx = req->ctx;
0969e783 3511 u64 addr;
47f46768 3512 int ret;
221c5eb2 3513
0969e783 3514 addr = req->poll.addr;
221c5eb2 3515 spin_lock_irq(&ctx->completion_lock);
0969e783 3516 ret = io_poll_cancel(ctx, addr);
221c5eb2
JA
3517 spin_unlock_irq(&ctx->completion_lock);
3518
78e19bbe 3519 io_cqring_add_event(req, ret);
4e88d6e7
JA
3520 if (ret < 0)
3521 req_set_fail_links(req);
e65ef56d 3522 io_put_req(req);
221c5eb2
JA
3523 return 0;
3524}
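
The cancellation key here is the user_data of the original poll request: io_poll_cancel() hashes sqe->addr and compares it against req->user_data, so a POLL_REMOVE must carry exactly the value the POLL_ADD was tagged with. A hedged liburing sketch (the poll prep helper signatures have varied across liburing versions; the wrapper name is ours):

#include <poll.h>
#include <liburing.h>

/* Arm a one-shot POLLIN poll tagged with 'tag', then cancel it again.
 * The remove request addresses the poll by that same user_data value. */
static void poll_add_then_remove(struct io_uring *ring, int fd, void *tag)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_poll_add(sqe, fd, POLLIN);
        io_uring_sqe_set_data(sqe, tag);
        io_uring_submit(ring);

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_poll_remove(sqe, tag);    /* matches on user_data */
        io_uring_submit(ring);
        /* two CQEs follow: the cancelled poll (-ECANCELED) and the remove */
}
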
3525
b0dd8a41 3526static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
221c5eb2 3527{
a197f664
JL
3528 struct io_ring_ctx *ctx = req->ctx;
3529
8c838788 3530 req->poll.done = true;
b0dd8a41
JA
3531 if (error)
3532 io_cqring_fill_event(req, error);
3533 else
3534 io_cqring_fill_event(req, mangle_poll(mask));
8c838788 3535 io_commit_cqring(ctx);
221c5eb2
JA
3536}
3537
561fb04a 3538static void io_poll_complete_work(struct io_wq_work **workptr)
221c5eb2 3539{
561fb04a 3540 struct io_wq_work *work = *workptr;
221c5eb2
JA
3541 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3542 struct io_poll_iocb *poll = &req->poll;
3543 struct poll_table_struct pt = { ._key = poll->events };
3544 struct io_ring_ctx *ctx = req->ctx;
89723d0b 3545 struct io_kiocb *nxt = NULL;
221c5eb2 3546 __poll_t mask = 0;
b0dd8a41 3547 int ret = 0;
221c5eb2 3548
b0dd8a41 3549 if (work->flags & IO_WQ_WORK_CANCEL) {
561fb04a 3550 WRITE_ONCE(poll->canceled, true);
b0dd8a41
JA
3551 ret = -ECANCELED;
3552 } else if (READ_ONCE(poll->canceled)) {
3553 ret = -ECANCELED;
3554 }
561fb04a 3555
b0dd8a41 3556 if (ret != -ECANCELED)
221c5eb2
JA
3557 mask = vfs_poll(poll->file, &pt) & poll->events;
3558
3559 /*
3560 * Note that ->ki_cancel callers also delete iocb from active_reqs after
3561 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
3562 * synchronize with them. In the cancellation case the list_del_init
3563 * itself is not actually needed, but harmless so we keep it in to
3564 * avoid further branches in the fast path.
3565 */
3566 spin_lock_irq(&ctx->completion_lock);
b0dd8a41 3567 if (!mask && ret != -ECANCELED) {
392edb45 3568 add_wait_queue(poll->head, &poll->wait);
221c5eb2
JA
3569 spin_unlock_irq(&ctx->completion_lock);
3570 return;
3571 }
78076bb6 3572 hash_del(&req->hash_node);
b0dd8a41 3573 io_poll_complete(req, mask, ret);
221c5eb2
JA
3574 spin_unlock_irq(&ctx->completion_lock);
3575
8c838788 3576 io_cqring_ev_posted(ctx);
89723d0b 3577
4e88d6e7
JA
3578 if (ret < 0)
3579 req_set_fail_links(req);
ec9c02ad 3580 io_put_req_find_next(req, &nxt);
89723d0b 3581 if (nxt)
78912934 3582 io_wq_assign_next(workptr, nxt);
221c5eb2
JA
3583}
3584
e94f141b
JA
3585static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
3586{
e94f141b 3587 struct io_kiocb *req, *tmp;
8237e045 3588 struct req_batch rb;
e94f141b 3589
c6ca97b3 3590 rb.to_free = rb.need_iter = 0;
e94f141b
JA
3591 spin_lock_irq(&ctx->completion_lock);
3592 llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
3593 hash_del(&req->hash_node);
3594 io_poll_complete(req, req->result, 0);
3595
8237e045
JA
3596 if (refcount_dec_and_test(&req->refs) &&
3597 !io_req_multi_free(&rb, req)) {
3598 req->flags |= REQ_F_COMP_LOCKED;
3599 io_free_req(req);
e94f141b
JA
3600 }
3601 }
3602 spin_unlock_irq(&ctx->completion_lock);
3603
3604 io_cqring_ev_posted(ctx);
8237e045 3605 io_free_req_many(ctx, &rb);
e94f141b
JA
3606}
3607
3608static void io_poll_flush(struct io_wq_work **workptr)
3609{
3610 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3611 struct llist_node *nodes;
3612
3613 nodes = llist_del_all(&req->ctx->poll_llist);
3614 if (nodes)
3615 __io_poll_flush(req->ctx, nodes);
3616}
3617
f0b493e6
JA
3618static void io_poll_trigger_evfd(struct io_wq_work **workptr)
3619{
3620 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3621
3622 eventfd_signal(req->ctx->cq_ev_fd, 1);
3623 io_put_req(req);
3624}
3625
221c5eb2
JA
3626static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3627 void *key)
3628{
e944475e 3629 struct io_poll_iocb *poll = wait->private;
221c5eb2
JA
3630 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
3631 struct io_ring_ctx *ctx = req->ctx;
3632 __poll_t mask = key_to_poll(key);
221c5eb2
JA
3633
3634 /* for instances that support it check for an event match first: */
8c838788
JA
3635 if (mask && !(mask & poll->events))
3636 return 0;
221c5eb2 3637
392edb45 3638 list_del_init(&poll->wait.entry);
221c5eb2 3639
7c9e7f0f
JA
3640 /*
3641 * Run completion inline if we can. We're using trylock here because
3642 * we are violating the completion_lock -> poll wq lock ordering.
3643 * If we have a link timeout we're going to need the completion_lock
3644 * for finalizing the request, mark us as having grabbed that already.
3645 */
e94f141b
JA
3646 if (mask) {
3647 unsigned long flags;
221c5eb2 3648
e94f141b
JA
3649 if (llist_empty(&ctx->poll_llist) &&
3650 spin_trylock_irqsave(&ctx->completion_lock, flags)) {
f0b493e6
JA
3651 bool trigger_ev;
3652
e94f141b
JA
3653 hash_del(&req->hash_node);
3654 io_poll_complete(req, mask, 0);
e94f141b 3655
f0b493e6
JA
3656 trigger_ev = io_should_trigger_evfd(ctx);
3657 if (trigger_ev && eventfd_signal_count()) {
3658 trigger_ev = false;
3659 req->work.func = io_poll_trigger_evfd;
3660 } else {
3661 req->flags |= REQ_F_COMP_LOCKED;
3662 io_put_req(req);
3663 req = NULL;
3664 }
3665 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3666 __io_cqring_ev_posted(ctx, trigger_ev);
e94f141b
JA
3667 } else {
3668 req->result = mask;
3669 req->llist_node.next = NULL;
3670 /* if the list wasn't empty, we're done */
3671 if (!llist_add(&req->llist_node, &ctx->poll_llist))
3672 req = NULL;
3673 else
3674 req->work.func = io_poll_flush;
3675 }
221c5eb2 3676 }
e94f141b
JA
3677 if (req)
3678 io_queue_async_work(req);
221c5eb2 3679
221c5eb2
JA
3680 return 1;
3681}
3682
3683struct io_poll_table {
3684 struct poll_table_struct pt;
3685 struct io_kiocb *req;
3686 int error;
3687};
3688
3689static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
3690 struct poll_table_struct *p)
3691{
3692 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3693
3694 if (unlikely(pt->req->poll.head)) {
3695 pt->error = -EINVAL;
3696 return;
3697 }
3698
3699 pt->error = 0;
3700 pt->req->poll.head = head;
392edb45 3701 add_wait_queue(head, &pt->req->poll.wait);
221c5eb2
JA
3702}
3703
eac406c6
JA
3704static void io_poll_req_insert(struct io_kiocb *req)
3705{
3706 struct io_ring_ctx *ctx = req->ctx;
78076bb6
JA
3707 struct hlist_head *list;
3708
3709 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3710 hlist_add_head(&req->hash_node, list);
eac406c6
JA
3711}
3712
3529d8c2 3713static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
3714{
3715 struct io_poll_iocb *poll = &req->poll;
221c5eb2 3716 u16 events;
221c5eb2
JA
3717
3718 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3719 return -EINVAL;
3720 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3721 return -EINVAL;
09bb8394
JA
3722 if (!poll->file)
3723 return -EBADF;
221c5eb2 3724
221c5eb2
JA
3725 events = READ_ONCE(sqe->poll_events);
3726 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
0969e783
JA
3727 return 0;
3728}
3729
3730static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
3731{
3732 struct io_poll_iocb *poll = &req->poll;
3733 struct io_ring_ctx *ctx = req->ctx;
3734 struct io_poll_table ipt;
3735 bool cancel = false;
3736 __poll_t mask;
0969e783
JA
3737
3738 INIT_IO_WORK(&req->work, io_poll_complete_work);
78076bb6 3739 INIT_HLIST_NODE(&req->hash_node);
221c5eb2 3740
221c5eb2 3741 poll->head = NULL;
8c838788 3742 poll->done = false;
221c5eb2
JA
3743 poll->canceled = false;
3744
3745 ipt.pt._qproc = io_poll_queue_proc;
3746 ipt.pt._key = poll->events;
3747 ipt.req = req;
3748 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
3749
 3750 /* initialize the list so that we can do list_empty checks */
392edb45
JA
3751 INIT_LIST_HEAD(&poll->wait.entry);
3752 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
3753 poll->wait.private = poll;
221c5eb2 3754
36703247
JA
3755 INIT_LIST_HEAD(&req->list);
3756
221c5eb2 3757 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
221c5eb2
JA
3758
3759 spin_lock_irq(&ctx->completion_lock);
8c838788
JA
3760 if (likely(poll->head)) {
3761 spin_lock(&poll->head->lock);
392edb45 3762 if (unlikely(list_empty(&poll->wait.entry))) {
8c838788
JA
3763 if (ipt.error)
3764 cancel = true;
3765 ipt.error = 0;
3766 mask = 0;
3767 }
3768 if (mask || ipt.error)
392edb45 3769 list_del_init(&poll->wait.entry);
8c838788
JA
3770 else if (cancel)
3771 WRITE_ONCE(poll->canceled, true);
3772 else if (!poll->done) /* actually waiting for an event */
eac406c6 3773 io_poll_req_insert(req);
8c838788
JA
3774 spin_unlock(&poll->head->lock);
3775 }
3776 if (mask) { /* no async, we'd stolen it */
221c5eb2 3777 ipt.error = 0;
b0dd8a41 3778 io_poll_complete(req, mask, 0);
221c5eb2 3779 }
221c5eb2
JA
3780 spin_unlock_irq(&ctx->completion_lock);
3781
8c838788
JA
3782 if (mask) {
3783 io_cqring_ev_posted(ctx);
ec9c02ad 3784 io_put_req_find_next(req, nxt);
221c5eb2 3785 }
8c838788 3786 return ipt.error;
221c5eb2
JA
3787}
3788
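io_poll_add() arms a single waitqueue entry through vfs_poll(); when the file becomes ready, io_poll_wake() completes the request with the triggered mask (or punts it to io-wq / the poll_llist flush path). A small userspace sketch of the IORING_OP_POLL_ADD side of this, assuming liburing's io_uring_prep_poll_add() helper and using stdin purely as an example fd; the user_data value is arbitrary:

#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(4, &ring, 0) < 0)
                return 1;

        /* single-shot poll on stdin; completes once POLLIN is raised */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
        sqe->user_data = 0x1234;        /* used by poll remove/cancel lookups */
        io_uring_submit(&ring);

        io_uring_wait_cqe(&ring, &cqe);
        /* cqe->res carries the ready mask passed to io_poll_complete() */
        printf("poll result mask: 0x%x\n", (unsigned) cqe->res);
        io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return 0;
}
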
5262f567
JA
3789static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
3790{
ad8a48ac
JA
3791 struct io_timeout_data *data = container_of(timer,
3792 struct io_timeout_data, timer);
3793 struct io_kiocb *req = data->req;
3794 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
3795 unsigned long flags;
3796
5262f567
JA
3797 atomic_inc(&ctx->cq_timeouts);
3798
3799 spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681a 3800 /*
11365043
JA
3801 * We could be racing with timeout deletion. If the list is empty,
3802 * then timeout lookup already found it and will be handling it.
ef03681a 3803 */
842f9612 3804 if (!list_empty(&req->list)) {
11365043 3805 struct io_kiocb *prev;
5262f567 3806
11365043
JA
3807 /*
 3808 * Adjust the sequence of the reqs queued before the current one:
d195a66e 3809 * this timeout will consume a slot in the cq_ring and advance the
11365043
JA
 3810 * cq_tail pointer, so other timeout reqs could otherwise complete
 3811 * early, without waiting for enough (wait_nr) completions.
3812 */
3813 prev = req;
3814 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
3815 prev->sequence++;
11365043 3816 list_del_init(&req->list);
11365043 3817 }
5262f567 3818
78e19bbe 3819 io_cqring_fill_event(req, -ETIME);
5262f567
JA
3820 io_commit_cqring(ctx);
3821 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3822
3823 io_cqring_ev_posted(ctx);
4e88d6e7 3824 req_set_fail_links(req);
5262f567
JA
3825 io_put_req(req);
3826 return HRTIMER_NORESTART;
3827}
3828
47f46768
JA
3829static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
3830{
3831 struct io_kiocb *req;
3832 int ret = -ENOENT;
3833
3834 list_for_each_entry(req, &ctx->timeout_list, list) {
3835 if (user_data == req->user_data) {
3836 list_del_init(&req->list);
3837 ret = 0;
3838 break;
3839 }
3840 }
3841
3842 if (ret == -ENOENT)
3843 return ret;
3844
2d28390a 3845 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
47f46768
JA
3846 if (ret == -1)
3847 return -EALREADY;
3848
4e88d6e7 3849 req_set_fail_links(req);
47f46768
JA
3850 io_cqring_fill_event(req, -ECANCELED);
3851 io_put_req(req);
3852 return 0;
3853}
3854
3529d8c2
JA
3855static int io_timeout_remove_prep(struct io_kiocb *req,
3856 const struct io_uring_sqe *sqe)
b29472ee 3857{
b29472ee
JA
3858 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3859 return -EINVAL;
3860 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
3861 return -EINVAL;
3862
3863 req->timeout.addr = READ_ONCE(sqe->addr);
3864 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
3865 if (req->timeout.flags)
3866 return -EINVAL;
3867
b29472ee
JA
3868 return 0;
3869}
3870
11365043
JA
3871/*
3872 * Remove or update an existing timeout command
3873 */
fc4df999 3874static int io_timeout_remove(struct io_kiocb *req)
11365043
JA
3875{
3876 struct io_ring_ctx *ctx = req->ctx;
47f46768 3877 int ret;
11365043 3878
11365043 3879 spin_lock_irq(&ctx->completion_lock);
b29472ee 3880 ret = io_timeout_cancel(ctx, req->timeout.addr);
11365043 3881
47f46768 3882 io_cqring_fill_event(req, ret);
11365043
JA
3883 io_commit_cqring(ctx);
3884 spin_unlock_irq(&ctx->completion_lock);
5262f567 3885 io_cqring_ev_posted(ctx);
4e88d6e7
JA
3886 if (ret < 0)
3887 req_set_fail_links(req);
ec9c02ad 3888 io_put_req(req);
11365043 3889 return 0;
5262f567
JA
3890}
3891
3529d8c2 3892static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 3893 bool is_timeout_link)
5262f567 3894{
ad8a48ac 3895 struct io_timeout_data *data;
a41525ab 3896 unsigned flags;
5262f567 3897
ad8a48ac 3898 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 3899 return -EINVAL;
ad8a48ac 3900 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 3901 return -EINVAL;
2d28390a
JA
3902 if (sqe->off && is_timeout_link)
3903 return -EINVAL;
a41525ab
JA
3904 flags = READ_ONCE(sqe->timeout_flags);
3905 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 3906 return -EINVAL;
bdf20073 3907
26a61679
JA
3908 req->timeout.count = READ_ONCE(sqe->off);
3909
3529d8c2 3910 if (!req->io && io_alloc_async_ctx(req))
26a61679
JA
3911 return -ENOMEM;
3912
3913 data = &req->io->timeout;
ad8a48ac 3914 data->req = req;
ad8a48ac
JA
3915 req->flags |= REQ_F_TIMEOUT;
3916
3917 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
3918 return -EFAULT;
3919
11365043 3920 if (flags & IORING_TIMEOUT_ABS)
ad8a48ac 3921 data->mode = HRTIMER_MODE_ABS;
11365043 3922 else
ad8a48ac 3923 data->mode = HRTIMER_MODE_REL;
11365043 3924
ad8a48ac
JA
3925 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
3926 return 0;
3927}
3928
fc4df999 3929static int io_timeout(struct io_kiocb *req)
ad8a48ac
JA
3930{
3931 unsigned count;
3932 struct io_ring_ctx *ctx = req->ctx;
3933 struct io_timeout_data *data;
3934 struct list_head *entry;
3935 unsigned span = 0;
ad8a48ac 3936
2d28390a 3937 data = &req->io->timeout;
93bd25bb 3938
5262f567
JA
3939 /*
3940 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
3941 * timeout event to be satisfied. If it isn't set, then this is
3942 * a pure timeout request, sequence isn't used.
5262f567 3943 */
26a61679 3944 count = req->timeout.count;
93bd25bb
JA
3945 if (!count) {
3946 req->flags |= REQ_F_TIMEOUT_NOSEQ;
3947 spin_lock_irq(&ctx->completion_lock);
3948 entry = ctx->timeout_list.prev;
3949 goto add;
3950 }
5262f567
JA
3951
3952 req->sequence = ctx->cached_sq_head + count - 1;
2d28390a 3953 data->seq_offset = count;
5262f567
JA
3954
3955 /*
3956 * Insertion sort, ensuring the first entry in the list is always
3957 * the one we need first.
3958 */
5262f567
JA
3959 spin_lock_irq(&ctx->completion_lock);
3960 list_for_each_prev(entry, &ctx->timeout_list) {
3961 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1a 3962 unsigned nxt_sq_head;
3963 long long tmp, tmp_nxt;
2d28390a 3964 u32 nxt_offset = nxt->io->timeout.seq_offset;
5262f567 3965
93bd25bb
JA
3966 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
3967 continue;
3968
5da0fb1a 3969 /*
3970 * Since cached_sq_head + count - 1 can overflow, use type long
3971 * long to store it.
3972 */
3973 tmp = (long long)ctx->cached_sq_head + count - 1;
cc42e0ac
PB
3974 nxt_sq_head = nxt->sequence - nxt_offset + 1;
3975 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
5da0fb1a 3976
3977 /*
 3978 * cached_sq_head may overflow, but it can never overflow twice
 3979 * while some timeout req is still valid.
3980 */
3981 if (ctx->cached_sq_head < nxt_sq_head)
8b07a65a 3982 tmp += UINT_MAX;
5da0fb1a 3983
a1f58ba4 3984 if (tmp > tmp_nxt)
5262f567 3985 break;
a1f58ba4 3986
3987 /*
 3988 * The sequence of the inserted req, and of the reqs after it, must
 3989 * be adjusted because each timeout req consumes a slot.
3990 */
3991 span++;
3992 nxt->sequence++;
5262f567 3993 }
a1f58ba4 3994 req->sequence -= span;
93bd25bb 3995add:
5262f567 3996 list_add(&req->list, entry);
ad8a48ac
JA
3997 data->timer.function = io_timeout_fn;
3998 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 3999 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
4000 return 0;
4001}
5262f567 4002
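As the comment in io_timeout() notes, sqe->off is the number of completions that must occur before the timeout is satisfied, and with off == 0 it is a pure timer that completes with -ETIME (see io_timeout_fn()). A sketch of arming such a counted timeout and then cancelling it through IORING_OP_TIMEOUT_REMOVE, assuming liburing's io_uring_prep_timeout() and io_uring_prep_timeout_remove() helpers; the 5-second value and the user_data constants are arbitrary:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
        int i;

        if (io_uring_queue_init(4, &ring, 0) < 0)
                return 1;

        /* fire after 5s, unless 8 other completions are posted first */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_timeout(sqe, &ts, 8, 0);
        sqe->user_data = 0xdead;
        io_uring_submit(&ring);

        /* later: cancel it again by user_data (the io_timeout_remove() path) */
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_timeout_remove(sqe, 0xdead, 0);
        sqe->user_data = 0xbeef;
        io_uring_submit(&ring);

        /* on success, the removed timeout completes with -ECANCELED (filled
         * in by io_timeout_cancel()) and the remove op itself with 0 */
        for (i = 0; i < 2; i++) {
                io_uring_wait_cqe(&ring, &cqe);
                printf("user_data=%llu res=%d\n",
                       (unsigned long long) cqe->user_data, cqe->res);
                io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        return 0;
}
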
62755e35
JA
4003static bool io_cancel_cb(struct io_wq_work *work, void *data)
4004{
4005 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4006
4007 return req->user_data == (unsigned long) data;
4008}
4009
e977d6d3 4010static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 4011{
62755e35 4012 enum io_wq_cancel cancel_ret;
62755e35
JA
4013 int ret = 0;
4014
62755e35
JA
4015 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4016 switch (cancel_ret) {
4017 case IO_WQ_CANCEL_OK:
4018 ret = 0;
4019 break;
4020 case IO_WQ_CANCEL_RUNNING:
4021 ret = -EALREADY;
4022 break;
4023 case IO_WQ_CANCEL_NOTFOUND:
4024 ret = -ENOENT;
4025 break;
4026 }
4027
e977d6d3
JA
4028 return ret;
4029}
4030
47f46768
JA
4031static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4032 struct io_kiocb *req, __u64 sqe_addr,
b0dd8a41 4033 struct io_kiocb **nxt, int success_ret)
47f46768
JA
4034{
4035 unsigned long flags;
4036 int ret;
4037
4038 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4039 if (ret != -ENOENT) {
4040 spin_lock_irqsave(&ctx->completion_lock, flags);
4041 goto done;
4042 }
4043
4044 spin_lock_irqsave(&ctx->completion_lock, flags);
4045 ret = io_timeout_cancel(ctx, sqe_addr);
4046 if (ret != -ENOENT)
4047 goto done;
4048 ret = io_poll_cancel(ctx, sqe_addr);
4049done:
b0dd8a41
JA
4050 if (!ret)
4051 ret = success_ret;
47f46768
JA
4052 io_cqring_fill_event(req, ret);
4053 io_commit_cqring(ctx);
4054 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4055 io_cqring_ev_posted(ctx);
4056
4e88d6e7
JA
4057 if (ret < 0)
4058 req_set_fail_links(req);
47f46768
JA
4059 io_put_req_find_next(req, nxt);
4060}
4061
3529d8c2
JA
4062static int io_async_cancel_prep(struct io_kiocb *req,
4063 const struct io_uring_sqe *sqe)
e977d6d3 4064{
fbf23849 4065 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3
JA
4066 return -EINVAL;
4067 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4068 sqe->cancel_flags)
4069 return -EINVAL;
4070
fbf23849
JA
4071 req->cancel.addr = READ_ONCE(sqe->addr);
4072 return 0;
4073}
4074
4075static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
4076{
4077 struct io_ring_ctx *ctx = req->ctx;
fbf23849
JA
4078
4079 io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
5262f567
JA
4080 return 0;
4081}
4082
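io_async_cancel() resolves its target purely by user_data: io_async_cancel_one() maps the io-wq outcome to 0, -EALREADY or -ENOENT, and io_async_find_and_cancel() then falls back to timeout and poll cancellation. A sketch that fills the IORING_OP_ASYNC_CANCEL SQE by hand with exactly the fields io_async_cancel_prep() reads; the helper name and the 0xcc tag are made up for illustration:

#include <liburing.h>
#include <string.h>

/* cancel the in-flight request that was submitted with 'target' as its
 * user_data; returns the cancel operation's own CQE result */
static int cancel_by_user_data(struct io_uring *ring, __u64 target)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int res = 0, done = 0;

        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_ASYNC_CANCEL;
        sqe->fd = -1;
        sqe->addr = target;     /* matched against req->user_data by io_cancel_cb() */
        sqe->user_data = 0xcc;  /* tags the cancel op's own completion */

        io_uring_submit(ring);

        /* the cancelled request's CQE may be posted first; skip past it */
        while (!done) {
                if (io_uring_wait_cqe(ring, &cqe))
                        break;
                done = (cqe->user_data == 0xcc);
                if (done)
                        res = cqe->res; /* 0, -EALREADY or -ENOENT, as mapped above */
                io_uring_cqe_seen(ring, cqe);
        }
        return res;
}
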
05f3fb3c
JA
4083static int io_files_update_prep(struct io_kiocb *req,
4084 const struct io_uring_sqe *sqe)
4085{
4086 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4087 return -EINVAL;
4088
4089 req->files_update.offset = READ_ONCE(sqe->off);
4090 req->files_update.nr_args = READ_ONCE(sqe->len);
4091 if (!req->files_update.nr_args)
4092 return -EINVAL;
4093 req->files_update.arg = READ_ONCE(sqe->addr);
4094 return 0;
4095}
4096
4097static int io_files_update(struct io_kiocb *req, bool force_nonblock)
fbf23849
JA
4098{
4099 struct io_ring_ctx *ctx = req->ctx;
05f3fb3c
JA
4100 struct io_uring_files_update up;
4101 int ret;
fbf23849 4102
f86cd20c 4103 if (force_nonblock)
05f3fb3c 4104 return -EAGAIN;
05f3fb3c
JA
4105
4106 up.offset = req->files_update.offset;
4107 up.fds = req->files_update.arg;
4108
4109 mutex_lock(&ctx->uring_lock);
4110 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4111 mutex_unlock(&ctx->uring_lock);
4112
4113 if (ret < 0)
4114 req_set_fail_links(req);
4115 io_cqring_add_event(req, ret);
4116 io_put_req(req);
5262f567
JA
4117 return 0;
4118}
4119
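io_files_update() implements IORING_OP_FILES_UPDATE: sqe->addr points at an array of fds, sqe->len is how many, and sqe->off is the starting index into the registered file table. A hand-built SQE sketch mirroring what io_files_update_prep() reads; the helper name is invented, a prior io_uring_register_files() call is assumed, and the convention that an fd of -1 clears a slot is the documented register-files-update behaviour rather than something visible in this hunk:

#include <liburing.h>
#include <string.h>

/* replace 'nr' slots of the registered file table starting at 'offset' */
static void queue_files_update(struct io_uring *ring, int *fds,
                               unsigned nr, unsigned offset)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FILES_UPDATE;
        sqe->fd = -1;
        sqe->addr = (unsigned long) fds;  /* ->files_update.arg */
        sqe->len = nr;                    /* ->files_update.nr_args, must be non-zero */
        sqe->off = offset;                /* ->files_update.offset into the fixed table */
        sqe->user_data = 0xf00d;
}
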
3529d8c2
JA
4120static int io_req_defer_prep(struct io_kiocb *req,
4121 const struct io_uring_sqe *sqe)
f67676d1 4122{
e781573e 4123 ssize_t ret = 0;
f67676d1 4124
f86cd20c
JA
4125 if (io_op_defs[req->opcode].file_table) {
4126 ret = io_grab_files(req);
4127 if (unlikely(ret))
4128 return ret;
4129 }
4130
cccf0ee8
JA
4131 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4132
d625c6ee 4133 switch (req->opcode) {
e781573e
JA
4134 case IORING_OP_NOP:
4135 break;
f67676d1
JA
4136 case IORING_OP_READV:
4137 case IORING_OP_READ_FIXED:
3a6820f2 4138 case IORING_OP_READ:
3529d8c2 4139 ret = io_read_prep(req, sqe, true);
f67676d1
JA
4140 break;
4141 case IORING_OP_WRITEV:
4142 case IORING_OP_WRITE_FIXED:
3a6820f2 4143 case IORING_OP_WRITE:
3529d8c2 4144 ret = io_write_prep(req, sqe, true);
f67676d1 4145 break;
0969e783 4146 case IORING_OP_POLL_ADD:
3529d8c2 4147 ret = io_poll_add_prep(req, sqe);
0969e783
JA
4148 break;
4149 case IORING_OP_POLL_REMOVE:
3529d8c2 4150 ret = io_poll_remove_prep(req, sqe);
0969e783 4151 break;
8ed8d3c3 4152 case IORING_OP_FSYNC:
3529d8c2 4153 ret = io_prep_fsync(req, sqe);
8ed8d3c3
JA
4154 break;
4155 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2 4156 ret = io_prep_sfr(req, sqe);
8ed8d3c3 4157 break;
03b1230c 4158 case IORING_OP_SENDMSG:
fddaface 4159 case IORING_OP_SEND:
3529d8c2 4160 ret = io_sendmsg_prep(req, sqe);
03b1230c
JA
4161 break;
4162 case IORING_OP_RECVMSG:
fddaface 4163 case IORING_OP_RECV:
3529d8c2 4164 ret = io_recvmsg_prep(req, sqe);
03b1230c 4165 break;
f499a021 4166 case IORING_OP_CONNECT:
3529d8c2 4167 ret = io_connect_prep(req, sqe);
f499a021 4168 break;
2d28390a 4169 case IORING_OP_TIMEOUT:
3529d8c2 4170 ret = io_timeout_prep(req, sqe, false);
b7bb4f7d 4171 break;
b29472ee 4172 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2 4173 ret = io_timeout_remove_prep(req, sqe);
b29472ee 4174 break;
fbf23849 4175 case IORING_OP_ASYNC_CANCEL:
3529d8c2 4176 ret = io_async_cancel_prep(req, sqe);
fbf23849 4177 break;
2d28390a 4178 case IORING_OP_LINK_TIMEOUT:
3529d8c2 4179 ret = io_timeout_prep(req, sqe, true);
b7bb4f7d 4180 break;
8ed8d3c3 4181 case IORING_OP_ACCEPT:
3529d8c2 4182 ret = io_accept_prep(req, sqe);
8ed8d3c3 4183 break;
d63d1b5e
JA
4184 case IORING_OP_FALLOCATE:
4185 ret = io_fallocate_prep(req, sqe);
4186 break;
15b71abe
JA
4187 case IORING_OP_OPENAT:
4188 ret = io_openat_prep(req, sqe);
4189 break;
b5dba59e
JA
4190 case IORING_OP_CLOSE:
4191 ret = io_close_prep(req, sqe);
4192 break;
05f3fb3c
JA
4193 case IORING_OP_FILES_UPDATE:
4194 ret = io_files_update_prep(req, sqe);
4195 break;
eddc7ef5
JA
4196 case IORING_OP_STATX:
4197 ret = io_statx_prep(req, sqe);
4198 break;
4840e418
JA
4199 case IORING_OP_FADVISE:
4200 ret = io_fadvise_prep(req, sqe);
4201 break;
c1ca757b
JA
4202 case IORING_OP_MADVISE:
4203 ret = io_madvise_prep(req, sqe);
4204 break;
cebdb986
JA
4205 case IORING_OP_OPENAT2:
4206 ret = io_openat2_prep(req, sqe);
4207 break;
3e4827b0
JA
4208 case IORING_OP_EPOLL_CTL:
4209 ret = io_epoll_ctl_prep(req, sqe);
4210 break;
f67676d1 4211 default:
e781573e
JA
4212 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4213 req->opcode);
4214 ret = -EINVAL;
b7bb4f7d 4215 break;
f67676d1
JA
4216 }
4217
b7bb4f7d 4218 return ret;
f67676d1
JA
4219}
4220
3529d8c2 4221static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
de0617e4 4222{
a197f664 4223 struct io_ring_ctx *ctx = req->ctx;
f67676d1 4224 int ret;
de0617e4 4225
9d858b21
BL
4226 /* Still need defer if there is pending req in defer list. */
4227 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
de0617e4
JA
4228 return 0;
4229
3529d8c2 4230 if (!req->io && io_alloc_async_ctx(req))
de0617e4
JA
4231 return -EAGAIN;
4232
3529d8c2 4233 ret = io_req_defer_prep(req, sqe);
b7bb4f7d 4234 if (ret < 0)
2d28390a 4235 return ret;
2d28390a 4236
de0617e4 4237 spin_lock_irq(&ctx->completion_lock);
9d858b21 4238 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
de0617e4 4239 spin_unlock_irq(&ctx->completion_lock);
de0617e4
JA
4240 return 0;
4241 }
4242
915967f6 4243 trace_io_uring_defer(ctx, req, req->user_data);
de0617e4
JA
4244 list_add_tail(&req->list, &ctx->defer_list);
4245 spin_unlock_irq(&ctx->completion_lock);
4246 return -EIOCBQUEUED;
4247}
4248
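io_req_defer() is what gives IOSQE_IO_DRAIN its meaning: a draining request (or anything queued behind one on ctx->defer_list) is prepared, parked on the defer list and returned as -EIOCBQUEUED instead of being issued inline. From userspace the flag is just another bit in sqe->flags; a sketch of the usual write-then-drained-fsync pattern, assuming liburing's prep helpers (the helper name here is invented):

#include <liburing.h>
#include <sys/uio.h>

/* queue a write followed by an fsync that will not start before every
 * previously submitted request on this ring has completed */
static void queue_write_then_drained_fsync(struct io_uring *ring, int fd,
                                           struct iovec *iov)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_writev(sqe, fd, iov, 1, 0);
        sqe->user_data = 1;

        /* IOSQE_IO_DRAIN maps onto REQ_F_IO_DRAIN, so this fsync is held
         * back until everything submitted before it has completed */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, 0);
        sqe->flags |= IOSQE_IO_DRAIN;
        sqe->user_data = 2;

        io_uring_submit(ring);
}
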
99bc4c38
PB
4249static void io_cleanup_req(struct io_kiocb *req)
4250{
4251 struct io_async_ctx *io = req->io;
4252
4253 switch (req->opcode) {
4254 case IORING_OP_READV:
4255 case IORING_OP_READ_FIXED:
4256 case IORING_OP_READ:
4257 case IORING_OP_WRITEV:
4258 case IORING_OP_WRITE_FIXED:
4259 case IORING_OP_WRITE:
4260 if (io->rw.iov != io->rw.fast_iov)
4261 kfree(io->rw.iov);
4262 break;
4263 case IORING_OP_SENDMSG:
4264 case IORING_OP_RECVMSG:
4265 if (io->msg.iov != io->msg.fast_iov)
4266 kfree(io->msg.iov);
4267 break;
8fef80bf
PB
4268 case IORING_OP_OPENAT:
4269 case IORING_OP_OPENAT2:
4270 case IORING_OP_STATX:
4271 putname(req->open.filename);
4272 break;
99bc4c38
PB
4273 }
4274
4275 req->flags &= ~REQ_F_NEED_CLEANUP;
4276}
4277
3529d8c2
JA
4278static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4279 struct io_kiocb **nxt, bool force_nonblock)
2b188cc1 4280{
a197f664 4281 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 4282 int ret;
2b188cc1 4283
d625c6ee 4284 switch (req->opcode) {
2b188cc1 4285 case IORING_OP_NOP:
78e19bbe 4286 ret = io_nop(req);
2b188cc1
JA
4287 break;
4288 case IORING_OP_READV:
edafccee 4289 case IORING_OP_READ_FIXED:
3a6820f2 4290 case IORING_OP_READ:
3529d8c2
JA
4291 if (sqe) {
4292 ret = io_read_prep(req, sqe, force_nonblock);
4293 if (ret < 0)
4294 break;
4295 }
267bc904 4296 ret = io_read(req, nxt, force_nonblock);
edafccee 4297 break;
3529d8c2 4298 case IORING_OP_WRITEV:
edafccee 4299 case IORING_OP_WRITE_FIXED:
3a6820f2 4300 case IORING_OP_WRITE:
3529d8c2
JA
4301 if (sqe) {
4302 ret = io_write_prep(req, sqe, force_nonblock);
4303 if (ret < 0)
4304 break;
4305 }
267bc904 4306 ret = io_write(req, nxt, force_nonblock);
2b188cc1 4307 break;
c992fe29 4308 case IORING_OP_FSYNC:
3529d8c2
JA
4309 if (sqe) {
4310 ret = io_prep_fsync(req, sqe);
4311 if (ret < 0)
4312 break;
4313 }
fc4df999 4314 ret = io_fsync(req, nxt, force_nonblock);
c992fe29 4315 break;
221c5eb2 4316 case IORING_OP_POLL_ADD:
3529d8c2
JA
4317 if (sqe) {
4318 ret = io_poll_add_prep(req, sqe);
4319 if (ret)
4320 break;
4321 }
fc4df999 4322 ret = io_poll_add(req, nxt);
221c5eb2
JA
4323 break;
4324 case IORING_OP_POLL_REMOVE:
3529d8c2
JA
4325 if (sqe) {
4326 ret = io_poll_remove_prep(req, sqe);
4327 if (ret < 0)
4328 break;
4329 }
fc4df999 4330 ret = io_poll_remove(req);
221c5eb2 4331 break;
5d17b4a4 4332 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2
JA
4333 if (sqe) {
4334 ret = io_prep_sfr(req, sqe);
4335 if (ret < 0)
4336 break;
4337 }
fc4df999 4338 ret = io_sync_file_range(req, nxt, force_nonblock);
5d17b4a4 4339 break;
0fa03c62 4340 case IORING_OP_SENDMSG:
fddaface 4341 case IORING_OP_SEND:
3529d8c2
JA
4342 if (sqe) {
4343 ret = io_sendmsg_prep(req, sqe);
4344 if (ret < 0)
4345 break;
4346 }
fddaface
JA
4347 if (req->opcode == IORING_OP_SENDMSG)
4348 ret = io_sendmsg(req, nxt, force_nonblock);
4349 else
4350 ret = io_send(req, nxt, force_nonblock);
0fa03c62 4351 break;
aa1fa28f 4352 case IORING_OP_RECVMSG:
fddaface 4353 case IORING_OP_RECV:
3529d8c2
JA
4354 if (sqe) {
4355 ret = io_recvmsg_prep(req, sqe);
4356 if (ret)
4357 break;
4358 }
fddaface
JA
4359 if (req->opcode == IORING_OP_RECVMSG)
4360 ret = io_recvmsg(req, nxt, force_nonblock);
4361 else
4362 ret = io_recv(req, nxt, force_nonblock);
aa1fa28f 4363 break;
5262f567 4364 case IORING_OP_TIMEOUT:
3529d8c2
JA
4365 if (sqe) {
4366 ret = io_timeout_prep(req, sqe, false);
4367 if (ret)
4368 break;
4369 }
fc4df999 4370 ret = io_timeout(req);
5262f567 4371 break;
11365043 4372 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2
JA
4373 if (sqe) {
4374 ret = io_timeout_remove_prep(req, sqe);
4375 if (ret)
4376 break;
4377 }
fc4df999 4378 ret = io_timeout_remove(req);
11365043 4379 break;
17f2fe35 4380 case IORING_OP_ACCEPT:
3529d8c2
JA
4381 if (sqe) {
4382 ret = io_accept_prep(req, sqe);
4383 if (ret)
4384 break;
4385 }
fc4df999 4386 ret = io_accept(req, nxt, force_nonblock);
17f2fe35 4387 break;
f8e85cf2 4388 case IORING_OP_CONNECT:
3529d8c2
JA
4389 if (sqe) {
4390 ret = io_connect_prep(req, sqe);
4391 if (ret)
4392 break;
4393 }
fc4df999 4394 ret = io_connect(req, nxt, force_nonblock);
f8e85cf2 4395 break;
62755e35 4396 case IORING_OP_ASYNC_CANCEL:
3529d8c2
JA
4397 if (sqe) {
4398 ret = io_async_cancel_prep(req, sqe);
4399 if (ret)
4400 break;
4401 }
fc4df999 4402 ret = io_async_cancel(req, nxt);
62755e35 4403 break;
d63d1b5e
JA
4404 case IORING_OP_FALLOCATE:
4405 if (sqe) {
4406 ret = io_fallocate_prep(req, sqe);
4407 if (ret)
4408 break;
4409 }
4410 ret = io_fallocate(req, nxt, force_nonblock);
4411 break;
15b71abe
JA
4412 case IORING_OP_OPENAT:
4413 if (sqe) {
4414 ret = io_openat_prep(req, sqe);
4415 if (ret)
4416 break;
4417 }
4418 ret = io_openat(req, nxt, force_nonblock);
4419 break;
b5dba59e
JA
4420 case IORING_OP_CLOSE:
4421 if (sqe) {
4422 ret = io_close_prep(req, sqe);
4423 if (ret)
4424 break;
4425 }
4426 ret = io_close(req, nxt, force_nonblock);
4427 break;
05f3fb3c
JA
4428 case IORING_OP_FILES_UPDATE:
4429 if (sqe) {
4430 ret = io_files_update_prep(req, sqe);
4431 if (ret)
4432 break;
4433 }
4434 ret = io_files_update(req, force_nonblock);
4435 break;
eddc7ef5
JA
4436 case IORING_OP_STATX:
4437 if (sqe) {
4438 ret = io_statx_prep(req, sqe);
4439 if (ret)
4440 break;
4441 }
4442 ret = io_statx(req, nxt, force_nonblock);
4443 break;
4840e418
JA
4444 case IORING_OP_FADVISE:
4445 if (sqe) {
4446 ret = io_fadvise_prep(req, sqe);
4447 if (ret)
4448 break;
4449 }
4450 ret = io_fadvise(req, nxt, force_nonblock);
4451 break;
c1ca757b
JA
4452 case IORING_OP_MADVISE:
4453 if (sqe) {
4454 ret = io_madvise_prep(req, sqe);
4455 if (ret)
4456 break;
4457 }
4458 ret = io_madvise(req, nxt, force_nonblock);
4459 break;
cebdb986
JA
4460 case IORING_OP_OPENAT2:
4461 if (sqe) {
4462 ret = io_openat2_prep(req, sqe);
4463 if (ret)
4464 break;
4465 }
4466 ret = io_openat2(req, nxt, force_nonblock);
4467 break;
3e4827b0
JA
4468 case IORING_OP_EPOLL_CTL:
4469 if (sqe) {
4470 ret = io_epoll_ctl_prep(req, sqe);
4471 if (ret)
4472 break;
4473 }
4474 ret = io_epoll_ctl(req, nxt, force_nonblock);
4475 break;
2b188cc1
JA
4476 default:
4477 ret = -EINVAL;
4478 break;
4479 }
4480
def596e9
JA
4481 if (ret)
4482 return ret;
4483
4484 if (ctx->flags & IORING_SETUP_IOPOLL) {
11ba820b
JA
4485 const bool in_async = io_wq_current_is_worker();
4486
9e645e11 4487 if (req->result == -EAGAIN)
def596e9
JA
4488 return -EAGAIN;
4489
11ba820b
JA
4490 /* workqueue context doesn't hold uring_lock, grab it now */
4491 if (in_async)
4492 mutex_lock(&ctx->uring_lock);
4493
def596e9 4494 io_iopoll_req_issued(req);
11ba820b
JA
4495
4496 if (in_async)
4497 mutex_unlock(&ctx->uring_lock);
def596e9
JA
4498 }
4499
4500 return 0;
2b188cc1
JA
4501}
4502
561fb04a 4503static void io_wq_submit_work(struct io_wq_work **workptr)
2b188cc1 4504{
561fb04a 4505 struct io_wq_work *work = *workptr;
2b188cc1 4506 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
561fb04a
JA
4507 struct io_kiocb *nxt = NULL;
4508 int ret = 0;
2b188cc1 4509
0c9d5ccd
JA
4510 /* if NO_CANCEL is set, we must still run the work */
4511 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4512 IO_WQ_WORK_CANCEL) {
561fb04a 4513 ret = -ECANCELED;
0c9d5ccd 4514 }
31b51510 4515
561fb04a 4516 if (!ret) {
cf6fd4bd 4517 req->in_async = true;
561fb04a 4518 do {
3529d8c2 4519 ret = io_issue_sqe(req, NULL, &nxt, false);
561fb04a
JA
4520 /*
4521 * We can get EAGAIN for polled IO even though we're
4522 * forcing a sync submission from here, since we can't
4523 * wait for request slots on the block side.
4524 */
4525 if (ret != -EAGAIN)
4526 break;
4527 cond_resched();
4528 } while (1);
4529 }
31b51510 4530
561fb04a 4531 /* drop submission reference */
ec9c02ad 4532 io_put_req(req);
817869d2 4533
561fb04a 4534 if (ret) {
4e88d6e7 4535 req_set_fail_links(req);
78e19bbe 4536 io_cqring_add_event(req, ret);
817869d2 4537 io_put_req(req);
edafccee 4538 }
2b188cc1 4539
561fb04a 4540 /* if a dependent link is ready, pass it back */
78912934
JA
4541 if (!ret && nxt)
4542 io_wq_assign_next(workptr, nxt);
2b188cc1
JA
4543}
4544
15b71abe 4545static int io_req_needs_file(struct io_kiocb *req, int fd)
9e3aa61a 4546{
d3656344 4547 if (!io_op_defs[req->opcode].needs_file)
9e3aa61a 4548 return 0;
0b5faf6b 4549 if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
d3656344
JA
4550 return 0;
4551 return 1;
09bb8394
JA
4552}
4553
65e19f54
JA
4554static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4555 int index)
4556{
4557 struct fixed_file_table *table;
4558
05f3fb3c
JA
4559 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
 4560 return table->files[index & IORING_FILE_TABLE_MASK];
65e19f54
JA
4561}
4562
3529d8c2
JA
4563static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4564 const struct io_uring_sqe *sqe)
09bb8394 4565{
a197f664 4566 struct io_ring_ctx *ctx = req->ctx;
09bb8394 4567 unsigned flags;
d3656344 4568 int fd;
09bb8394 4569
3529d8c2
JA
4570 flags = READ_ONCE(sqe->flags);
4571 fd = READ_ONCE(sqe->fd);
09bb8394 4572
d3656344
JA
4573 if (!io_req_needs_file(req, fd))
4574 return 0;
09bb8394
JA
4575
4576 if (flags & IOSQE_FIXED_FILE) {
05f3fb3c 4577 if (unlikely(!ctx->file_data ||
09bb8394
JA
4578 (unsigned) fd >= ctx->nr_user_files))
4579 return -EBADF;
b7620121 4580 fd = array_index_nospec(fd, ctx->nr_user_files);
65e19f54
JA
4581 req->file = io_file_from_index(ctx, fd);
4582 if (!req->file)
08a45173 4583 return -EBADF;
09bb8394 4584 req->flags |= REQ_F_FIXED_FILE;
05f3fb3c 4585 percpu_ref_get(&ctx->file_data->refs);
09bb8394 4586 } else {
cf6fd4bd 4587 if (req->needs_fixed_file)
09bb8394 4588 return -EBADF;
c826bd7a 4589 trace_io_uring_file_get(ctx, fd);
09bb8394
JA
4590 req->file = io_file_get(state, fd);
4591 if (unlikely(!req->file))
4592 return -EBADF;
4593 }
4594
4595 return 0;
4596}
4597
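io_req_set_file() shows the two lookup paths: with IOSQE_FIXED_FILE, sqe->fd is an index into the table built by file registration (pinned via ctx->file_data->refs), otherwise it is an ordinary descriptor resolved with io_file_get(). A sketch of registering one file and reading through slot 0, assuming liburing's io_uring_register_files() and io_uring_prep_readv() helpers; the path and buffer size are arbitrary and error handling is elided:

#include <liburing.h>
#include <fcntl.h>
#include <sys/uio.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        char buf[4096];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        int fds[1];

        if (io_uring_queue_init(4, &ring, 0) < 0)
                return 1;

        fds[0] = open("/etc/hostname", O_RDONLY);
        io_uring_register_files(&ring, fds, 1);

        sqe = io_uring_get_sqe(&ring);
        /* 'fd' 0 is the slot in the registered table, not a descriptor */
        io_uring_prep_readv(sqe, 0, &iov, 1, 0);
        sqe->flags |= IOSQE_FIXED_FILE;
        io_uring_submit(&ring);

        io_uring_wait_cqe(&ring, &cqe);
        printf("read %d bytes via fixed file slot 0\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return 0;
}
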
a197f664 4598static int io_grab_files(struct io_kiocb *req)
fcb323cc
JA
4599{
4600 int ret = -EBADF;
a197f664 4601 struct io_ring_ctx *ctx = req->ctx;
fcb323cc 4602
f86cd20c
JA
4603 if (req->work.files)
4604 return 0;
b14cca0c 4605 if (!ctx->ring_file)
b5dba59e
JA
4606 return -EBADF;
4607
fcb323cc
JA
4608 rcu_read_lock();
4609 spin_lock_irq(&ctx->inflight_lock);
4610 /*
4611 * We use the f_ops->flush() handler to ensure that we can flush
4612 * out work accessing these files if the fd is closed. Check if
4613 * the fd has changed since we started down this path, and disallow
4614 * this operation if it has.
4615 */
b14cca0c 4616 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
fcb323cc
JA
4617 list_add(&req->inflight_entry, &ctx->inflight_list);
4618 req->flags |= REQ_F_INFLIGHT;
4619 req->work.files = current->files;
4620 ret = 0;
4621 }
4622 spin_unlock_irq(&ctx->inflight_lock);
4623 rcu_read_unlock();
4624
4625 return ret;
4626}
4627
2665abfd 4628static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 4629{
ad8a48ac
JA
4630 struct io_timeout_data *data = container_of(timer,
4631 struct io_timeout_data, timer);
4632 struct io_kiocb *req = data->req;
2665abfd
JA
4633 struct io_ring_ctx *ctx = req->ctx;
4634 struct io_kiocb *prev = NULL;
4635 unsigned long flags;
2665abfd
JA
4636
4637 spin_lock_irqsave(&ctx->completion_lock, flags);
4638
4639 /*
 4640 * We don't expect the list to be empty; that will only happen if we
4641 * race with the completion of the linked work.
4642 */
4493233e
PB
4643 if (!list_empty(&req->link_list)) {
4644 prev = list_entry(req->link_list.prev, struct io_kiocb,
4645 link_list);
5d960724 4646 if (refcount_inc_not_zero(&prev->refs)) {
4493233e 4647 list_del_init(&req->link_list);
5d960724
JA
4648 prev->flags &= ~REQ_F_LINK_TIMEOUT;
4649 } else
76a46e06 4650 prev = NULL;
2665abfd
JA
4651 }
4652
4653 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4654
4655 if (prev) {
4e88d6e7 4656 req_set_fail_links(prev);
b0dd8a41
JA
4657 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
4658 -ETIME);
76a46e06 4659 io_put_req(prev);
47f46768
JA
4660 } else {
4661 io_cqring_add_event(req, -ETIME);
4662 io_put_req(req);
2665abfd 4663 }
2665abfd
JA
4664 return HRTIMER_NORESTART;
4665}
4666
ad8a48ac 4667static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 4668{
76a46e06 4669 struct io_ring_ctx *ctx = req->ctx;
2665abfd 4670
76a46e06
JA
4671 /*
4672 * If the list is now empty, then our linked request finished before
4673 * we got a chance to setup the timer
4674 */
4675 spin_lock_irq(&ctx->completion_lock);
4493233e 4676 if (!list_empty(&req->link_list)) {
2d28390a 4677 struct io_timeout_data *data = &req->io->timeout;
94ae5e77 4678
ad8a48ac
JA
4679 data->timer.function = io_link_timeout_fn;
4680 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
4681 data->mode);
2665abfd 4682 }
76a46e06 4683 spin_unlock_irq(&ctx->completion_lock);
2665abfd 4684
2665abfd 4685 /* drop submission reference */
76a46e06
JA
4686 io_put_req(req);
4687}
2665abfd 4688
ad8a48ac 4689static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd
JA
4690{
4691 struct io_kiocb *nxt;
4692
4693 if (!(req->flags & REQ_F_LINK))
4694 return NULL;
4695
4493233e
PB
4696 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
4697 link_list);
d625c6ee 4698 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 4699 return NULL;
2665abfd 4700
76a46e06 4701 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 4702 return nxt;
2665abfd
JA
4703}
4704
3529d8c2 4705static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 4706{
4a0a7a18 4707 struct io_kiocb *linked_timeout;
f9bd67f6 4708 struct io_kiocb *nxt = NULL;
e0c5c576 4709 int ret;
2b188cc1 4710
4a0a7a18
JA
4711again:
4712 linked_timeout = io_prep_linked_timeout(req);
4713
3529d8c2 4714 ret = io_issue_sqe(req, sqe, &nxt, true);
491381ce
JA
4715
4716 /*
4717 * We async punt it if the file wasn't marked NOWAIT, or if the file
4718 * doesn't support non-blocking read/write attempts
4719 */
4720 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
4721 (req->flags & REQ_F_MUST_PUNT))) {
86a761f8 4722punt:
f86cd20c 4723 if (io_op_defs[req->opcode].file_table) {
bbad27b2
PB
4724 ret = io_grab_files(req);
4725 if (ret)
4726 goto err;
2b188cc1 4727 }
bbad27b2
PB
4728
4729 /*
4730 * Queued up for async execution, worker will release
4731 * submit reference when the iocb is actually submitted.
4732 */
4733 io_queue_async_work(req);
4a0a7a18 4734 goto done_req;
2b188cc1 4735 }
e65ef56d 4736
fcb323cc 4737err:
76a46e06 4738 /* drop submission reference */
ec9c02ad 4739 io_put_req(req);
e65ef56d 4740
f9bd67f6 4741 if (linked_timeout) {
76a46e06 4742 if (!ret)
f9bd67f6 4743 io_queue_linked_timeout(linked_timeout);
76a46e06 4744 else
f9bd67f6 4745 io_put_req(linked_timeout);
76a46e06
JA
4746 }
4747
e65ef56d 4748 /* and drop final reference, if we failed */
9e645e11 4749 if (ret) {
78e19bbe 4750 io_cqring_add_event(req, ret);
4e88d6e7 4751 req_set_fail_links(req);
e65ef56d 4752 io_put_req(req);
9e645e11 4753 }
4a0a7a18
JA
4754done_req:
4755 if (nxt) {
4756 req = nxt;
4757 nxt = NULL;
86a761f8
PB
4758
4759 if (req->flags & REQ_F_FORCE_ASYNC)
4760 goto punt;
4a0a7a18
JA
4761 goto again;
4762 }
2b188cc1
JA
4763}
4764
3529d8c2 4765static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4fe2c963
JL
4766{
4767 int ret;
4768
3529d8c2 4769 ret = io_req_defer(req, sqe);
4fe2c963
JL
4770 if (ret) {
4771 if (ret != -EIOCBQUEUED) {
1118591a 4772fail_req:
78e19bbe 4773 io_cqring_add_event(req, ret);
4e88d6e7 4774 req_set_fail_links(req);
78e19bbe 4775 io_double_put_req(req);
4fe2c963 4776 }
2550878f 4777 } else if (req->flags & REQ_F_FORCE_ASYNC) {
1118591a
PB
4778 ret = io_req_defer_prep(req, sqe);
4779 if (unlikely(ret < 0))
4780 goto fail_req;
ce35a47a
JA
4781 /*
 4782 * Never try inline submit if IOSQE_ASYNC is set; go straight
4783 * to async execution.
4784 */
4785 req->work.flags |= IO_WQ_WORK_CONCURRENT;
4786 io_queue_async_work(req);
4787 } else {
3529d8c2 4788 __io_queue_sqe(req, sqe);
ce35a47a 4789 }
4fe2c963
JL
4790}
4791
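The REQ_F_FORCE_ASYNC branch above corresponds to the IOSQE_ASYNC sqe flag: rather than attempting a non-blocking issue inline, io_queue_sqe() preps the request and queues it straight to io-wq. A minimal sketch of setting it from userspace; the helper name is invented and liburing's readv prep helper is assumed:

#include <liburing.h>
#include <sys/uio.h>

static void queue_forced_async_read(struct io_uring *ring, int fd,
                                    struct iovec *iov)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        io_uring_prep_readv(sqe, fd, iov, 1, 0);
        /* IOSQE_ASYNC maps onto REQ_F_FORCE_ASYNC: skip the inline
         * non-blocking attempt and go straight to io_queue_async_work() */
        sqe->flags |= IOSQE_ASYNC;
        sqe->user_data = 7;
}
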
1b4a51b6 4792static inline void io_queue_link_head(struct io_kiocb *req)
4fe2c963 4793{
94ae5e77 4794 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1b4a51b6
PB
4795 io_cqring_add_event(req, -ECANCELED);
4796 io_double_put_req(req);
4797 } else
3529d8c2 4798 io_queue_sqe(req, NULL);
4fe2c963
JL
4799}
4800
4e88d6e7 4801#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
ce35a47a 4802 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
9e645e11 4803
3529d8c2
JA
4804static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4805 struct io_submit_state *state, struct io_kiocb **link)
9e645e11 4806{
75c6a039 4807 const struct cred *old_creds = NULL;
a197f664 4808 struct io_ring_ctx *ctx = req->ctx;
32fe525b 4809 unsigned int sqe_flags;
75c6a039 4810 int ret, id;
9e645e11 4811
32fe525b 4812 sqe_flags = READ_ONCE(sqe->flags);
9e645e11
JA
4813
4814 /* enforce forwards compatibility on users */
32fe525b 4815 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
9e645e11 4816 ret = -EINVAL;
196be95c 4817 goto err_req;
9e645e11
JA
4818 }
4819
75c6a039
JA
4820 id = READ_ONCE(sqe->personality);
4821 if (id) {
4822 const struct cred *personality_creds;
4823
4824 personality_creds = idr_find(&ctx->personality_idr, id);
4825 if (unlikely(!personality_creds)) {
4826 ret = -EINVAL;
4827 goto err_req;
4828 }
4829 old_creds = override_creds(personality_creds);
4830 }
4831
6b47ee6e
PB
4832 /* same numerical values with corresponding REQ_F_*, safe to copy */
4833 req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
4834 IOSQE_ASYNC);
9e645e11 4835
3529d8c2 4836 ret = io_req_set_file(state, req, sqe);
9e645e11
JA
4837 if (unlikely(ret)) {
4838err_req:
78e19bbe
JA
4839 io_cqring_add_event(req, ret);
4840 io_double_put_req(req);
75c6a039
JA
4841 if (old_creds)
4842 revert_creds(old_creds);
2e6e1fde 4843 return false;
9e645e11
JA
4844 }
4845
9e645e11
JA
4846 /*
4847 * If we already have a head request, queue this one for async
4848 * submittal once the head completes. If we don't have a head but
4849 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
4850 * submitted sync once the chain is complete. If none of those
4851 * conditions are true (normal request), then just queue it.
4852 */
4853 if (*link) {
9d76377f 4854 struct io_kiocb *head = *link;
4e88d6e7 4855
8cdf2193
PB
4856 /*
 4857 * Because a link executes sequentially, draining both sides
 4858 * of the link also fulfils IOSQE_IO_DRAIN semantics for all
 4859 * requests in the link. So it drains the head and the
 4860 * request following the link. The latter is done via the
 4861 * drain_next flag to persist the effect across calls.
4862 */
711be031
PB
4863 if (sqe_flags & IOSQE_IO_DRAIN) {
4864 head->flags |= REQ_F_IO_DRAIN;
4865 ctx->drain_next = 1;
4866 }
b7bb4f7d 4867 if (io_alloc_async_ctx(req)) {
9e645e11
JA
4868 ret = -EAGAIN;
4869 goto err_req;
4870 }
4871
3529d8c2 4872 ret = io_req_defer_prep(req, sqe);
2d28390a 4873 if (ret) {
4e88d6e7 4874 /* fail even hard links since we don't submit */
9d76377f 4875 head->flags |= REQ_F_FAIL_LINK;
f67676d1 4876 goto err_req;
2d28390a 4877 }
9d76377f
PB
4878 trace_io_uring_link(ctx, req, head);
4879 list_add_tail(&req->link_list, &head->link_list);
32fe525b
PB
4880
4881 /* last request of a link, enqueue the link */
4882 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
4883 io_queue_link_head(head);
4884 *link = NULL;
4885 }
9e645e11 4886 } else {
711be031
PB
4887 if (unlikely(ctx->drain_next)) {
4888 req->flags |= REQ_F_IO_DRAIN;
4889 req->ctx->drain_next = 0;
4890 }
4891 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
4892 req->flags |= REQ_F_LINK;
711be031
PB
4893 INIT_LIST_HEAD(&req->link_list);
4894 ret = io_req_defer_prep(req, sqe);
4895 if (ret)
4896 req->flags |= REQ_F_FAIL_LINK;
4897 *link = req;
4898 } else {
4899 io_queue_sqe(req, sqe);
4900 }
9e645e11 4901 }
2e6e1fde 4902
75c6a039
JA
4903 if (old_creds)
4904 revert_creds(old_creds);
2e6e1fde 4905 return true;
9e645e11
JA
4906}
4907
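io_submit_sqe() builds link chains: while IOSQE_IO_LINK (or IOSQE_IO_HARDLINK) is set, each request is prepped and appended to the head's link_list, and only the final request of the chain triggers io_queue_link_head(). IORING_OP_LINK_TIMEOUT, prepped with is_timeout_link above, rides on the same mechanism to bound the request it follows. A sketch of a read with a deadline, assuming liburing's io_uring_prep_link_timeout() helper; the helper function name and user_data values are invented:

#include <liburing.h>
#include <sys/uio.h>

static void queue_read_with_deadline(struct io_uring *ring, int fd,
                                     struct iovec *iov,
                                     struct __kernel_timespec *ts)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_readv(sqe, fd, iov, 1, 0);
        sqe->flags |= IOSQE_IO_LINK;    /* the next SQE is linked to this one */
        sqe->user_data = 1;

        /* fires if the read has not completed within *ts; io_link_timeout_fn()
         * then cancels the linked read and completes this SQE with -ETIME */
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_link_timeout(sqe, ts, 0);
        sqe->user_data = 2;

        io_uring_submit(ring);
}
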
9a56a232
JA
4908/*
4909 * Batched submission is done, ensure local IO is flushed out.
4910 */
4911static void io_submit_state_end(struct io_submit_state *state)
4912{
4913 blk_finish_plug(&state->plug);
3d6770fb 4914 io_file_put(state);
2579f913 4915 if (state->free_reqs)
6c8a3134 4916 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
9a56a232
JA
4917}
4918
4919/*
4920 * Start submission side cache.
4921 */
4922static void io_submit_state_start(struct io_submit_state *state,
22efde59 4923 unsigned int max_ios)
9a56a232
JA
4924{
4925 blk_start_plug(&state->plug);
2579f913 4926 state->free_reqs = 0;
9a56a232
JA
4927 state->file = NULL;
4928 state->ios_left = max_ios;
4929}
4930
2b188cc1
JA
4931static void io_commit_sqring(struct io_ring_ctx *ctx)
4932{
75b28aff 4933 struct io_rings *rings = ctx->rings;
2b188cc1 4934
caf582c6
PB
4935 /*
4936 * Ensure any loads from the SQEs are done at this point,
4937 * since once we write the new head, the application could
4938 * write new data to them.
4939 */
4940 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
4941}
4942
2b188cc1 4943/*
3529d8c2 4944 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
4945 * that is mapped by userspace. This means that care needs to be taken to
4946 * ensure that reads are stable, as we cannot rely on userspace always
4947 * being a good citizen. If members of the sqe are validated and then later
4948 * used, it's important that those reads are done through READ_ONCE() to
4949 * prevent a re-load down the line.
4950 */
3529d8c2
JA
4951static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
4952 const struct io_uring_sqe **sqe_ptr)
2b188cc1 4953{
75b28aff 4954 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
4955 unsigned head;
4956
4957 /*
4958 * The cached sq head (or cq tail) serves two purposes:
4959 *
 4960 * 1) allows us to batch the cost of the user visible
 4961 * head updates.
4962 * 2) allows the kernel side to track the head on its own, even
4963 * though the application is the one updating it.
4964 */
ee7d46d9 4965 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
9835d6fa 4966 if (likely(head < ctx->sq_entries)) {
cf6fd4bd
PB
4967 /*
 4968 * All IO needs to record the previous position so that, for LINK vs
 4969 * DRAIN, it can be used to mark the position of the first IO in the
 4970 * link list.
4971 */
4972 req->sequence = ctx->cached_sq_head;
3529d8c2
JA
4973 *sqe_ptr = &ctx->sq_sqes[head];
4974 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
4975 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
2b188cc1
JA
4976 ctx->cached_sq_head++;
4977 return true;
4978 }
4979
4980 /* drop invalid entries */
4981 ctx->cached_sq_head++;
498ccd9e 4982 ctx->cached_sq_dropped++;
ee7d46d9 4983 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1
JA
4984 return false;
4985}
4986
fb5ccc98 4987static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
ae9428ca
PB
4988 struct file *ring_file, int ring_fd,
4989 struct mm_struct **mm, bool async)
6c271ce2
JA
4990{
4991 struct io_submit_state state, *statep = NULL;
9e645e11 4992 struct io_kiocb *link = NULL;
9e645e11 4993 int i, submitted = 0;
95a1b3ff 4994 bool mm_fault = false;
6c271ce2 4995
c4a2ed72 4996 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8
JA
4997 if (test_bit(0, &ctx->sq_check_overflow)) {
4998 if (!list_empty(&ctx->cq_overflow_list) &&
4999 !io_cqring_overflow_flush(ctx, false))
5000 return -EBUSY;
5001 }
6c271ce2 5002
ee7d46d9
PB
5003 /* make sure SQ entry isn't read before tail */
5004 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 5005
2b85edfc
PB
5006 if (!percpu_ref_tryget_many(&ctx->refs, nr))
5007 return -EAGAIN;
6c271ce2
JA
5008
5009 if (nr > IO_PLUG_THRESHOLD) {
22efde59 5010 io_submit_state_start(&state, nr);
6c271ce2
JA
5011 statep = &state;
5012 }
5013
b14cca0c
PB
5014 ctx->ring_fd = ring_fd;
5015 ctx->ring_file = ring_file;
5016
6c271ce2 5017 for (i = 0; i < nr; i++) {
3529d8c2 5018 const struct io_uring_sqe *sqe;
196be95c 5019 struct io_kiocb *req;
1cb1edb2 5020 int err;
fb5ccc98 5021
196be95c
PB
5022 req = io_get_req(ctx, statep);
5023 if (unlikely(!req)) {
5024 if (!submitted)
5025 submitted = -EAGAIN;
fb5ccc98 5026 break;
196be95c 5027 }
3529d8c2 5028 if (!io_get_sqring(ctx, req, &sqe)) {
2b85edfc 5029 __io_req_do_free(req);
196be95c
PB
5030 break;
5031 }
fb5ccc98 5032
d3656344
JA
5033 /* will complete beyond this point, count as submitted */
5034 submitted++;
5035
5036 if (unlikely(req->opcode >= IORING_OP_LAST)) {
1cb1edb2
PB
5037 err = -EINVAL;
5038fail_req:
5039 io_cqring_add_event(req, err);
d3656344 5040 io_double_put_req(req);
196be95c
PB
5041 break;
5042 }
fb5ccc98 5043
d3656344 5044 if (io_op_defs[req->opcode].needs_mm && !*mm) {
95a1b3ff 5045 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
1cb1edb2
PB
5046 if (unlikely(mm_fault)) {
5047 err = -EFAULT;
5048 goto fail_req;
95a1b3ff 5049 }
1cb1edb2
PB
5050 use_mm(ctx->sqo_mm);
5051 *mm = ctx->sqo_mm;
9e645e11 5052 }
9e645e11 5053
cf6fd4bd
PB
5054 req->in_async = async;
5055 req->needs_fixed_file = async;
354420f7
JA
5056 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5057 true, async);
3529d8c2 5058 if (!io_submit_sqe(req, sqe, statep, &link))
2e6e1fde 5059 break;
6c271ce2
JA
5060 }
5061
9466f437
PB
5062 if (unlikely(submitted != nr)) {
5063 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5064
5065 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5066 }
9e645e11 5067 if (link)
1b4a51b6 5068 io_queue_link_head(link);
6c271ce2
JA
5069 if (statep)
5070 io_submit_state_end(&state);
5071
ae9428ca
PB
5072 /* Commit SQ ring head once we've consumed and submitted all SQEs */
5073 io_commit_sqring(ctx);
5074
6c271ce2
JA
5075 return submitted;
5076}
5077
5078static int io_sq_thread(void *data)
5079{
6c271ce2
JA
5080 struct io_ring_ctx *ctx = data;
5081 struct mm_struct *cur_mm = NULL;
181e448d 5082 const struct cred *old_cred;
6c271ce2
JA
5083 mm_segment_t old_fs;
5084 DEFINE_WAIT(wait);
5085 unsigned inflight;
5086 unsigned long timeout;
c1edbf5f 5087 int ret;
6c271ce2 5088
206aefde 5089 complete(&ctx->completions[1]);
a4c0b3de 5090
6c271ce2
JA
5091 old_fs = get_fs();
5092 set_fs(USER_DS);
181e448d 5093 old_cred = override_creds(ctx->creds);
6c271ce2 5094
c1edbf5f 5095 ret = timeout = inflight = 0;
2bbcd6d3 5096 while (!kthread_should_park()) {
fb5ccc98 5097 unsigned int to_submit;
6c271ce2
JA
5098
5099 if (inflight) {
5100 unsigned nr_events = 0;
5101
5102 if (ctx->flags & IORING_SETUP_IOPOLL) {
2b2ed975
JA
5103 /*
5104 * inflight is the count of the maximum possible
5105 * entries we submitted, but it can be smaller
5106 * if we dropped some of them. If we don't have
5107 * poll entries available, then we know that we
5108 * have nothing left to poll for. Reset the
5109 * inflight count to zero in that case.
5110 */
5111 mutex_lock(&ctx->uring_lock);
5112 if (!list_empty(&ctx->poll_list))
5113 __io_iopoll_check(ctx, &nr_events, 0);
5114 else
5115 inflight = 0;
5116 mutex_unlock(&ctx->uring_lock);
6c271ce2
JA
5117 } else {
5118 /*
5119 * Normal IO, just pretend everything completed.
5120 * We don't have to poll completions for that.
5121 */
5122 nr_events = inflight;
5123 }
5124
5125 inflight -= nr_events;
5126 if (!inflight)
5127 timeout = jiffies + ctx->sq_thread_idle;
5128 }
5129
fb5ccc98 5130 to_submit = io_sqring_entries(ctx);
c1edbf5f
JA
5131
5132 /*
5133 * If submit got -EBUSY, flag us as needing the application
5134 * to enter the kernel to reap and flush events.
5135 */
5136 if (!to_submit || ret == -EBUSY) {
6c271ce2
JA
5137 /*
5138 * We're polling. If we're within the defined idle
5139 * period, then let us spin without work before going
c1edbf5f
JA
5140 * to sleep. The exception is if we got EBUSY doing
 5141 * more IO; in that case we should wait for the application to
5142 * reap events and wake us up.
6c271ce2 5143 */
c1edbf5f 5144 if (inflight ||
df069d80
JA
5145 (!time_after(jiffies, timeout) && ret != -EBUSY &&
5146 !percpu_ref_is_dying(&ctx->refs))) {
9831a90c 5147 cond_resched();
6c271ce2
JA
5148 continue;
5149 }
5150
5151 /*
5152 * Drop cur_mm before scheduling, we can't hold it for
5153 * long periods (or over schedule()). Do this before
5154 * adding ourselves to the waitqueue, as the unuse/drop
5155 * may sleep.
5156 */
5157 if (cur_mm) {
5158 unuse_mm(cur_mm);
5159 mmput(cur_mm);
5160 cur_mm = NULL;
5161 }
5162
5163 prepare_to_wait(&ctx->sqo_wait, &wait,
5164 TASK_INTERRUPTIBLE);
5165
5166 /* Tell userspace we may need a wakeup call */
75b28aff 5167 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69
SB
5168 /* make sure to read SQ tail after writing flags */
5169 smp_mb();
6c271ce2 5170
fb5ccc98 5171 to_submit = io_sqring_entries(ctx);
c1edbf5f 5172 if (!to_submit || ret == -EBUSY) {
2bbcd6d3 5173 if (kthread_should_park()) {
6c271ce2
JA
5174 finish_wait(&ctx->sqo_wait, &wait);
5175 break;
5176 }
5177 if (signal_pending(current))
5178 flush_signals(current);
5179 schedule();
5180 finish_wait(&ctx->sqo_wait, &wait);
5181
75b28aff 5182 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5183 continue;
5184 }
5185 finish_wait(&ctx->sqo_wait, &wait);
5186
75b28aff 5187 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5188 }
5189
8a4955ff 5190 mutex_lock(&ctx->uring_lock);
1d7bb1d5 5191 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
8a4955ff 5192 mutex_unlock(&ctx->uring_lock);
1d7bb1d5
JA
5193 if (ret > 0)
5194 inflight += ret;
6c271ce2
JA
5195 }
5196
5197 set_fs(old_fs);
5198 if (cur_mm) {
5199 unuse_mm(cur_mm);
5200 mmput(cur_mm);
5201 }
181e448d 5202 revert_creds(old_cred);
06058632 5203
2bbcd6d3 5204 kthread_parkme();
06058632 5205
6c271ce2
JA
5206 return 0;
5207}
5208
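With IORING_SETUP_SQPOLL, io_sq_thread() above consumes the SQ ring on the application's behalf and only raises IORING_SQ_NEED_WAKEUP in the ring flags once it is idle and about to sleep. A raw-syscall sketch of the userspace half of that handshake; the mapped sq_flags pointer, the helper name and the reliance on __NR_io_uring_enter being defined by the system headers are assumptions of this sketch (liburing's io_uring_submit() normally hides this check):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 'sq_flags' points at the SQ ring flags word mapped from the ring fd;
 * only enter the kernel when the SQ poll thread has flagged that it is
 * going to sleep, otherwise submission needs no syscall at all */
static void sqpoll_kick_if_needed(int ring_fd, unsigned to_submit,
                                  const volatile unsigned *sq_flags)
{
        if (*sq_flags & IORING_SQ_NEED_WAKEUP)
                syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
                        IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
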
bda52162
JA
5209struct io_wait_queue {
5210 struct wait_queue_entry wq;
5211 struct io_ring_ctx *ctx;
5212 unsigned to_wait;
5213 unsigned nr_timeouts;
5214};
5215
1d7bb1d5 5216static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
bda52162
JA
5217{
5218 struct io_ring_ctx *ctx = iowq->ctx;
5219
5220 /*
d195a66e 5221 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
5222 * started waiting. For timeouts, we always want to return to userspace,
5223 * regardless of event count.
5224 */
1d7bb1d5 5225 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
bda52162
JA
5226 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5227}
5228
5229static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5230 int wake_flags, void *key)
5231{
5232 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5233 wq);
5234
1d7bb1d5
JA
5235 /* use noflush == true, as we can't safely rely on locking context */
5236 if (!io_should_wake(iowq, true))
bda52162
JA
5237 return -1;
5238
5239 return autoremove_wake_function(curr, mode, wake_flags, key);
5240}
5241
2b188cc1
JA
5242/*
5243 * Wait until events become available, if we don't already have some. The
5244 * application must reap them itself, as they reside on the shared cq ring.
5245 */
5246static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
5247 const sigset_t __user *sig, size_t sigsz)
5248{
bda52162
JA
5249 struct io_wait_queue iowq = {
5250 .wq = {
5251 .private = current,
5252 .func = io_wake_function,
5253 .entry = LIST_HEAD_INIT(iowq.wq.entry),
5254 },
5255 .ctx = ctx,
5256 .to_wait = min_events,
5257 };
75b28aff 5258 struct io_rings *rings = ctx->rings;
e9ffa5c2 5259 int ret = 0;
2b188cc1 5260
1d7bb1d5 5261 if (io_cqring_events(ctx, false) >= min_events)
2b188cc1
JA
5262 return 0;
5263
5264 if (sig) {
9e75ad5d
AB
5265#ifdef CONFIG_COMPAT
5266 if (in_compat_syscall())
5267 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 5268 sigsz);
9e75ad5d
AB
5269 else
5270#endif
b772434b 5271 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 5272
2b188cc1
JA
5273 if (ret)
5274 return ret;
5275 }
5276
bda52162 5277 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 5278 trace_io_uring_cqring_wait(ctx, min_events);
bda52162
JA
5279 do {
5280 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
5281 TASK_INTERRUPTIBLE);
1d7bb1d5 5282 if (io_should_wake(&iowq, false))
bda52162
JA
5283 break;
5284 schedule();
5285 if (signal_pending(current)) {
e9ffa5c2 5286 ret = -EINTR;
bda52162
JA
5287 break;
5288 }
5289 } while (1);
5290 finish_wait(&ctx->wait, &iowq.wq);
5291
e9ffa5c2 5292 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 5293
75b28aff 5294 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
5295}
5296
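io_cqring_wait() is the backend of io_uring_enter() with IORING_ENTER_GETEVENTS and a non-zero min_complete: the task parks on ctx->wait and io_should_wake() re-checks the CQ count and the timeout counter on each wakeup. A small sketch of driving it through liburing's io_uring_submit_and_wait(), whose wait_nr maps onto that min_events argument; the helper name is invented and error handling is elided:

#include <liburing.h>
#include <stdio.h>

static void drain_n_completions(struct io_uring *ring, unsigned n)
{
        struct io_uring_cqe *cqe;
        unsigned i;

        /* submit pending SQEs, then block until at least n CQEs exist */
        io_uring_submit_and_wait(ring, n);

        for (i = 0; i < n; i++) {
                if (io_uring_wait_cqe(ring, &cqe))
                        break;
                printf("user_data=%llu res=%d\n",
                       (unsigned long long) cqe->user_data, cqe->res);
                io_uring_cqe_seen(ring, cqe);
        }
}
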
6b06314c
JA
5297static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5298{
5299#if defined(CONFIG_UNIX)
5300 if (ctx->ring_sock) {
5301 struct sock *sock = ctx->ring_sock->sk;
5302 struct sk_buff *skb;
5303
5304 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
5305 kfree_skb(skb);
5306 }
5307#else
5308 int i;
5309
65e19f54
JA
5310 for (i = 0; i < ctx->nr_user_files; i++) {
5311 struct file *file;
5312
5313 file = io_file_from_index(ctx, i);
5314 if (file)
5315 fput(file);
5316 }
6b06314c
JA
5317#endif
5318}
5319
05f3fb3c
JA
5320static void io_file_ref_kill(struct percpu_ref *ref)
5321{
5322 struct fixed_file_data *data;
5323
5324 data = container_of(ref, struct fixed_file_data, refs);
5325 complete(&data->done);
5326}
5327
6b06314c
JA
5328static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
5329{
05f3fb3c 5330 struct fixed_file_data *data = ctx->file_data;
65e19f54
JA
5331 unsigned nr_tables, i;
5332
05f3fb3c 5333 if (!data)
6b06314c
JA
5334 return -ENXIO;
5335
05f3fb3c 5336 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
e46a7950 5337 flush_work(&data->ref_work);
2faf852d
JA
5338 wait_for_completion(&data->done);
5339 io_ring_file_ref_flush(data);
05f3fb3c
JA
5340 percpu_ref_exit(&data->refs);
5341
6b06314c 5342 __io_sqe_files_unregister(ctx);
65e19f54
JA
5343 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
5344 for (i = 0; i < nr_tables; i++)
05f3fb3c
JA
5345 kfree(data->table[i].files);
5346 kfree(data->table);
5347 kfree(data);
5348 ctx->file_data = NULL;
6b06314c
JA
5349 ctx->nr_user_files = 0;
5350 return 0;
5351}
5352
6c271ce2
JA
5353static void io_sq_thread_stop(struct io_ring_ctx *ctx)
5354{
5355 if (ctx->sqo_thread) {
206aefde 5356 wait_for_completion(&ctx->completions[1]);
2bbcd6d3
RP
5357 /*
 5358 * The park is a bit of a work-around; without it we get
 5359 * warning spew on shutdown with SQPOLL set and affinity
5360 * set to a single CPU.
5361 */
06058632 5362 kthread_park(ctx->sqo_thread);
6c271ce2
JA
5363 kthread_stop(ctx->sqo_thread);
5364 ctx->sqo_thread = NULL;
5365 }
5366}
5367
6b06314c
JA
5368static void io_finish_async(struct io_ring_ctx *ctx)
5369{
6c271ce2
JA
5370 io_sq_thread_stop(ctx);
5371
561fb04a
JA
5372 if (ctx->io_wq) {
5373 io_wq_destroy(ctx->io_wq);
5374 ctx->io_wq = NULL;
6b06314c
JA
5375 }
5376}
5377
5378#if defined(CONFIG_UNIX)
6b06314c
JA
5379/*
5380 * Ensure the UNIX gc is aware of our file set, so we are certain that
5381 * the io_uring can be safely unregistered on process exit, even if we have
5382 * loops in the file referencing.
5383 */
5384static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
5385{
5386 struct sock *sk = ctx->ring_sock->sk;
5387 struct scm_fp_list *fpl;
5388 struct sk_buff *skb;
08a45173 5389 int i, nr_files;
6b06314c
JA
5390
5391 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
5392 unsigned long inflight = ctx->user->unix_inflight + nr;
5393
5394 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
5395 return -EMFILE;
5396 }
5397
5398 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
5399 if (!fpl)
5400 return -ENOMEM;
5401
5402 skb = alloc_skb(0, GFP_KERNEL);
5403 if (!skb) {
5404 kfree(fpl);
5405 return -ENOMEM;
5406 }
5407
5408 skb->sk = sk;
6b06314c 5409
08a45173 5410 nr_files = 0;
6b06314c
JA
5411 fpl->user = get_uid(ctx->user);
5412 for (i = 0; i < nr; i++) {
65e19f54
JA
5413 struct file *file = io_file_from_index(ctx, i + offset);
5414
5415 if (!file)
08a45173 5416 continue;
65e19f54 5417 fpl->fp[nr_files] = get_file(file);
08a45173
JA
5418 unix_inflight(fpl->user, fpl->fp[nr_files]);
5419 nr_files++;
6b06314c
JA
5420 }
5421
08a45173
JA
5422 if (nr_files) {
5423 fpl->max = SCM_MAX_FD;
5424 fpl->count = nr_files;
5425 UNIXCB(skb).fp = fpl;
05f3fb3c 5426 skb->destructor = unix_destruct_scm;
08a45173
JA
5427 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
5428 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 5429
08a45173
JA
5430 for (i = 0; i < nr_files; i++)
5431 fput(fpl->fp[i]);
5432 } else {
5433 kfree_skb(skb);
5434 kfree(fpl);
5435 }
6b06314c
JA
5436
5437 return 0;
5438}
5439
5440/*
5441 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
5442 * causes regular reference counting to break down. We rely on the UNIX
5443 * garbage collection to take care of this problem for us.
5444 */
5445static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5446{
5447 unsigned left, total;
5448 int ret = 0;
5449
5450 total = 0;
5451 left = ctx->nr_user_files;
5452 while (left) {
5453 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
5454
5455 ret = __io_sqe_files_scm(ctx, this_files, total);
5456 if (ret)
5457 break;
5458 left -= this_files;
5459 total += this_files;
5460 }
5461
5462 if (!ret)
5463 return 0;
5464
5465 while (total < ctx->nr_user_files) {
65e19f54
JA
5466 struct file *file = io_file_from_index(ctx, total);
5467
5468 if (file)
5469 fput(file);
6b06314c
JA
5470 total++;
5471 }
5472
5473 return ret;
5474}
5475#else
5476static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5477{
5478 return 0;
5479}
5480#endif
5481
65e19f54
JA
5482static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5483 unsigned nr_files)
5484{
5485 int i;
5486
5487 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5488 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5489 unsigned this_files;
5490
5491 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5492 table->files = kcalloc(this_files, sizeof(struct file *),
5493 GFP_KERNEL);
5494 if (!table->files)
5495 break;
5496 nr_files -= this_files;
5497 }
5498
5499 if (i == nr_tables)
5500 return 0;
5501
5502 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5503 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5504 kfree(table->files);
5505 }
5506 return 1;
5507}
5508
05f3fb3c
JA
5509static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
5510{
5511#if defined(CONFIG_UNIX)
5512 struct sock *sock = ctx->ring_sock->sk;
5513 struct sk_buff_head list, *head = &sock->sk_receive_queue;
5514 struct sk_buff *skb;
5515 int i;
5516
5517 __skb_queue_head_init(&list);
5518
5519 /*
5520 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5521 * remove this entry and rearrange the file array.
5522 */
5523 skb = skb_dequeue(head);
5524 while (skb) {
5525 struct scm_fp_list *fp;
5526
5527 fp = UNIXCB(skb).fp;
5528 for (i = 0; i < fp->count; i++) {
5529 int left;
5530
5531 if (fp->fp[i] != file)
5532 continue;
5533
5534 unix_notinflight(fp->user, fp->fp[i]);
5535 left = fp->count - 1 - i;
5536 if (left) {
5537 memmove(&fp->fp[i], &fp->fp[i + 1],
5538 left * sizeof(struct file *));
5539 }
5540 fp->count--;
5541 if (!fp->count) {
5542 kfree_skb(skb);
5543 skb = NULL;
5544 } else {
5545 __skb_queue_tail(&list, skb);
5546 }
5547 fput(file);
5548 file = NULL;
5549 break;
5550 }
5551
5552 if (!file)
5553 break;
5554
5555 __skb_queue_tail(&list, skb);
5556
5557 skb = skb_dequeue(head);
5558 }
5559
5560 if (skb_peek(&list)) {
5561 spin_lock_irq(&head->lock);
5562 while ((skb = __skb_dequeue(&list)) != NULL)
5563 __skb_queue_tail(head, skb);
5564 spin_unlock_irq(&head->lock);
5565 }
5566#else
5567 fput(file);
5568#endif
5569}
5570
5571struct io_file_put {
5572 struct llist_node llist;
5573 struct file *file;
5574 struct completion *done;
5575};
5576
2faf852d 5577static void io_ring_file_ref_flush(struct fixed_file_data *data)
65e19f54 5578{
05f3fb3c 5579 struct io_file_put *pfile, *tmp;
05f3fb3c 5580 struct llist_node *node;
65e19f54 5581
05f3fb3c
JA
5582 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5583 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5584 io_ring_file_put(data->ctx, pfile->file);
5585 if (pfile->done)
5586 complete(pfile->done);
5587 else
5588 kfree(pfile);
5589 }
65e19f54 5590 }
2faf852d 5591}
65e19f54 5592
2faf852d
JA
5593static void io_ring_file_ref_switch(struct work_struct *work)
5594{
5595 struct fixed_file_data *data;
65e19f54 5596
2faf852d
JA
5597 data = container_of(work, struct fixed_file_data, ref_work);
5598 io_ring_file_ref_flush(data);
05f3fb3c
JA
5599 percpu_ref_get(&data->refs);
5600 percpu_ref_switch_to_percpu(&data->refs);
5601}
65e19f54 5602
05f3fb3c
JA
5603static void io_file_data_ref_zero(struct percpu_ref *ref)
5604{
5605 struct fixed_file_data *data;
5606
5607 data = container_of(ref, struct fixed_file_data, refs);
5608
2faf852d
JA
5609 /*
5610 * We can't safely switch from inside this context, punt to wq. If
5611 * the table ref is going away, the table is being unregistered.
5612 * Don't queue up the async work for that case, the caller will
5613 * handle it.
5614 */
5615 if (!percpu_ref_is_dying(&data->refs))
5616 queue_work(system_wq, &data->ref_work);
65e19f54
JA
5617}
5618
6b06314c
JA
5619static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5620 unsigned nr_args)
5621{
5622 __s32 __user *fds = (__s32 __user *) arg;
65e19f54 5623 unsigned nr_tables;
05f3fb3c 5624 struct file *file;
6b06314c
JA
5625 int fd, ret = 0;
5626 unsigned i;
5627
05f3fb3c 5628 if (ctx->file_data)
6b06314c
JA
5629 return -EBUSY;
5630 if (!nr_args)
5631 return -EINVAL;
5632 if (nr_args > IORING_MAX_FIXED_FILES)
5633 return -EMFILE;
5634
05f3fb3c
JA
5635 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5636 if (!ctx->file_data)
5637 return -ENOMEM;
5638 ctx->file_data->ctx = ctx;
5639 init_completion(&ctx->file_data->done);
5640
65e19f54 5641 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
05f3fb3c
JA
5642 ctx->file_data->table = kcalloc(nr_tables,
5643 sizeof(struct fixed_file_table),
65e19f54 5644 GFP_KERNEL);
05f3fb3c
JA
5645 if (!ctx->file_data->table) {
5646 kfree(ctx->file_data);
5647 ctx->file_data = NULL;
6b06314c 5648 return -ENOMEM;
05f3fb3c
JA
5649 }
5650
5651 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
5652 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
5653 kfree(ctx->file_data->table);
5654 kfree(ctx->file_data);
5655 ctx->file_data = NULL;
6b06314c 5656 return -ENOMEM;
05f3fb3c
JA
5657 }
5658 ctx->file_data->put_llist.first = NULL;
5659 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6b06314c 5660
65e19f54 5661 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
05f3fb3c
JA
5662 percpu_ref_exit(&ctx->file_data->refs);
5663 kfree(ctx->file_data->table);
5664 kfree(ctx->file_data);
5665 ctx->file_data = NULL;
65e19f54
JA
5666 return -ENOMEM;
5667 }
5668
08a45173 5669 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
65e19f54
JA
5670 struct fixed_file_table *table;
5671 unsigned index;
5672
6b06314c
JA
5673 ret = -EFAULT;
5674 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
5675 break;
08a45173
JA
5676 /* allow sparse sets */
5677 if (fd == -1) {
5678 ret = 0;
5679 continue;
5680 }
6b06314c 5681
05f3fb3c 5682 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54 5683 index = i & IORING_FILE_TABLE_MASK;
05f3fb3c 5684 file = fget(fd);
6b06314c
JA
5685
5686 ret = -EBADF;
05f3fb3c 5687 if (!file)
6b06314c 5688 break;
05f3fb3c 5689
6b06314c
JA
5690 /*
5691 * Don't allow io_uring instances to be registered. If UNIX
5692 * isn't enabled, then this causes a reference cycle and this
5693 * instance can never get freed. If UNIX is enabled we'll
5694 * handle it just fine, but there's still no point in allowing
5695 * a ring fd as it doesn't support regular read/write anyway.
5696 */
05f3fb3c
JA
5697 if (file->f_op == &io_uring_fops) {
5698 fput(file);
6b06314c
JA
5699 break;
5700 }
6b06314c 5701 ret = 0;
05f3fb3c 5702 table->files[index] = file;
6b06314c
JA
5703 }
5704
5705 if (ret) {
65e19f54 5706 for (i = 0; i < ctx->nr_user_files; i++) {
65e19f54
JA
5707 file = io_file_from_index(ctx, i);
5708 if (file)
5709 fput(file);
5710 }
5711 for (i = 0; i < nr_tables; i++)
05f3fb3c 5712 kfree(ctx->file_data->table[i].files);
6b06314c 5713
05f3fb3c
JA
5714 kfree(ctx->file_data->table);
5715 kfree(ctx->file_data);
5716 ctx->file_data = NULL;
6b06314c
JA
5717 ctx->nr_user_files = 0;
5718 return ret;
5719 }
5720
5721 ret = io_sqe_files_scm(ctx);
5722 if (ret)
5723 io_sqe_files_unregister(ctx);
5724
5725 return ret;
5726}
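/*
 * For reference, userspace reaches io_sqe_files_register() through
 * io_uring_register(2) with IORING_REGISTER_FILES. The sketch below is a
 * minimal, illustrative userspace caller (the raw-syscall wrapper and the
 * example fds are assumptions, not part of this file); a slot set to -1 is
 * left sparse, matching the "allow sparse sets" handling above.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int sys_io_uring_register(int ring_fd, unsigned opcode,
				 const void *arg, unsigned nr_args)
{
	return syscall(__NR_io_uring_register, ring_fd, opcode, arg, nr_args);
}

static int register_fixed_files(int ring_fd, int fd0, int fd1)
{
	/* slot 1 stays sparse and can be filled later via FILES_UPDATE */
	__s32 fds[3] = { fd0, -1, fd1 };

	return sys_io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 3);
}
#endif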
5727
c3a31e60
JA
5728static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
5729 int index)
5730{
5731#if defined(CONFIG_UNIX)
5732 struct sock *sock = ctx->ring_sock->sk;
5733 struct sk_buff_head *head = &sock->sk_receive_queue;
5734 struct sk_buff *skb;
5735
5736 /*
5737 * See if we can merge this file into an existing skb SCM_RIGHTS
5738 * file set. If there's no room, fall back to allocating a new skb
5739 * and filling it in.
5740 */
5741 spin_lock_irq(&head->lock);
5742 skb = skb_peek(head);
5743 if (skb) {
5744 struct scm_fp_list *fpl = UNIXCB(skb).fp;
5745
5746 if (fpl->count < SCM_MAX_FD) {
5747 __skb_unlink(skb, head);
5748 spin_unlock_irq(&head->lock);
5749 fpl->fp[fpl->count] = get_file(file);
5750 unix_inflight(fpl->user, fpl->fp[fpl->count]);
5751 fpl->count++;
5752 spin_lock_irq(&head->lock);
5753 __skb_queue_head(head, skb);
5754 } else {
5755 skb = NULL;
5756 }
5757 }
5758 spin_unlock_irq(&head->lock);
5759
5760 if (skb) {
5761 fput(file);
5762 return 0;
5763 }
5764
5765 return __io_sqe_files_scm(ctx, 1, index);
5766#else
5767 return 0;
5768#endif
5769}
5770
05f3fb3c 5771static void io_atomic_switch(struct percpu_ref *ref)
c3a31e60 5772{
05f3fb3c
JA
5773 struct fixed_file_data *data;
5774
5775 data = container_of(ref, struct fixed_file_data, refs);
5776 clear_bit(FFD_F_ATOMIC, &data->state);
5777}
5778
5779static bool io_queue_file_removal(struct fixed_file_data *data,
5780 struct file *file)
5781{
5782 struct io_file_put *pfile, pfile_stack;
5783 DECLARE_COMPLETION_ONSTACK(done);
5784
5785 /*
5786	 * If we fail allocating the struct we need for doing async removal
5787 * of this file, just punt to sync and wait for it.
5788 */
5789 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
5790 if (!pfile) {
5791 pfile = &pfile_stack;
5792 pfile->done = &done;
5793 }
5794
5795 pfile->file = file;
5796 llist_add(&pfile->llist, &data->put_llist);
5797
5798 if (pfile == &pfile_stack) {
5799 if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5800 percpu_ref_put(&data->refs);
5801 percpu_ref_switch_to_atomic(&data->refs,
5802 io_atomic_switch);
5803 }
5804 wait_for_completion(&done);
5805 flush_work(&data->ref_work);
5806 return false;
5807 }
5808
5809 return true;
5810}
5811
5812static int __io_sqe_files_update(struct io_ring_ctx *ctx,
5813 struct io_uring_files_update *up,
5814 unsigned nr_args)
5815{
5816 struct fixed_file_data *data = ctx->file_data;
5817 bool ref_switch = false;
5818 struct file *file;
c3a31e60
JA
5819 __s32 __user *fds;
5820 int fd, i, err;
5821 __u32 done;
5822
05f3fb3c 5823 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
5824 return -EOVERFLOW;
5825 if (done > ctx->nr_user_files)
5826 return -EINVAL;
5827
5828 done = 0;
05f3fb3c 5829 fds = u64_to_user_ptr(up->fds);
c3a31e60 5830 while (nr_args) {
65e19f54
JA
5831 struct fixed_file_table *table;
5832 unsigned index;
5833
c3a31e60
JA
5834 err = 0;
5835 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
5836 err = -EFAULT;
5837 break;
5838 }
05f3fb3c
JA
5839 i = array_index_nospec(up->offset, ctx->nr_user_files);
5840 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54
JA
5841 index = i & IORING_FILE_TABLE_MASK;
5842 if (table->files[index]) {
05f3fb3c 5843 file = io_file_from_index(ctx, index);
65e19f54 5844 table->files[index] = NULL;
05f3fb3c
JA
5845 if (io_queue_file_removal(data, file))
5846 ref_switch = true;
c3a31e60
JA
5847 }
5848 if (fd != -1) {
c3a31e60
JA
5849 file = fget(fd);
5850 if (!file) {
5851 err = -EBADF;
5852 break;
5853 }
5854 /*
5855 * Don't allow io_uring instances to be registered. If
5856 * UNIX isn't enabled, then this causes a reference
5857 * cycle and this instance can never get freed. If UNIX
5858 * is enabled we'll handle it just fine, but there's
5859 * still no point in allowing a ring fd as it doesn't
5860 * support regular read/write anyway.
5861 */
5862 if (file->f_op == &io_uring_fops) {
5863 fput(file);
5864 err = -EBADF;
5865 break;
5866 }
65e19f54 5867 table->files[index] = file;
c3a31e60
JA
5868 err = io_sqe_file_register(ctx, file, i);
5869 if (err)
5870 break;
5871 }
5872 nr_args--;
5873 done++;
05f3fb3c
JA
5874 up->offset++;
5875 }
5876
5877 if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5878 percpu_ref_put(&data->refs);
5879 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
c3a31e60
JA
5880 }
5881
5882 return done ? done : err;
5883}
05f3fb3c
JA
5884static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
5885 unsigned nr_args)
5886{
5887 struct io_uring_files_update up;
5888
5889 if (!ctx->file_data)
5890 return -ENXIO;
5891 if (!nr_args)
5892 return -EINVAL;
5893 if (copy_from_user(&up, arg, sizeof(up)))
5894 return -EFAULT;
5895 if (up.resv)
5896 return -EINVAL;
5897
5898 return __io_sqe_files_update(ctx, &up, nr_args);
5899}
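/*
 * The update path above is io_uring_register(2) with
 * IORING_REGISTER_FILES_UPDATE. A hedged userspace sketch (the wrapper and
 * the chosen slot are illustrative assumptions): replace one slot of the
 * registered set, or pass fd -1 to only clear it.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int update_fixed_file(int ring_fd, unsigned slot, int new_fd)
{
	__s32 fd = new_fd;			/* -1 just clears the slot */
	struct io_uring_files_update up = {
		.offset	= slot,
		.fds	= (__u64)(unsigned long) &fd,
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES_UPDATE, &up, 1);
}
#endif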
c3a31e60 5900
7d723065
JA
5901static void io_put_work(struct io_wq_work *work)
5902{
5903 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5904
5905 io_put_req(req);
5906}
5907
5908static void io_get_work(struct io_wq_work *work)
5909{
5910 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5911
5912 refcount_inc(&req->refs);
5913}
5914
24369c2e
PB
5915static int io_init_wq_offload(struct io_ring_ctx *ctx,
5916 struct io_uring_params *p)
5917{
5918 struct io_wq_data data;
5919 struct fd f;
5920 struct io_ring_ctx *ctx_attach;
5921 unsigned int concurrency;
5922 int ret = 0;
5923
5924 data.user = ctx->user;
5925 data.get_work = io_get_work;
5926 data.put_work = io_put_work;
5927
5928 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
5929 /* Do QD, or 4 * CPUS, whatever is smallest */
5930 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
5931
5932 ctx->io_wq = io_wq_create(concurrency, &data);
5933 if (IS_ERR(ctx->io_wq)) {
5934 ret = PTR_ERR(ctx->io_wq);
5935 ctx->io_wq = NULL;
5936 }
5937 return ret;
5938 }
5939
5940 f = fdget(p->wq_fd);
5941 if (!f.file)
5942 return -EBADF;
5943
5944 if (f.file->f_op != &io_uring_fops) {
5945 ret = -EINVAL;
5946 goto out_fput;
5947 }
5948
5949 ctx_attach = f.file->private_data;
5950 /* @io_wq is protected by holding the fd */
5951 if (!io_wq_get(ctx_attach->io_wq, &data)) {
5952 ret = -EINVAL;
5953 goto out_fput;
5954 }
5955
5956 ctx->io_wq = ctx_attach->io_wq;
5957out_fput:
5958 fdput(f);
5959 return ret;
5960}
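/*
 * IORING_SETUP_ATTACH_WQ is requested at setup time: the new ring passes the
 * fd of an existing ring in params.wq_fd and shares that ring's io-wq
 * backend instead of creating its own, as handled above. Illustrative
 * userspace sketch (the raw-syscall wrapper is an assumption):
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_attached_ring(int existing_ring_fd, unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_ATTACH_WQ;
	p.wq_fd = existing_ring_fd;

	return syscall(__NR_io_uring_setup, entries, &p);
}
#endif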
5961
6c271ce2
JA
5962static int io_sq_offload_start(struct io_ring_ctx *ctx,
5963 struct io_uring_params *p)
2b188cc1
JA
5964{
5965 int ret;
5966
6c271ce2 5967 init_waitqueue_head(&ctx->sqo_wait);
2b188cc1
JA
5968 mmgrab(current->mm);
5969 ctx->sqo_mm = current->mm;
5970
6c271ce2 5971 if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d1
JA
5972 ret = -EPERM;
5973 if (!capable(CAP_SYS_ADMIN))
5974 goto err;
5975
917257da
JA
5976 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
5977 if (!ctx->sq_thread_idle)
5978 ctx->sq_thread_idle = HZ;
5979
6c271ce2 5980 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 5981 int cpu = p->sq_thread_cpu;
6c271ce2 5982
917257da 5983 ret = -EINVAL;
44a9bd18
JA
5984 if (cpu >= nr_cpu_ids)
5985 goto err;
7889f44d 5986 if (!cpu_online(cpu))
917257da
JA
5987 goto err;
5988
6c271ce2
JA
5989 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
5990 ctx, cpu,
5991 "io_uring-sq");
5992 } else {
5993 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
5994 "io_uring-sq");
5995 }
5996 if (IS_ERR(ctx->sqo_thread)) {
5997 ret = PTR_ERR(ctx->sqo_thread);
5998 ctx->sqo_thread = NULL;
5999 goto err;
6000 }
6001 wake_up_process(ctx->sqo_thread);
6002 } else if (p->flags & IORING_SETUP_SQ_AFF) {
6003 /* Can't have SQ_AFF without SQPOLL */
6004 ret = -EINVAL;
6005 goto err;
6006 }
6007
24369c2e
PB
6008 ret = io_init_wq_offload(ctx, p);
6009 if (ret)
2b188cc1 6010 goto err;
2b188cc1
JA
6011
6012 return 0;
6013err:
54a91f3b 6014 io_finish_async(ctx);
2b188cc1
JA
6015 mmdrop(ctx->sqo_mm);
6016 ctx->sqo_mm = NULL;
6017 return ret;
6018}
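/*
 * The SQPOLL branch above is driven purely by setup parameters. A hedged
 * userspace sketch requesting a polled submission thread pinned to CPU 0
 * with a 2000ms idle timeout (the wrapper and the chosen values are
 * illustrative assumptions; per the check above this needs CAP_SYS_ADMIN):
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_sqpoll_ring(unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = 0;
	p.sq_thread_idle = 2000;	/* ms before the SQ thread goes idle */

	return syscall(__NR_io_uring_setup, entries, &p);
}
#endif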
6019
6020static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6021{
6022 atomic_long_sub(nr_pages, &user->locked_vm);
6023}
6024
6025static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6026{
6027 unsigned long page_limit, cur_pages, new_pages;
6028
6029 /* Don't allow more pages than we can safely lock */
6030 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
6031
6032 do {
6033 cur_pages = atomic_long_read(&user->locked_vm);
6034 new_pages = cur_pages + nr_pages;
6035 if (new_pages > page_limit)
6036 return -ENOMEM;
6037 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6038 new_pages) != cur_pages);
6039
6040 return 0;
6041}
6042
6043static void io_mem_free(void *ptr)
6044{
52e04ef4
MR
6045 struct page *page;
6046
6047 if (!ptr)
6048 return;
2b188cc1 6049
52e04ef4 6050 page = virt_to_head_page(ptr);
2b188cc1
JA
6051 if (put_page_testzero(page))
6052 free_compound_page(page);
6053}
6054
6055static void *io_mem_alloc(size_t size)
6056{
6057 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6058 __GFP_NORETRY;
6059
6060 return (void *) __get_free_pages(gfp_flags, get_order(size));
6061}
6062
75b28aff
HV
6063static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6064 size_t *sq_offset)
6065{
6066 struct io_rings *rings;
6067 size_t off, sq_array_size;
6068
6069 off = struct_size(rings, cqes, cq_entries);
6070 if (off == SIZE_MAX)
6071 return SIZE_MAX;
6072
6073#ifdef CONFIG_SMP
6074 off = ALIGN(off, SMP_CACHE_BYTES);
6075 if (off == 0)
6076 return SIZE_MAX;
6077#endif
6078
6079 sq_array_size = array_size(sizeof(u32), sq_entries);
6080 if (sq_array_size == SIZE_MAX)
6081 return SIZE_MAX;
6082
6083 if (check_add_overflow(off, sq_array_size, &off))
6084 return SIZE_MAX;
6085
6086 if (sq_offset)
6087 *sq_offset = off;
6088
6089 return off;
6090}
6091
2b188cc1
JA
6092static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6093{
75b28aff 6094 size_t pages;
2b188cc1 6095
75b28aff
HV
6096 pages = (size_t)1 << get_order(
6097 rings_size(sq_entries, cq_entries, NULL));
6098 pages += (size_t)1 << get_order(
6099 array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1 6100
75b28aff 6101 return pages;
2b188cc1
JA
6102}
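/*
 * The layout computed by rings_size() is also what userspace sizes its
 * mmap()s from: the SQ ring ends at sq_off.array plus the u32 index array,
 * the CQ ring at cq_off.cqes plus the CQE array, and the SQE array is sized
 * separately. Illustrative sketch, assuming a struct io_uring_params already
 * filled in by io_uring_setup(2):
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <stddef.h>
#include <linux/io_uring.h>

static void ring_mmap_sizes(const struct io_uring_params *p,
			    size_t *sq_sz, size_t *cq_sz, size_t *sqes_sz)
{
	*sq_sz	 = p->sq_off.array + p->sq_entries * sizeof(__u32);
	*cq_sz	 = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	*sqes_sz = p->sq_entries * sizeof(struct io_uring_sqe);
}
#endif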
6103
edafccee
JA
6104static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6105{
6106 int i, j;
6107
6108 if (!ctx->user_bufs)
6109 return -ENXIO;
6110
6111 for (i = 0; i < ctx->nr_user_bufs; i++) {
6112 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6113
6114 for (j = 0; j < imu->nr_bvecs; j++)
f1f6a7dd 6115 unpin_user_page(imu->bvec[j].bv_page);
edafccee
JA
6116
6117 if (ctx->account_mem)
6118 io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef6475 6119 kvfree(imu->bvec);
edafccee
JA
6120 imu->nr_bvecs = 0;
6121 }
6122
6123 kfree(ctx->user_bufs);
6124 ctx->user_bufs = NULL;
6125 ctx->nr_user_bufs = 0;
6126 return 0;
6127}
6128
6129static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
6130 void __user *arg, unsigned index)
6131{
6132 struct iovec __user *src;
6133
6134#ifdef CONFIG_COMPAT
6135 if (ctx->compat) {
6136 struct compat_iovec __user *ciovs;
6137 struct compat_iovec ciov;
6138
6139 ciovs = (struct compat_iovec __user *) arg;
6140 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
6141 return -EFAULT;
6142
d55e5f5b 6143 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
6144 dst->iov_len = ciov.iov_len;
6145 return 0;
6146 }
6147#endif
6148 src = (struct iovec __user *) arg;
6149 if (copy_from_user(dst, &src[index], sizeof(*dst)))
6150 return -EFAULT;
6151 return 0;
6152}
6153
6154static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
6155 unsigned nr_args)
6156{
6157 struct vm_area_struct **vmas = NULL;
6158 struct page **pages = NULL;
6159 int i, j, got_pages = 0;
6160 int ret = -EINVAL;
6161
6162 if (ctx->user_bufs)
6163 return -EBUSY;
6164 if (!nr_args || nr_args > UIO_MAXIOV)
6165 return -EINVAL;
6166
6167 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
6168 GFP_KERNEL);
6169 if (!ctx->user_bufs)
6170 return -ENOMEM;
6171
6172 for (i = 0; i < nr_args; i++) {
6173 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6174 unsigned long off, start, end, ubuf;
6175 int pret, nr_pages;
6176 struct iovec iov;
6177 size_t size;
6178
6179 ret = io_copy_iov(ctx, &iov, arg, i);
6180 if (ret)
a278682d 6181 goto err;
edafccee
JA
6182
6183 /*
6184 * Don't impose further limits on the size and buffer
6185 * constraints here, we'll -EINVAL later when IO is
6186 * submitted if they are wrong.
6187 */
6188 ret = -EFAULT;
6189 if (!iov.iov_base || !iov.iov_len)
6190 goto err;
6191
6192 /* arbitrary limit, but we need something */
6193 if (iov.iov_len > SZ_1G)
6194 goto err;
6195
6196 ubuf = (unsigned long) iov.iov_base;
6197 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6198 start = ubuf >> PAGE_SHIFT;
6199 nr_pages = end - start;
6200
6201 if (ctx->account_mem) {
6202 ret = io_account_mem(ctx->user, nr_pages);
6203 if (ret)
6204 goto err;
6205 }
6206
6207 ret = 0;
6208 if (!pages || nr_pages > got_pages) {
6209 kfree(vmas);
6210 kfree(pages);
d4ef6475 6211 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee 6212 GFP_KERNEL);
d4ef6475 6213 vmas = kvmalloc_array(nr_pages,
edafccee
JA
6214 sizeof(struct vm_area_struct *),
6215 GFP_KERNEL);
6216 if (!pages || !vmas) {
6217 ret = -ENOMEM;
6218 if (ctx->account_mem)
6219 io_unaccount_mem(ctx->user, nr_pages);
6220 goto err;
6221 }
6222 got_pages = nr_pages;
6223 }
6224
d4ef6475 6225 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee
JA
6226 GFP_KERNEL);
6227 ret = -ENOMEM;
6228 if (!imu->bvec) {
6229 if (ctx->account_mem)
6230 io_unaccount_mem(ctx->user, nr_pages);
6231 goto err;
6232 }
6233
6234 ret = 0;
6235 down_read(&current->mm->mmap_sem);
2113b05d 6236 pret = pin_user_pages(ubuf, nr_pages,
932f4a63
IW
6237 FOLL_WRITE | FOLL_LONGTERM,
6238 pages, vmas);
edafccee
JA
6239 if (pret == nr_pages) {
6240 /* don't support file backed memory */
6241 for (j = 0; j < nr_pages; j++) {
6242 struct vm_area_struct *vma = vmas[j];
6243
6244 if (vma->vm_file &&
6245 !is_file_hugepages(vma->vm_file)) {
6246 ret = -EOPNOTSUPP;
6247 break;
6248 }
6249 }
6250 } else {
6251 ret = pret < 0 ? pret : -EFAULT;
6252 }
6253 up_read(&current->mm->mmap_sem);
6254 if (ret) {
6255 /*
6256 * if we did partial map, or found file backed vmas,
6257 * release any pages we did get
6258 */
27c4d3a3 6259 if (pret > 0)
f1f6a7dd 6260 unpin_user_pages(pages, pret);
edafccee
JA
6261 if (ctx->account_mem)
6262 io_unaccount_mem(ctx->user, nr_pages);
d4ef6475 6263 kvfree(imu->bvec);
edafccee
JA
6264 goto err;
6265 }
6266
6267 off = ubuf & ~PAGE_MASK;
6268 size = iov.iov_len;
6269 for (j = 0; j < nr_pages; j++) {
6270 size_t vec_len;
6271
6272 vec_len = min_t(size_t, size, PAGE_SIZE - off);
6273 imu->bvec[j].bv_page = pages[j];
6274 imu->bvec[j].bv_len = vec_len;
6275 imu->bvec[j].bv_offset = off;
6276 off = 0;
6277 size -= vec_len;
6278 }
6279 /* store original address for later verification */
6280 imu->ubuf = ubuf;
6281 imu->len = iov.iov_len;
6282 imu->nr_bvecs = nr_pages;
6283
6284 ctx->nr_user_bufs++;
6285 }
d4ef6475
MR
6286 kvfree(pages);
6287 kvfree(vmas);
edafccee
JA
6288 return 0;
6289err:
d4ef6475
MR
6290 kvfree(pages);
6291 kvfree(vmas);
edafccee
JA
6292 io_sqe_buffer_unregister(ctx);
6293 return ret;
6294}
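/*
 * io_sqe_buffer_register() is reached via io_uring_register(2) with
 * IORING_REGISTER_BUFFERS and an array of iovecs. Hedged userspace sketch
 * (the wrapper and buffer length are illustrative assumptions); each iovec
 * must be non-empty and at most 1GB, matching the checks above, and the
 * pinned pages count against RLIMIT_MEMLOCK unless the caller has
 * CAP_IPC_LOCK.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_one_buffer(int ring_fd, size_t len)
{
	struct iovec iov;

	iov.iov_base = malloc(len);
	if (!iov.iov_base)
		return -1;
	iov.iov_len = len;

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS, &iov, 1);
}
#endif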
6295
9b402849
JA
6296static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
6297{
6298 __s32 __user *fds = arg;
6299 int fd;
6300
6301 if (ctx->cq_ev_fd)
6302 return -EBUSY;
6303
6304 if (copy_from_user(&fd, fds, sizeof(*fds)))
6305 return -EFAULT;
6306
6307 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
6308 if (IS_ERR(ctx->cq_ev_fd)) {
6309 int ret = PTR_ERR(ctx->cq_ev_fd);
6310 ctx->cq_ev_fd = NULL;
6311 return ret;
6312 }
6313
6314 return 0;
6315}
6316
6317static int io_eventfd_unregister(struct io_ring_ctx *ctx)
6318{
6319 if (ctx->cq_ev_fd) {
6320 eventfd_ctx_put(ctx->cq_ev_fd);
6321 ctx->cq_ev_fd = NULL;
6322 return 0;
6323 }
6324
6325 return -ENXIO;
6326}
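/*
 * Completion notification via eventfd: userspace registers an eventfd with
 * IORING_REGISTER_EVENTFD (or IORING_REGISTER_EVENTFD_ASYNC, handled later
 * in __io_uring_register()) and then polls or reads it. Illustrative
 * userspace sketch; the raw-syscall wrapper is an assumption:
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_EVENTFD, &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* becomes readable as CQEs are posted */
}
#endif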
6327
2b188cc1
JA
6328static void io_ring_ctx_free(struct io_ring_ctx *ctx)
6329{
6b06314c 6330 io_finish_async(ctx);
2b188cc1
JA
6331 if (ctx->sqo_mm)
6332 mmdrop(ctx->sqo_mm);
def596e9
JA
6333
6334 io_iopoll_reap_events(ctx);
edafccee 6335 io_sqe_buffer_unregister(ctx);
6b06314c 6336 io_sqe_files_unregister(ctx);
9b402849 6337 io_eventfd_unregister(ctx);
def596e9 6338
2b188cc1 6339#if defined(CONFIG_UNIX)
355e8d26
EB
6340 if (ctx->ring_sock) {
6341 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 6342 sock_release(ctx->ring_sock);
355e8d26 6343 }
2b188cc1
JA
6344#endif
6345
75b28aff 6346 io_mem_free(ctx->rings);
2b188cc1 6347 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
6348
6349 percpu_ref_exit(&ctx->refs);
6350 if (ctx->account_mem)
6351 io_unaccount_mem(ctx->user,
6352 ring_pages(ctx->sq_entries, ctx->cq_entries));
6353 free_uid(ctx->user);
181e448d 6354 put_cred(ctx->creds);
206aefde 6355 kfree(ctx->completions);
78076bb6 6356 kfree(ctx->cancel_hash);
0ddf92e8 6357 kmem_cache_free(req_cachep, ctx->fallback_req);
2b188cc1
JA
6358 kfree(ctx);
6359}
6360
6361static __poll_t io_uring_poll(struct file *file, poll_table *wait)
6362{
6363 struct io_ring_ctx *ctx = file->private_data;
6364 __poll_t mask = 0;
6365
6366 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
6367 /*
6368 * synchronizes with barrier from wq_has_sleeper call in
6369 * io_commit_cqring
6370 */
2b188cc1 6371 smp_rmb();
75b28aff
HV
6372 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
6373 ctx->rings->sq_ring_entries)
2b188cc1 6374 mask |= EPOLLOUT | EPOLLWRNORM;
63e5d81f 6375 if (io_cqring_events(ctx, false))
2b188cc1
JA
6376 mask |= EPOLLIN | EPOLLRDNORM;
6377
6378 return mask;
6379}
6380
6381static int io_uring_fasync(int fd, struct file *file, int on)
6382{
6383 struct io_ring_ctx *ctx = file->private_data;
6384
6385 return fasync_helper(fd, file, on, &ctx->cq_fasync);
6386}
6387
071698e1
JA
6388static int io_remove_personalities(int id, void *p, void *data)
6389{
6390 struct io_ring_ctx *ctx = data;
6391 const struct cred *cred;
6392
6393 cred = idr_remove(&ctx->personality_idr, id);
6394 if (cred)
6395 put_cred(cred);
6396 return 0;
6397}
6398
2b188cc1
JA
6399static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
6400{
6401 mutex_lock(&ctx->uring_lock);
6402 percpu_ref_kill(&ctx->refs);
6403 mutex_unlock(&ctx->uring_lock);
6404
df069d80
JA
6405 /*
6406 * Wait for sq thread to idle, if we have one. It won't spin on new
6407 * work after we've killed the ctx ref above. This is important to do
6408 * before we cancel existing commands, as the thread could otherwise
6409 * be queueing new work post that. If that's work we need to cancel,
6410 * it could cause shutdown to hang.
6411 */
6412 while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
6413 cpu_relax();
6414
5262f567 6415 io_kill_timeouts(ctx);
221c5eb2 6416 io_poll_remove_all(ctx);
561fb04a
JA
6417
6418 if (ctx->io_wq)
6419 io_wq_cancel_all(ctx->io_wq);
6420
def596e9 6421 io_iopoll_reap_events(ctx);
15dff286
JA
6422 /* if we failed setting up the ctx, we might not have any rings */
6423 if (ctx->rings)
6424 io_cqring_overflow_flush(ctx, true);
071698e1 6425 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
206aefde 6426 wait_for_completion(&ctx->completions[0]);
2b188cc1
JA
6427 io_ring_ctx_free(ctx);
6428}
6429
6430static int io_uring_release(struct inode *inode, struct file *file)
6431{
6432 struct io_ring_ctx *ctx = file->private_data;
6433
6434 file->private_data = NULL;
6435 io_ring_ctx_wait_and_kill(ctx);
6436 return 0;
6437}
6438
fcb323cc
JA
6439static void io_uring_cancel_files(struct io_ring_ctx *ctx,
6440 struct files_struct *files)
6441{
6442 struct io_kiocb *req;
6443 DEFINE_WAIT(wait);
6444
6445 while (!list_empty_careful(&ctx->inflight_list)) {
768134d4 6446 struct io_kiocb *cancel_req = NULL;
fcb323cc
JA
6447
6448 spin_lock_irq(&ctx->inflight_lock);
6449 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
768134d4
JA
6450 if (req->work.files != files)
6451 continue;
6452 /* req is being completed, ignore */
6453 if (!refcount_inc_not_zero(&req->refs))
6454 continue;
6455 cancel_req = req;
6456 break;
fcb323cc 6457 }
768134d4 6458 if (cancel_req)
fcb323cc 6459 prepare_to_wait(&ctx->inflight_wait, &wait,
768134d4 6460 TASK_UNINTERRUPTIBLE);
fcb323cc
JA
6461 spin_unlock_irq(&ctx->inflight_lock);
6462
768134d4
JA
6463 /* We need to keep going until we don't find a matching req */
6464 if (!cancel_req)
fcb323cc 6465 break;
2f6d9b9d
BL
6466
6467 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
6468 io_put_req(cancel_req);
fcb323cc
JA
6469 schedule();
6470 }
768134d4 6471 finish_wait(&ctx->inflight_wait, &wait);
fcb323cc
JA
6472}
6473
6474static int io_uring_flush(struct file *file, void *data)
6475{
6476 struct io_ring_ctx *ctx = file->private_data;
6477
6478 io_uring_cancel_files(ctx, data);
6ab23144
JA
6479
6480 /*
6481 * If the task is going away, cancel work it may have pending
6482 */
6483 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
6484 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
6485
fcb323cc
JA
6486 return 0;
6487}
6488
6c5c240e
RP
6489static void *io_uring_validate_mmap_request(struct file *file,
6490 loff_t pgoff, size_t sz)
2b188cc1 6491{
2b188cc1 6492 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 6493 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
6494 struct page *page;
6495 void *ptr;
6496
6497 switch (offset) {
6498 case IORING_OFF_SQ_RING:
75b28aff
HV
6499 case IORING_OFF_CQ_RING:
6500 ptr = ctx->rings;
2b188cc1
JA
6501 break;
6502 case IORING_OFF_SQES:
6503 ptr = ctx->sq_sqes;
6504 break;
2b188cc1 6505 default:
6c5c240e 6506 return ERR_PTR(-EINVAL);
2b188cc1
JA
6507 }
6508
6509 page = virt_to_head_page(ptr);
a50b854e 6510 if (sz > page_size(page))
6c5c240e
RP
6511 return ERR_PTR(-EINVAL);
6512
6513 return ptr;
6514}
6515
6516#ifdef CONFIG_MMU
6517
6518static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6519{
6520 size_t sz = vma->vm_end - vma->vm_start;
6521 unsigned long pfn;
6522 void *ptr;
6523
6524 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
6525 if (IS_ERR(ptr))
6526 return PTR_ERR(ptr);
2b188cc1
JA
6527
6528 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
6529 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
6530}
6531
6c5c240e
RP
6532#else /* !CONFIG_MMU */
6533
6534static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6535{
6536 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6537}
6538
6539static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6540{
6541 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6542}
6543
6544static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6545 unsigned long addr, unsigned long len,
6546 unsigned long pgoff, unsigned long flags)
6547{
6548 void *ptr;
6549
6550 ptr = io_uring_validate_mmap_request(file, pgoff, len);
6551 if (IS_ERR(ptr))
6552 return PTR_ERR(ptr);
6553
6554 return (unsigned long) ptr;
6555}
6556
6557#endif /* !CONFIG_MMU */
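/*
 * The offsets validated above are exactly what userspace passes to mmap(2).
 * Illustrative sketch mapping the SQ ring, CQ ring and SQE array (sizes as
 * in the sketch after ring_pages(); error handling trimmed). With
 * IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share one mapping, but the
 * two-mapping form below works either way.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <stddef.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

static void map_rings(int ring_fd, const struct io_uring_params *p,
		      void **sq_ring, void **cq_ring, void **sqes)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(__u32);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	*cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
	*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		     ring_fd, IORING_OFF_SQES);
}
#endif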
6558
2b188cc1
JA
6559SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6560 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6561 size_t, sigsz)
6562{
6563 struct io_ring_ctx *ctx;
6564 long ret = -EBADF;
6565 int submitted = 0;
6566 struct fd f;
6567
6c271ce2 6568 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1
JA
6569 return -EINVAL;
6570
6571 f = fdget(fd);
6572 if (!f.file)
6573 return -EBADF;
6574
6575 ret = -EOPNOTSUPP;
6576 if (f.file->f_op != &io_uring_fops)
6577 goto out_fput;
6578
6579 ret = -ENXIO;
6580 ctx = f.file->private_data;
6581 if (!percpu_ref_tryget(&ctx->refs))
6582 goto out_fput;
6583
6c271ce2
JA
6584 /*
6585 * For SQ polling, the thread will do all submissions and completions.
6586 * Just return the requested submit count, and wake the thread if
6587 * we were asked to.
6588 */
b2a9eada 6589 ret = 0;
6c271ce2 6590 if (ctx->flags & IORING_SETUP_SQPOLL) {
c1edbf5f
JA
6591 if (!list_empty_careful(&ctx->cq_overflow_list))
6592 io_cqring_overflow_flush(ctx, false);
6c271ce2
JA
6593 if (flags & IORING_ENTER_SQ_WAKEUP)
6594 wake_up(&ctx->sqo_wait);
6595 submitted = to_submit;
b2a9eada 6596 } else if (to_submit) {
ae9428ca 6597 struct mm_struct *cur_mm;
2b188cc1
JA
6598
6599 mutex_lock(&ctx->uring_lock);
ae9428ca
PB
6600 /* already have mm, so io_submit_sqes() won't try to grab it */
6601 cur_mm = ctx->sqo_mm;
6602 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
6603 &cur_mm, false);
2b188cc1 6604 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
6605
6606 if (submitted != to_submit)
6607 goto out;
2b188cc1
JA
6608 }
6609 if (flags & IORING_ENTER_GETEVENTS) {
def596e9
JA
6610 unsigned nr_events = 0;
6611
2b188cc1
JA
6612 min_complete = min(min_complete, ctx->cq_entries);
6613
def596e9 6614 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9 6615 ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e9
JA
6616 } else {
6617 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
6618 }
2b188cc1
JA
6619 }
6620
7c504e65 6621out:
6805b32e 6622 percpu_ref_put(&ctx->refs);
2b188cc1
JA
6623out_fput:
6624 fdput(f);
6625 return submitted ? submitted : ret;
6626}
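/*
 * From userspace the entry point above is io_uring_enter(2). A hedged
 * sketch of the common submit-and-wait pattern (the raw-syscall wrapper is
 * an assumption); under IORING_SETUP_SQPOLL the kernel thread does the
 * actual submission and the call mainly serves as a wakeup, as handled
 * above.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int submit_and_wait(int ring_fd, unsigned to_submit,
			   unsigned min_complete)
{
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
		       IORING_ENTER_GETEVENTS, NULL, 0);
}
#endif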
6627
87ce955b
JA
6628static int io_uring_show_cred(int id, void *p, void *data)
6629{
6630 const struct cred *cred = p;
6631 struct seq_file *m = data;
6632 struct user_namespace *uns = seq_user_ns(m);
6633 struct group_info *gi;
6634 kernel_cap_t cap;
6635 unsigned __capi;
6636 int g;
6637
6638 seq_printf(m, "%5d\n", id);
6639 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
6640 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
6641 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
6642 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
6643 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
6644 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
6645 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
6646 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
6647 seq_puts(m, "\n\tGroups:\t");
6648 gi = cred->group_info;
6649 for (g = 0; g < gi->ngroups; g++) {
6650 seq_put_decimal_ull(m, g ? " " : "",
6651 from_kgid_munged(uns, gi->gid[g]));
6652 }
6653 seq_puts(m, "\n\tCapEff:\t");
6654 cap = cred->cap_effective;
6655 CAP_FOR_EACH_U32(__capi)
6656 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
6657 seq_putc(m, '\n');
6658 return 0;
6659}
6660
6661static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
6662{
6663 int i;
6664
6665 mutex_lock(&ctx->uring_lock);
6666 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
6667 for (i = 0; i < ctx->nr_user_files; i++) {
6668 struct fixed_file_table *table;
6669 struct file *f;
6670
6671 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6672 f = table->files[i & IORING_FILE_TABLE_MASK];
6673 if (f)
6674 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
6675 else
6676 seq_printf(m, "%5u: <none>\n", i);
6677 }
6678 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
6679 for (i = 0; i < ctx->nr_user_bufs; i++) {
6680 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
6681
6682 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
6683 (unsigned int) buf->len);
6684 }
6685 if (!idr_is_empty(&ctx->personality_idr)) {
6686 seq_printf(m, "Personalities:\n");
6687 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
6688 }
6689 mutex_unlock(&ctx->uring_lock);
6690}
6691
6692static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
6693{
6694 struct io_ring_ctx *ctx = f->private_data;
6695
6696 if (percpu_ref_tryget(&ctx->refs)) {
6697 __io_uring_show_fdinfo(ctx, m);
6698 percpu_ref_put(&ctx->refs);
6699 }
6700}
6701
2b188cc1
JA
6702static const struct file_operations io_uring_fops = {
6703 .release = io_uring_release,
fcb323cc 6704 .flush = io_uring_flush,
2b188cc1 6705 .mmap = io_uring_mmap,
6c5c240e
RP
6706#ifndef CONFIG_MMU
6707 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
6708 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
6709#endif
2b188cc1
JA
6710 .poll = io_uring_poll,
6711 .fasync = io_uring_fasync,
87ce955b 6712 .show_fdinfo = io_uring_show_fdinfo,
2b188cc1
JA
6713};
6714
6715static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
6716 struct io_uring_params *p)
6717{
75b28aff
HV
6718 struct io_rings *rings;
6719 size_t size, sq_array_offset;
2b188cc1 6720
75b28aff
HV
6721 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
6722 if (size == SIZE_MAX)
6723 return -EOVERFLOW;
6724
6725 rings = io_mem_alloc(size);
6726 if (!rings)
2b188cc1
JA
6727 return -ENOMEM;
6728
75b28aff
HV
6729 ctx->rings = rings;
6730 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
6731 rings->sq_ring_mask = p->sq_entries - 1;
6732 rings->cq_ring_mask = p->cq_entries - 1;
6733 rings->sq_ring_entries = p->sq_entries;
6734 rings->cq_ring_entries = p->cq_entries;
6735 ctx->sq_mask = rings->sq_ring_mask;
6736 ctx->cq_mask = rings->cq_ring_mask;
6737 ctx->sq_entries = rings->sq_ring_entries;
6738 ctx->cq_entries = rings->cq_ring_entries;
2b188cc1
JA
6739
6740 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
6741 if (size == SIZE_MAX) {
6742 io_mem_free(ctx->rings);
6743 ctx->rings = NULL;
2b188cc1 6744 return -EOVERFLOW;
eb065d30 6745 }
2b188cc1
JA
6746
6747 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
6748 if (!ctx->sq_sqes) {
6749 io_mem_free(ctx->rings);
6750 ctx->rings = NULL;
2b188cc1 6751 return -ENOMEM;
eb065d30 6752 }
2b188cc1 6753
2b188cc1
JA
6754 return 0;
6755}
6756
6757/*
 6758 * Allocate an anonymous fd; this is what constitutes the application
6759 * visible backing of an io_uring instance. The application mmaps this
6760 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
6761 * we have to tie this fd to a socket for file garbage collection purposes.
6762 */
6763static int io_uring_get_fd(struct io_ring_ctx *ctx)
6764{
6765 struct file *file;
6766 int ret;
6767
6768#if defined(CONFIG_UNIX)
6769 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
6770 &ctx->ring_sock);
6771 if (ret)
6772 return ret;
6773#endif
6774
6775 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
6776 if (ret < 0)
6777 goto err;
6778
6779 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
6780 O_RDWR | O_CLOEXEC);
6781 if (IS_ERR(file)) {
6782 put_unused_fd(ret);
6783 ret = PTR_ERR(file);
6784 goto err;
6785 }
6786
6787#if defined(CONFIG_UNIX)
6788 ctx->ring_sock->file = file;
6789#endif
6790 fd_install(ret, file);
6791 return ret;
6792err:
6793#if defined(CONFIG_UNIX)
6794 sock_release(ctx->ring_sock);
6795 ctx->ring_sock = NULL;
6796#endif
6797 return ret;
6798}
6799
6800static int io_uring_create(unsigned entries, struct io_uring_params *p)
6801{
6802 struct user_struct *user = NULL;
6803 struct io_ring_ctx *ctx;
6804 bool account_mem;
6805 int ret;
6806
8110c1a6 6807 if (!entries)
2b188cc1 6808 return -EINVAL;
8110c1a6
JA
6809 if (entries > IORING_MAX_ENTRIES) {
6810 if (!(p->flags & IORING_SETUP_CLAMP))
6811 return -EINVAL;
6812 entries = IORING_MAX_ENTRIES;
6813 }
2b188cc1
JA
6814
6815 /*
6816 * Use twice as many entries for the CQ ring. It's possible for the
6817 * application to drive a higher depth than the size of the SQ ring,
6818 * since the sqes are only used at submission time. This allows for
33a107f0
JA
6819 * some flexibility in overcommitting a bit. If the application has
6820 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
6821 * of CQ ring entries manually.
2b188cc1
JA
6822 */
6823 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
6824 if (p->flags & IORING_SETUP_CQSIZE) {
6825 /*
6826 * If IORING_SETUP_CQSIZE is set, we do the same roundup
6827 * to a power-of-two, if it isn't already. We do NOT impose
6828 * any cq vs sq ring sizing.
6829 */
8110c1a6 6830 if (p->cq_entries < p->sq_entries)
33a107f0 6831 return -EINVAL;
8110c1a6
JA
6832 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
6833 if (!(p->flags & IORING_SETUP_CLAMP))
6834 return -EINVAL;
6835 p->cq_entries = IORING_MAX_CQ_ENTRIES;
6836 }
33a107f0
JA
6837 p->cq_entries = roundup_pow_of_two(p->cq_entries);
6838 } else {
6839 p->cq_entries = 2 * p->sq_entries;
6840 }
2b188cc1
JA
6841
6842 user = get_uid(current_user());
6843 account_mem = !capable(CAP_IPC_LOCK);
6844
6845 if (account_mem) {
6846 ret = io_account_mem(user,
6847 ring_pages(p->sq_entries, p->cq_entries));
6848 if (ret) {
6849 free_uid(user);
6850 return ret;
6851 }
6852 }
6853
6854 ctx = io_ring_ctx_alloc(p);
6855 if (!ctx) {
6856 if (account_mem)
6857 io_unaccount_mem(user, ring_pages(p->sq_entries,
6858 p->cq_entries));
6859 free_uid(user);
6860 return -ENOMEM;
6861 }
6862 ctx->compat = in_compat_syscall();
6863 ctx->account_mem = account_mem;
6864 ctx->user = user;
0b8c0ec7 6865 ctx->creds = get_current_cred();
2b188cc1
JA
6866
6867 ret = io_allocate_scq_urings(ctx, p);
6868 if (ret)
6869 goto err;
6870
6c271ce2 6871 ret = io_sq_offload_start(ctx, p);
2b188cc1
JA
6872 if (ret)
6873 goto err;
6874
2b188cc1 6875 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
6876 p->sq_off.head = offsetof(struct io_rings, sq.head);
6877 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
6878 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
6879 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
6880 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
6881 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
6882 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
6883
6884 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
6885 p->cq_off.head = offsetof(struct io_rings, cq.head);
6886 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
6887 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
6888 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
6889 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
6890 p->cq_off.cqes = offsetof(struct io_rings, cqes);
ac90f249 6891
044c1ab3
JA
6892 /*
6893 * Install ring fd as the very last thing, so we don't risk someone
6894 * having closed it before we finish setup
6895 */
6896 ret = io_uring_get_fd(ctx);
6897 if (ret < 0)
6898 goto err;
6899
da8c9690 6900 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
cccf0ee8
JA
6901 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
6902 IORING_FEAT_CUR_PERSONALITY;
c826bd7a 6903 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
6904 return ret;
6905err:
6906 io_ring_ctx_wait_and_kill(ctx);
6907 return ret;
6908}
6909
6910/*
 6911 * Sets up an aio uring context, and returns the fd. The application asks for a
 6912 * ring size; we return the actual sq/cq ring sizes (among other things) in the
6913 * params structure passed in.
6914 */
6915static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
6916{
6917 struct io_uring_params p;
6918 long ret;
6919 int i;
6920
6921 if (copy_from_user(&p, params, sizeof(p)))
6922 return -EFAULT;
6923 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
6924 if (p.resv[i])
6925 return -EINVAL;
6926 }
6927
6c271ce2 6928 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 6929 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
24369c2e 6930 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
2b188cc1
JA
6931 return -EINVAL;
6932
6933 ret = io_uring_create(entries, &p);
6934 if (ret < 0)
6935 return ret;
6936
6937 if (copy_to_user(params, &p, sizeof(p)))
6938 return -EFAULT;
6939
6940 return ret;
6941}
6942
6943SYSCALL_DEFINE2(io_uring_setup, u32, entries,
6944 struct io_uring_params __user *, params)
6945{
6946 return io_uring_setup(entries, params);
6947}
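/*
 * Putting setup together from the application side: a hedged sketch of
 * io_uring_setup(2) asking for a larger CQ ring via IORING_SETUP_CQSIZE
 * (the wrapper and chosen sizes are illustrative assumptions). As enforced
 * in io_uring_create(), cq_entries may not be smaller than the rounded-up
 * SQ size, and both are rounded up to powers of two.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_ring_with_big_cq(unsigned sq_entries, unsigned cq_entries,
				  struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_CQSIZE;
	p->cq_entries = cq_entries;

	/* on success the kernel fills in sq_off, cq_off and p->features */
	return syscall(__NR_io_uring_setup, sq_entries, p);
}
#endif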
6948
66f4af93
JA
6949static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
6950{
6951 struct io_uring_probe *p;
6952 size_t size;
6953 int i, ret;
6954
6955 size = struct_size(p, ops, nr_args);
6956 if (size == SIZE_MAX)
6957 return -EOVERFLOW;
6958 p = kzalloc(size, GFP_KERNEL);
6959 if (!p)
6960 return -ENOMEM;
6961
6962 ret = -EFAULT;
6963 if (copy_from_user(p, arg, size))
6964 goto out;
6965 ret = -EINVAL;
6966 if (memchr_inv(p, 0, size))
6967 goto out;
6968
6969 p->last_op = IORING_OP_LAST - 1;
6970 if (nr_args > IORING_OP_LAST)
6971 nr_args = IORING_OP_LAST;
6972
6973 for (i = 0; i < nr_args; i++) {
6974 p->ops[i].op = i;
6975 if (!io_op_defs[i].not_supported)
6976 p->ops[i].flags = IO_URING_OP_SUPPORTED;
6977 }
6978 p->ops_len = i;
6979
6980 ret = 0;
6981 if (copy_to_user(arg, p, size))
6982 ret = -EFAULT;
6983out:
6984 kfree(p);
6985 return ret;
6986}
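/*
 * IORING_REGISTER_PROBE lets an application discover which opcodes the
 * running kernel supports. Hedged userspace sketch (the wrapper is an
 * assumption); the probe buffer must be zeroed, since io_probe() above
 * rejects non-zero input.
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int opcode_supported(int ring_fd, int opcode)
{
	size_t sz = sizeof(struct io_uring_probe) +
		    IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, sz);
	int supported = 0;

	if (!probe)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    probe, IORING_OP_LAST) == 0 &&
	    opcode <= probe->last_op &&
	    (probe->ops[opcode].flags & IO_URING_OP_SUPPORTED))
		supported = 1;

	free(probe);
	return supported;
}
#endif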
6987
071698e1
JA
6988static int io_register_personality(struct io_ring_ctx *ctx)
6989{
6990 const struct cred *creds = get_current_cred();
6991 int id;
6992
6993 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
6994 USHRT_MAX, GFP_KERNEL);
6995 if (id < 0)
6996 put_cred(creds);
6997 return id;
6998}
6999
7000static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7001{
7002 const struct cred *old_creds;
7003
7004 old_creds = idr_remove(&ctx->personality_idr, id);
7005 if (old_creds) {
7006 put_cred(old_creds);
7007 return 0;
7008 }
7009
7010 return -EINVAL;
7011}
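/*
 * Personalities let a ring submit requests under previously registered
 * credentials: IORING_REGISTER_PERSONALITY is issued while running with the
 * desired creds and returns an id, which an SQE can later reference through
 * its personality field. Hedged userspace sketch; the wrapper is an
 * assumption:
 */
#if 0	/* illustrative userspace example, not kernel code */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_current_creds(int ring_fd)
{
	/* returns the personality id (>= 1), or -1 with errno set */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}

static void sqe_set_personality(struct io_uring_sqe *sqe, int id)
{
	sqe->personality = id;	/* 0 keeps the ring's default credentials */
}
#endif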
7012
7013static bool io_register_op_must_quiesce(int op)
7014{
7015 switch (op) {
7016 case IORING_UNREGISTER_FILES:
7017 case IORING_REGISTER_FILES_UPDATE:
7018 case IORING_REGISTER_PROBE:
7019 case IORING_REGISTER_PERSONALITY:
7020 case IORING_UNREGISTER_PERSONALITY:
7021 return false;
7022 default:
7023 return true;
7024 }
7025}
7026
edafccee
JA
7027static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7028 void __user *arg, unsigned nr_args)
b19062a5
JA
7029 __releases(ctx->uring_lock)
7030 __acquires(ctx->uring_lock)
edafccee
JA
7031{
7032 int ret;
7033
35fa71a0
JA
7034 /*
 7035	 * We're inside the ring mutex; if the ref is already dying, then
7036 * someone else killed the ctx or is already going through
7037 * io_uring_register().
7038 */
7039 if (percpu_ref_is_dying(&ctx->refs))
7040 return -ENXIO;
7041
071698e1 7042 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7043 percpu_ref_kill(&ctx->refs);
b19062a5 7044
05f3fb3c
JA
7045 /*
7046 * Drop uring mutex before waiting for references to exit. If
7047 * another thread is currently inside io_uring_enter() it might
7048 * need to grab the uring_lock to make progress. If we hold it
7049 * here across the drain wait, then we can deadlock. It's safe
7050 * to drop the mutex here, since no new references will come in
7051 * after we've killed the percpu ref.
7052 */
7053 mutex_unlock(&ctx->uring_lock);
c150368b 7054 ret = wait_for_completion_interruptible(&ctx->completions[0]);
05f3fb3c 7055 mutex_lock(&ctx->uring_lock);
c150368b
JA
7056 if (ret) {
7057 percpu_ref_resurrect(&ctx->refs);
7058 ret = -EINTR;
7059 goto out;
7060 }
05f3fb3c 7061 }
edafccee
JA
7062
7063 switch (opcode) {
7064 case IORING_REGISTER_BUFFERS:
7065 ret = io_sqe_buffer_register(ctx, arg, nr_args);
7066 break;
7067 case IORING_UNREGISTER_BUFFERS:
7068 ret = -EINVAL;
7069 if (arg || nr_args)
7070 break;
7071 ret = io_sqe_buffer_unregister(ctx);
7072 break;
6b06314c
JA
7073 case IORING_REGISTER_FILES:
7074 ret = io_sqe_files_register(ctx, arg, nr_args);
7075 break;
7076 case IORING_UNREGISTER_FILES:
7077 ret = -EINVAL;
7078 if (arg || nr_args)
7079 break;
7080 ret = io_sqe_files_unregister(ctx);
7081 break;
c3a31e60
JA
7082 case IORING_REGISTER_FILES_UPDATE:
7083 ret = io_sqe_files_update(ctx, arg, nr_args);
7084 break;
9b402849 7085 case IORING_REGISTER_EVENTFD:
f2842ab5 7086 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
7087 ret = -EINVAL;
7088 if (nr_args != 1)
7089 break;
7090 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
7091 if (ret)
7092 break;
7093 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
7094 ctx->eventfd_async = 1;
7095 else
7096 ctx->eventfd_async = 0;
9b402849
JA
7097 break;
7098 case IORING_UNREGISTER_EVENTFD:
7099 ret = -EINVAL;
7100 if (arg || nr_args)
7101 break;
7102 ret = io_eventfd_unregister(ctx);
7103 break;
66f4af93
JA
7104 case IORING_REGISTER_PROBE:
7105 ret = -EINVAL;
7106 if (!arg || nr_args > 256)
7107 break;
7108 ret = io_probe(ctx, arg, nr_args);
7109 break;
071698e1
JA
7110 case IORING_REGISTER_PERSONALITY:
7111 ret = -EINVAL;
7112 if (arg || nr_args)
7113 break;
7114 ret = io_register_personality(ctx);
7115 break;
7116 case IORING_UNREGISTER_PERSONALITY:
7117 ret = -EINVAL;
7118 if (arg)
7119 break;
7120 ret = io_unregister_personality(ctx, nr_args);
7121 break;
edafccee
JA
7122 default:
7123 ret = -EINVAL;
7124 break;
7125 }
7126
071698e1 7127 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7128 /* bring the ctx back to life */
05f3fb3c 7129 percpu_ref_reinit(&ctx->refs);
c150368b
JA
7130out:
7131 reinit_completion(&ctx->completions[0]);
05f3fb3c 7132 }
edafccee
JA
7133 return ret;
7134}
7135
7136SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
7137 void __user *, arg, unsigned int, nr_args)
7138{
7139 struct io_ring_ctx *ctx;
7140 long ret = -EBADF;
7141 struct fd f;
7142
7143 f = fdget(fd);
7144 if (!f.file)
7145 return -EBADF;
7146
7147 ret = -EOPNOTSUPP;
7148 if (f.file->f_op != &io_uring_fops)
7149 goto out_fput;
7150
7151 ctx = f.file->private_data;
7152
7153 mutex_lock(&ctx->uring_lock);
7154 ret = __io_uring_register(ctx, opcode, arg, nr_args);
7155 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
7156 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
7157 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
7158out_fput:
7159 fdput(f);
7160 return ret;
7161}
7162
2b188cc1
JA
7163static int __init io_uring_init(void)
7164{
d7f62e82
SM
7165#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
7166 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
7167 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
7168} while (0)
7169
7170#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
7171 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
7172 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
7173 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
7174 BUILD_BUG_SQE_ELEM(1, __u8, flags);
7175 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
7176 BUILD_BUG_SQE_ELEM(4, __s32, fd);
7177 BUILD_BUG_SQE_ELEM(8, __u64, off);
7178 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
7179 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7180 BUILD_BUG_SQE_ELEM(24, __u32, len);
7181 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
7182 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
7183 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
7184 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
7185 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
7186 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
7187 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
7188 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
7189 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
7190 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
7191 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
7192 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
7193 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7194 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
7195 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
7196 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7197
d3656344 7198 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
2b188cc1
JA
7199 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
7200 return 0;
7201};
7202__initcall(io_uring_init);