]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - fs/io_uring.c
io_uring: optimise sqe-to-req flags translation
[mirror_ubuntu-jammy-kernel.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <linux/refcount.h>
48#include <linux/uio.h>
6b47ee6e 49#include <linux/bits.h>
2b188cc1
JA
50
51#include <linux/sched/signal.h>
52#include <linux/fs.h>
53#include <linux/file.h>
54#include <linux/fdtable.h>
55#include <linux/mm.h>
56#include <linux/mman.h>
57#include <linux/mmu_context.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
6c271ce2 60#include <linux/kthread.h>
2b188cc1 61#include <linux/blkdev.h>
edafccee 62#include <linux/bvec.h>
2b188cc1
JA
63#include <linux/net.h>
64#include <net/sock.h>
65#include <net/af_unix.h>
6b06314c 66#include <net/scm.h>
2b188cc1
JA
67#include <linux/anon_inodes.h>
68#include <linux/sched/mm.h>
69#include <linux/uaccess.h>
70#include <linux/nospec.h>
edafccee
JA
71#include <linux/sizes.h>
72#include <linux/hugetlb.h>
aa4c3967 73#include <linux/highmem.h>
15b71abe
JA
74#include <linux/namei.h>
75#include <linux/fsnotify.h>
4840e418 76#include <linux/fadvise.h>
2b188cc1 77
c826bd7a
DD
78#define CREATE_TRACE_POINTS
79#include <trace/events/io_uring.h>
80
2b188cc1
JA
81#include <uapi/linux/io_uring.h>
82
83#include "internal.h"
561fb04a 84#include "io-wq.h"
2b188cc1 85
5277deaa 86#define IORING_MAX_ENTRIES 32768
33a107f0 87#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
65e19f54
JA
88
89/*
90 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
91 */
92#define IORING_FILE_TABLE_SHIFT 9
93#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
94#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
95#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
2b188cc1
JA
96
97struct io_uring {
98 u32 head ____cacheline_aligned_in_smp;
99 u32 tail ____cacheline_aligned_in_smp;
100};
101
1e84b97b 102/*
75b28aff
HV
103 * This data is shared with the application through the mmap at offsets
104 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
105 *
106 * The offsets to the member fields are published through struct
107 * io_sqring_offsets when calling io_uring_setup.
108 */
75b28aff 109struct io_rings {
1e84b97b
SB
110 /*
111 * Head and tail offsets into the ring; the offsets need to be
112 * masked to get valid indices.
113 *
75b28aff
HV
114 * The kernel controls head of the sq ring and the tail of the cq ring,
115 * and the application controls tail of the sq ring and the head of the
116 * cq ring.
1e84b97b 117 */
75b28aff 118 struct io_uring sq, cq;
1e84b97b 119 /*
75b28aff 120 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
121 * ring_entries - 1)
122 */
75b28aff
HV
123 u32 sq_ring_mask, cq_ring_mask;
124 /* Ring sizes (constant, power of 2) */
125 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
126 /*
127 * Number of invalid entries dropped by the kernel due to
128 * invalid index stored in array
129 *
130 * Written by the kernel, shouldn't be modified by the
131 * application (i.e. get number of "new events" by comparing to
132 * cached value).
133 *
134 * After a new SQ head value was read by the application this
135 * counter includes all submissions that were dropped reaching
136 * the new SQ head (and possibly more).
137 */
75b28aff 138 u32 sq_dropped;
1e84b97b
SB
139 /*
140 * Runtime flags
141 *
142 * Written by the kernel, shouldn't be modified by the
143 * application.
144 *
145 * The application needs a full memory barrier before checking
146 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
147 */
75b28aff 148 u32 sq_flags;
1e84b97b
SB
149 /*
150 * Number of completion events lost because the queue was full;
151 * this should be avoided by the application by making sure
0b4295b5 152 * there are not more requests pending than there is space in
1e84b97b
SB
153 * the completion queue.
154 *
155 * Written by the kernel, shouldn't be modified by the
156 * application (i.e. get number of "new events" by comparing to
157 * cached value).
158 *
159 * As completion events come in out of order this counter is not
160 * ordered with any other data.
161 */
75b28aff 162 u32 cq_overflow;
1e84b97b
SB
163 /*
164 * Ring buffer of completion events.
165 *
166 * The kernel writes completion events fresh every time they are
167 * produced, so the application is allowed to modify pending
168 * entries.
169 */
75b28aff 170 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
171};
172
edafccee
JA
173struct io_mapped_ubuf {
174 u64 ubuf;
175 size_t len;
176 struct bio_vec *bvec;
177 unsigned int nr_bvecs;
178};
179
65e19f54
JA
180struct fixed_file_table {
181 struct file **files;
31b51510
JA
182};
183
05f3fb3c
JA
184enum {
185 FFD_F_ATOMIC,
186};
187
188struct fixed_file_data {
189 struct fixed_file_table *table;
190 struct io_ring_ctx *ctx;
191
192 struct percpu_ref refs;
193 struct llist_head put_llist;
194 unsigned long state;
195 struct work_struct ref_work;
196 struct completion done;
197};
198
2b188cc1
JA
199struct io_ring_ctx {
200 struct {
201 struct percpu_ref refs;
202 } ____cacheline_aligned_in_smp;
203
204 struct {
205 unsigned int flags;
69b3e546
JA
206 int compat: 1;
207 int account_mem: 1;
208 int cq_overflow_flushed: 1;
209 int drain_next: 1;
f2842ab5 210 int eventfd_async: 1;
2b188cc1 211
75b28aff
HV
212 /*
213 * Ring buffer of indices into array of io_uring_sqe, which is
214 * mmapped by the application using the IORING_OFF_SQES offset.
215 *
216 * This indirection could e.g. be used to assign fixed
217 * io_uring_sqe entries to operations and only submit them to
218 * the queue when needed.
219 *
220 * The kernel modifies neither the indices array nor the entries
221 * array.
222 */
223 u32 *sq_array;
2b188cc1
JA
224 unsigned cached_sq_head;
225 unsigned sq_entries;
226 unsigned sq_mask;
6c271ce2 227 unsigned sq_thread_idle;
498ccd9e 228 unsigned cached_sq_dropped;
206aefde 229 atomic_t cached_cq_overflow;
ad3eb2c8 230 unsigned long sq_check_overflow;
de0617e4
JA
231
232 struct list_head defer_list;
5262f567 233 struct list_head timeout_list;
1d7bb1d5 234 struct list_head cq_overflow_list;
fcb323cc
JA
235
236 wait_queue_head_t inflight_wait;
ad3eb2c8 237 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
238 } ____cacheline_aligned_in_smp;
239
206aefde
JA
240 struct io_rings *rings;
241
2b188cc1 242 /* IO offload */
561fb04a 243 struct io_wq *io_wq;
6c271ce2 244 struct task_struct *sqo_thread; /* if using sq thread polling */
2b188cc1 245 struct mm_struct *sqo_mm;
6c271ce2 246 wait_queue_head_t sqo_wait;
75b28aff 247
6b06314c
JA
248 /*
249 * If used, fixed file set. Writers must ensure that ->refs is dead,
250 * readers must ensure that ->refs is alive as long as the file* is
251 * used. Only updated through io_uring_register(2).
252 */
05f3fb3c 253 struct fixed_file_data *file_data;
6b06314c 254 unsigned nr_user_files;
b14cca0c
PB
255 int ring_fd;
256 struct file *ring_file;
6b06314c 257
edafccee
JA
258 /* if used, fixed mapped user buffers */
259 unsigned nr_user_bufs;
260 struct io_mapped_ubuf *user_bufs;
261
2b188cc1
JA
262 struct user_struct *user;
263
0b8c0ec7 264 const struct cred *creds;
181e448d 265
206aefde
JA
266 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
267 struct completion *completions;
268
0ddf92e8
JA
269 /* if all else fails... */
270 struct io_kiocb *fallback_req;
271
206aefde
JA
272#if defined(CONFIG_UNIX)
273 struct socket *ring_sock;
274#endif
275
276 struct {
277 unsigned cached_cq_tail;
278 unsigned cq_entries;
279 unsigned cq_mask;
280 atomic_t cq_timeouts;
ad3eb2c8 281 unsigned long cq_check_overflow;
206aefde
JA
282 struct wait_queue_head cq_wait;
283 struct fasync_struct *cq_fasync;
284 struct eventfd_ctx *cq_ev_fd;
285 } ____cacheline_aligned_in_smp;
2b188cc1
JA
286
287 struct {
288 struct mutex uring_lock;
289 wait_queue_head_t wait;
290 } ____cacheline_aligned_in_smp;
291
292 struct {
293 spinlock_t completion_lock;
e94f141b
JA
294 struct llist_head poll_llist;
295
def596e9
JA
296 /*
297 * ->poll_list is protected by the ctx->uring_lock for
298 * io_uring instances that don't use IORING_SETUP_SQPOLL.
299 * For SQPOLL, only the single threaded io_sq_thread() will
300 * manipulate the list, hence no extra locking is needed there.
301 */
302 struct list_head poll_list;
78076bb6
JA
303 struct hlist_head *cancel_hash;
304 unsigned cancel_hash_bits;
e94f141b 305 bool poll_multi_file;
31b51510 306
fcb323cc
JA
307 spinlock_t inflight_lock;
308 struct list_head inflight_list;
2b188cc1 309 } ____cacheline_aligned_in_smp;
2b188cc1
JA
310};
311
09bb8394
JA
312/*
313 * First field must be the file pointer in all the
314 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
315 */
221c5eb2
JA
316struct io_poll_iocb {
317 struct file *file;
0969e783
JA
318 union {
319 struct wait_queue_head *head;
320 u64 addr;
321 };
221c5eb2 322 __poll_t events;
8c838788 323 bool done;
221c5eb2 324 bool canceled;
392edb45 325 struct wait_queue_entry wait;
221c5eb2
JA
326};
327
b5dba59e
JA
328struct io_close {
329 struct file *file;
330 struct file *put_file;
331 int fd;
332};
333
ad8a48ac
JA
334struct io_timeout_data {
335 struct io_kiocb *req;
336 struct hrtimer timer;
337 struct timespec64 ts;
338 enum hrtimer_mode mode;
cc42e0ac 339 u32 seq_offset;
ad8a48ac
JA
340};
341
8ed8d3c3
JA
342struct io_accept {
343 struct file *file;
344 struct sockaddr __user *addr;
345 int __user *addr_len;
346 int flags;
347};
348
349struct io_sync {
350 struct file *file;
351 loff_t len;
352 loff_t off;
353 int flags;
d63d1b5e 354 int mode;
8ed8d3c3
JA
355};
356
fbf23849
JA
357struct io_cancel {
358 struct file *file;
359 u64 addr;
360};
361
b29472ee
JA
362struct io_timeout {
363 struct file *file;
364 u64 addr;
365 int flags;
26a61679 366 unsigned count;
b29472ee
JA
367};
368
9adbd45d
JA
369struct io_rw {
370 /* NOTE: kiocb has the file as the first member, so don't do it here */
371 struct kiocb kiocb;
372 u64 addr;
373 u64 len;
374};
375
3fbb51c1
JA
376struct io_connect {
377 struct file *file;
378 struct sockaddr __user *addr;
379 int addr_len;
380};
381
e47293fd
JA
382struct io_sr_msg {
383 struct file *file;
fddaface
JA
384 union {
385 struct user_msghdr __user *msg;
386 void __user *buf;
387 };
e47293fd 388 int msg_flags;
fddaface 389 size_t len;
e47293fd
JA
390};
391
15b71abe
JA
392struct io_open {
393 struct file *file;
394 int dfd;
eddc7ef5 395 union {
eddc7ef5
JA
396 unsigned mask;
397 };
15b71abe 398 struct filename *filename;
eddc7ef5 399 struct statx __user *buffer;
c12cedf2 400 struct open_how how;
15b71abe
JA
401};
402
05f3fb3c
JA
403struct io_files_update {
404 struct file *file;
405 u64 arg;
406 u32 nr_args;
407 u32 offset;
408};
409
4840e418
JA
410struct io_fadvise {
411 struct file *file;
412 u64 offset;
413 u32 len;
414 u32 advice;
415};
416
c1ca757b
JA
417struct io_madvise {
418 struct file *file;
419 u64 addr;
420 u32 len;
421 u32 advice;
422};
423
f499a021
JA
424struct io_async_connect {
425 struct sockaddr_storage address;
426};
427
03b1230c
JA
428struct io_async_msghdr {
429 struct iovec fast_iov[UIO_FASTIOV];
430 struct iovec *iov;
431 struct sockaddr __user *uaddr;
432 struct msghdr msg;
433};
434
f67676d1
JA
435struct io_async_rw {
436 struct iovec fast_iov[UIO_FASTIOV];
437 struct iovec *iov;
438 ssize_t nr_segs;
439 ssize_t size;
440};
441
15b71abe
JA
442struct io_async_open {
443 struct filename *filename;
444};
445
1a6b74fc 446struct io_async_ctx {
f67676d1
JA
447 union {
448 struct io_async_rw rw;
03b1230c 449 struct io_async_msghdr msg;
f499a021 450 struct io_async_connect connect;
2d28390a 451 struct io_timeout_data timeout;
15b71abe 452 struct io_async_open open;
f67676d1 453 };
1a6b74fc
JA
454};
455
6b47ee6e
PB
456enum {
457 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
458 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
459 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
460 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
461 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
462
463 REQ_F_LINK_NEXT_BIT,
464 REQ_F_FAIL_LINK_BIT,
465 REQ_F_INFLIGHT_BIT,
466 REQ_F_CUR_POS_BIT,
467 REQ_F_NOWAIT_BIT,
468 REQ_F_IOPOLL_COMPLETED_BIT,
469 REQ_F_LINK_TIMEOUT_BIT,
470 REQ_F_TIMEOUT_BIT,
471 REQ_F_ISREG_BIT,
472 REQ_F_MUST_PUNT_BIT,
473 REQ_F_TIMEOUT_NOSEQ_BIT,
474 REQ_F_COMP_LOCKED_BIT,
475};
476
477enum {
478 /* ctx owns file */
479 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
480 /* drain existing IO first */
481 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
482 /* linked sqes */
483 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
484 /* doesn't sever on completion < 0 */
485 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
486 /* IOSQE_ASYNC */
487 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
488
489 /* already grabbed next link */
490 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
491 /* fail rest of links */
492 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
493 /* on inflight list */
494 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
495 /* read/write uses file position */
496 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
497 /* must not punt to workers */
498 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
499 /* polled IO has completed */
500 REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
501 /* has linked timeout */
502 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
503 /* timeout request */
504 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
505 /* regular file */
506 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
507 /* must be punted even for NONBLOCK */
508 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
509 /* no timeout sequence */
510 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
511 /* completion under lock */
512 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
513};
514
09bb8394
JA
515/*
516 * NOTE! Each of the iocb union members has the file pointer
517 * as the first entry in their struct definition. So you can
518 * access the file pointer through any of the sub-structs,
519 * or directly as just 'ki_filp' in this struct.
520 */
2b188cc1 521struct io_kiocb {
221c5eb2 522 union {
09bb8394 523 struct file *file;
9adbd45d 524 struct io_rw rw;
221c5eb2 525 struct io_poll_iocb poll;
8ed8d3c3
JA
526 struct io_accept accept;
527 struct io_sync sync;
fbf23849 528 struct io_cancel cancel;
b29472ee 529 struct io_timeout timeout;
3fbb51c1 530 struct io_connect connect;
e47293fd 531 struct io_sr_msg sr_msg;
15b71abe 532 struct io_open open;
b5dba59e 533 struct io_close close;
05f3fb3c 534 struct io_files_update files_update;
4840e418 535 struct io_fadvise fadvise;
c1ca757b 536 struct io_madvise madvise;
221c5eb2 537 };
2b188cc1 538
1a6b74fc 539 struct io_async_ctx *io;
b14cca0c
PB
540 /*
541 * llist_node is only used for poll deferred completions
542 */
543 struct llist_node llist_node;
cf6fd4bd
PB
544 bool has_user;
545 bool in_async;
546 bool needs_fixed_file;
d625c6ee 547 u8 opcode;
2b188cc1
JA
548
549 struct io_ring_ctx *ctx;
eac406c6
JA
550 union {
551 struct list_head list;
78076bb6 552 struct hlist_node hash_node;
eac406c6 553 };
9e645e11 554 struct list_head link_list;
2b188cc1 555 unsigned int flags;
c16361c1 556 refcount_t refs;
2b188cc1 557 u64 user_data;
9e645e11 558 u32 result;
de0617e4 559 u32 sequence;
2b188cc1 560
fcb323cc
JA
561 struct list_head inflight_entry;
562
561fb04a 563 struct io_wq_work work;
2b188cc1
JA
564};
565
566#define IO_PLUG_THRESHOLD 2
def596e9 567#define IO_IOPOLL_BATCH 8
2b188cc1 568
9a56a232
JA
569struct io_submit_state {
570 struct blk_plug plug;
571
2579f913
JA
572 /*
573 * io_kiocb alloc cache
574 */
575 void *reqs[IO_IOPOLL_BATCH];
576 unsigned int free_reqs;
577 unsigned int cur_req;
578
9a56a232
JA
579 /*
580 * File reference cache
581 */
582 struct file *file;
583 unsigned int fd;
584 unsigned int has_refs;
585 unsigned int used_refs;
586 unsigned int ios_left;
587};
588
d3656344
JA
589struct io_op_def {
590 /* needs req->io allocated for deferral/async */
591 unsigned async_ctx : 1;
592 /* needs current->mm setup, does mm access */
593 unsigned needs_mm : 1;
594 /* needs req->file assigned */
595 unsigned needs_file : 1;
596 /* needs req->file assigned IFF fd is >= 0 */
597 unsigned fd_non_neg : 1;
598 /* hash wq insertion if file is a regular file */
599 unsigned hash_reg_file : 1;
600 /* unbound wq insertion if file is a non-regular file */
601 unsigned unbound_nonreg_file : 1;
66f4af93
JA
602 /* opcode is not supported by this kernel */
603 unsigned not_supported : 1;
d3656344
JA
604};
605
606static const struct io_op_def io_op_defs[] = {
607 {
608 /* IORING_OP_NOP */
609 },
610 {
611 /* IORING_OP_READV */
612 .async_ctx = 1,
613 .needs_mm = 1,
614 .needs_file = 1,
615 .unbound_nonreg_file = 1,
616 },
617 {
618 /* IORING_OP_WRITEV */
619 .async_ctx = 1,
620 .needs_mm = 1,
621 .needs_file = 1,
622 .hash_reg_file = 1,
623 .unbound_nonreg_file = 1,
624 },
625 {
626 /* IORING_OP_FSYNC */
627 .needs_file = 1,
628 },
629 {
630 /* IORING_OP_READ_FIXED */
631 .needs_file = 1,
632 .unbound_nonreg_file = 1,
633 },
634 {
635 /* IORING_OP_WRITE_FIXED */
636 .needs_file = 1,
637 .hash_reg_file = 1,
638 .unbound_nonreg_file = 1,
639 },
640 {
641 /* IORING_OP_POLL_ADD */
642 .needs_file = 1,
643 .unbound_nonreg_file = 1,
644 },
645 {
646 /* IORING_OP_POLL_REMOVE */
647 },
648 {
649 /* IORING_OP_SYNC_FILE_RANGE */
650 .needs_file = 1,
651 },
652 {
653 /* IORING_OP_SENDMSG */
654 .async_ctx = 1,
655 .needs_mm = 1,
656 .needs_file = 1,
657 .unbound_nonreg_file = 1,
658 },
659 {
660 /* IORING_OP_RECVMSG */
661 .async_ctx = 1,
662 .needs_mm = 1,
663 .needs_file = 1,
664 .unbound_nonreg_file = 1,
665 },
666 {
667 /* IORING_OP_TIMEOUT */
668 .async_ctx = 1,
669 .needs_mm = 1,
670 },
671 {
672 /* IORING_OP_TIMEOUT_REMOVE */
673 },
674 {
675 /* IORING_OP_ACCEPT */
676 .needs_mm = 1,
677 .needs_file = 1,
678 .unbound_nonreg_file = 1,
679 },
680 {
681 /* IORING_OP_ASYNC_CANCEL */
682 },
683 {
684 /* IORING_OP_LINK_TIMEOUT */
685 .async_ctx = 1,
686 .needs_mm = 1,
687 },
688 {
689 /* IORING_OP_CONNECT */
690 .async_ctx = 1,
691 .needs_mm = 1,
692 .needs_file = 1,
693 .unbound_nonreg_file = 1,
694 },
695 {
696 /* IORING_OP_FALLOCATE */
697 .needs_file = 1,
698 },
699 {
700 /* IORING_OP_OPENAT */
701 .needs_file = 1,
702 .fd_non_neg = 1,
703 },
704 {
705 /* IORING_OP_CLOSE */
706 .needs_file = 1,
707 },
708 {
709 /* IORING_OP_FILES_UPDATE */
710 .needs_mm = 1,
711 },
712 {
713 /* IORING_OP_STATX */
714 .needs_mm = 1,
715 .needs_file = 1,
716 .fd_non_neg = 1,
717 },
3a6820f2
JA
718 {
719 /* IORING_OP_READ */
720 .needs_mm = 1,
721 .needs_file = 1,
722 .unbound_nonreg_file = 1,
723 },
724 {
725 /* IORING_OP_WRITE */
726 .needs_mm = 1,
727 .needs_file = 1,
728 .unbound_nonreg_file = 1,
729 },
4840e418
JA
730 {
731 /* IORING_OP_FADVISE */
732 .needs_file = 1,
733 },
c1ca757b
JA
734 {
735 /* IORING_OP_MADVISE */
736 .needs_mm = 1,
737 },
fddaface
JA
738 {
739 /* IORING_OP_SEND */
740 .needs_mm = 1,
741 .needs_file = 1,
742 .unbound_nonreg_file = 1,
743 },
744 {
745 /* IORING_OP_RECV */
746 .needs_mm = 1,
747 .needs_file = 1,
748 .unbound_nonreg_file = 1,
749 },
cebdb986
JA
750 {
751 /* IORING_OP_OPENAT2 */
752 .needs_file = 1,
753 .fd_non_neg = 1,
754 },
d3656344
JA
755};
756
561fb04a 757static void io_wq_submit_work(struct io_wq_work **workptr);
78e19bbe 758static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 759static void io_put_req(struct io_kiocb *req);
978db57e 760static void __io_double_put_req(struct io_kiocb *req);
94ae5e77
JA
761static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
762static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c
JA
763static int __io_sqe_files_update(struct io_ring_ctx *ctx,
764 struct io_uring_files_update *ip,
765 unsigned nr_args);
de0617e4 766
2b188cc1
JA
767static struct kmem_cache *req_cachep;
768
769static const struct file_operations io_uring_fops;
770
771struct sock *io_uring_get_socket(struct file *file)
772{
773#if defined(CONFIG_UNIX)
774 if (file->f_op == &io_uring_fops) {
775 struct io_ring_ctx *ctx = file->private_data;
776
777 return ctx->ring_sock->sk;
778 }
779#endif
780 return NULL;
781}
782EXPORT_SYMBOL(io_uring_get_socket);
783
784static void io_ring_ctx_ref_free(struct percpu_ref *ref)
785{
786 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
787
206aefde 788 complete(&ctx->completions[0]);
2b188cc1
JA
789}
790
791static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
792{
793 struct io_ring_ctx *ctx;
78076bb6 794 int hash_bits;
2b188cc1
JA
795
796 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
797 if (!ctx)
798 return NULL;
799
0ddf92e8
JA
800 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
801 if (!ctx->fallback_req)
802 goto err;
803
206aefde
JA
804 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
805 if (!ctx->completions)
806 goto err;
807
78076bb6
JA
808 /*
809 * Use 5 bits less than the max cq entries, that should give us around
810 * 32 entries per hash list if totally full and uniformly spread.
811 */
812 hash_bits = ilog2(p->cq_entries);
813 hash_bits -= 5;
814 if (hash_bits <= 0)
815 hash_bits = 1;
816 ctx->cancel_hash_bits = hash_bits;
817 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
818 GFP_KERNEL);
819 if (!ctx->cancel_hash)
820 goto err;
821 __hash_init(ctx->cancel_hash, 1U << hash_bits);
822
21482896 823 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
824 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
825 goto err;
2b188cc1
JA
826
827 ctx->flags = p->flags;
828 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 829 INIT_LIST_HEAD(&ctx->cq_overflow_list);
206aefde
JA
830 init_completion(&ctx->completions[0]);
831 init_completion(&ctx->completions[1]);
2b188cc1
JA
832 mutex_init(&ctx->uring_lock);
833 init_waitqueue_head(&ctx->wait);
834 spin_lock_init(&ctx->completion_lock);
e94f141b 835 init_llist_head(&ctx->poll_llist);
def596e9 836 INIT_LIST_HEAD(&ctx->poll_list);
de0617e4 837 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 838 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
839 init_waitqueue_head(&ctx->inflight_wait);
840 spin_lock_init(&ctx->inflight_lock);
841 INIT_LIST_HEAD(&ctx->inflight_list);
2b188cc1 842 return ctx;
206aefde 843err:
0ddf92e8
JA
844 if (ctx->fallback_req)
845 kmem_cache_free(req_cachep, ctx->fallback_req);
206aefde 846 kfree(ctx->completions);
78076bb6 847 kfree(ctx->cancel_hash);
206aefde
JA
848 kfree(ctx);
849 return NULL;
2b188cc1
JA
850}
851
9d858b21 852static inline bool __req_need_defer(struct io_kiocb *req)
7adf4eaf 853{
a197f664
JL
854 struct io_ring_ctx *ctx = req->ctx;
855
498ccd9e
JA
856 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
857 + atomic_read(&ctx->cached_cq_overflow);
7adf4eaf
JA
858}
859
9d858b21 860static inline bool req_need_defer(struct io_kiocb *req)
de0617e4 861{
87987898 862 if (unlikely(req->flags & REQ_F_IO_DRAIN))
9d858b21 863 return __req_need_defer(req);
de0617e4 864
9d858b21 865 return false;
de0617e4
JA
866}
867
7adf4eaf 868static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
de0617e4
JA
869{
870 struct io_kiocb *req;
871
7adf4eaf 872 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
9d858b21 873 if (req && !req_need_defer(req)) {
de0617e4
JA
874 list_del_init(&req->list);
875 return req;
876 }
877
878 return NULL;
879}
880
5262f567
JA
881static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
882{
7adf4eaf
JA
883 struct io_kiocb *req;
884
885 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
93bd25bb
JA
886 if (req) {
887 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
888 return NULL;
fb4b3d3f 889 if (!__req_need_defer(req)) {
93bd25bb
JA
890 list_del_init(&req->list);
891 return req;
892 }
7adf4eaf
JA
893 }
894
895 return NULL;
5262f567
JA
896}
897
de0617e4 898static void __io_commit_cqring(struct io_ring_ctx *ctx)
2b188cc1 899{
75b28aff 900 struct io_rings *rings = ctx->rings;
2b188cc1 901
07910158
PB
902 /* order cqe stores with ring update */
903 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
2b188cc1 904
07910158
PB
905 if (wq_has_sleeper(&ctx->cq_wait)) {
906 wake_up_interruptible(&ctx->cq_wait);
907 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
2b188cc1
JA
908 }
909}
910
94ae5e77
JA
911static inline bool io_prep_async_work(struct io_kiocb *req,
912 struct io_kiocb **link)
18d9be1a 913{
d3656344 914 const struct io_op_def *def = &io_op_defs[req->opcode];
561fb04a 915 bool do_hashed = false;
54a91f3b 916
d3656344
JA
917 if (req->flags & REQ_F_ISREG) {
918 if (def->hash_reg_file)
3529d8c2 919 do_hashed = true;
d3656344
JA
920 } else {
921 if (def->unbound_nonreg_file)
3529d8c2 922 req->work.flags |= IO_WQ_WORK_UNBOUND;
54a91f3b 923 }
d3656344 924 if (def->needs_mm)
3529d8c2 925 req->work.flags |= IO_WQ_WORK_NEEDS_USER;
54a91f3b 926
94ae5e77 927 *link = io_prep_linked_timeout(req);
561fb04a
JA
928 return do_hashed;
929}
930
a197f664 931static inline void io_queue_async_work(struct io_kiocb *req)
561fb04a 932{
a197f664 933 struct io_ring_ctx *ctx = req->ctx;
94ae5e77
JA
934 struct io_kiocb *link;
935 bool do_hashed;
936
937 do_hashed = io_prep_async_work(req, &link);
561fb04a
JA
938
939 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
940 req->flags);
941 if (!do_hashed) {
942 io_wq_enqueue(ctx->io_wq, &req->work);
943 } else {
944 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
945 file_inode(req->file));
946 }
94ae5e77
JA
947
948 if (link)
949 io_queue_linked_timeout(link);
18d9be1a
JA
950}
951
5262f567
JA
952static void io_kill_timeout(struct io_kiocb *req)
953{
954 int ret;
955
2d28390a 956 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262f567
JA
957 if (ret != -1) {
958 atomic_inc(&req->ctx->cq_timeouts);
842f9612 959 list_del_init(&req->list);
78e19bbe 960 io_cqring_fill_event(req, 0);
ec9c02ad 961 io_put_req(req);
5262f567
JA
962 }
963}
964
965static void io_kill_timeouts(struct io_ring_ctx *ctx)
966{
967 struct io_kiocb *req, *tmp;
968
969 spin_lock_irq(&ctx->completion_lock);
970 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
971 io_kill_timeout(req);
972 spin_unlock_irq(&ctx->completion_lock);
973}
974
de0617e4
JA
975static void io_commit_cqring(struct io_ring_ctx *ctx)
976{
977 struct io_kiocb *req;
978
5262f567
JA
979 while ((req = io_get_timeout_req(ctx)) != NULL)
980 io_kill_timeout(req);
981
de0617e4
JA
982 __io_commit_cqring(ctx);
983
87987898 984 while ((req = io_get_deferred_req(ctx)) != NULL)
a197f664 985 io_queue_async_work(req);
de0617e4
JA
986}
987
2b188cc1
JA
988static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
989{
75b28aff 990 struct io_rings *rings = ctx->rings;
2b188cc1
JA
991 unsigned tail;
992
993 tail = ctx->cached_cq_tail;
115e12e5
SB
994 /*
995 * writes to the cq entry need to come after reading head; the
996 * control dependency is enough as we're using WRITE_ONCE to
997 * fill the cq entry
998 */
75b28aff 999 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
2b188cc1
JA
1000 return NULL;
1001
1002 ctx->cached_cq_tail++;
75b28aff 1003 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1004}
1005
f2842ab5
JA
1006static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1007{
1008 if (!ctx->eventfd_async)
1009 return true;
1010 return io_wq_current_is_worker() || in_interrupt();
1011}
1012
1d7bb1d5
JA
1013static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1014{
1015 if (waitqueue_active(&ctx->wait))
1016 wake_up(&ctx->wait);
1017 if (waitqueue_active(&ctx->sqo_wait))
1018 wake_up(&ctx->sqo_wait);
f2842ab5 1019 if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
1d7bb1d5
JA
1020 eventfd_signal(ctx->cq_ev_fd, 1);
1021}
1022
c4a2ed72
JA
1023/* Returns true if there are no backlogged entries after the flush */
1024static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5
JA
1025{
1026 struct io_rings *rings = ctx->rings;
1027 struct io_uring_cqe *cqe;
1028 struct io_kiocb *req;
1029 unsigned long flags;
1030 LIST_HEAD(list);
1031
1032 if (!force) {
1033 if (list_empty_careful(&ctx->cq_overflow_list))
c4a2ed72 1034 return true;
1d7bb1d5
JA
1035 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1036 rings->cq_ring_entries))
c4a2ed72 1037 return false;
1d7bb1d5
JA
1038 }
1039
1040 spin_lock_irqsave(&ctx->completion_lock, flags);
1041
1042 /* if force is set, the ring is going away. always drop after that */
1043 if (force)
69b3e546 1044 ctx->cq_overflow_flushed = 1;
1d7bb1d5 1045
c4a2ed72 1046 cqe = NULL;
1d7bb1d5
JA
1047 while (!list_empty(&ctx->cq_overflow_list)) {
1048 cqe = io_get_cqring(ctx);
1049 if (!cqe && !force)
1050 break;
1051
1052 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1053 list);
1054 list_move(&req->list, &list);
1055 if (cqe) {
1056 WRITE_ONCE(cqe->user_data, req->user_data);
1057 WRITE_ONCE(cqe->res, req->result);
1058 WRITE_ONCE(cqe->flags, 0);
1059 } else {
1060 WRITE_ONCE(ctx->rings->cq_overflow,
1061 atomic_inc_return(&ctx->cached_cq_overflow));
1062 }
1063 }
1064
1065 io_commit_cqring(ctx);
ad3eb2c8
JA
1066 if (cqe) {
1067 clear_bit(0, &ctx->sq_check_overflow);
1068 clear_bit(0, &ctx->cq_check_overflow);
1069 }
1d7bb1d5
JA
1070 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1071 io_cqring_ev_posted(ctx);
1072
1073 while (!list_empty(&list)) {
1074 req = list_first_entry(&list, struct io_kiocb, list);
1075 list_del(&req->list);
ec9c02ad 1076 io_put_req(req);
1d7bb1d5 1077 }
c4a2ed72
JA
1078
1079 return cqe != NULL;
1d7bb1d5
JA
1080}
1081
78e19bbe 1082static void io_cqring_fill_event(struct io_kiocb *req, long res)
2b188cc1 1083{
78e19bbe 1084 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1085 struct io_uring_cqe *cqe;
1086
78e19bbe 1087 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1088
2b188cc1
JA
1089 /*
1090 * If we can't get a cq entry, userspace overflowed the
1091 * submission (by quite a lot). Increment the overflow count in
1092 * the ring.
1093 */
1094 cqe = io_get_cqring(ctx);
1d7bb1d5 1095 if (likely(cqe)) {
78e19bbe 1096 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1097 WRITE_ONCE(cqe->res, res);
c71ffb67 1098 WRITE_ONCE(cqe->flags, 0);
1d7bb1d5 1099 } else if (ctx->cq_overflow_flushed) {
498ccd9e
JA
1100 WRITE_ONCE(ctx->rings->cq_overflow,
1101 atomic_inc_return(&ctx->cached_cq_overflow));
1d7bb1d5 1102 } else {
ad3eb2c8
JA
1103 if (list_empty(&ctx->cq_overflow_list)) {
1104 set_bit(0, &ctx->sq_check_overflow);
1105 set_bit(0, &ctx->cq_check_overflow);
1106 }
1d7bb1d5
JA
1107 refcount_inc(&req->refs);
1108 req->result = res;
1109 list_add_tail(&req->list, &ctx->cq_overflow_list);
2b188cc1
JA
1110 }
1111}
1112
78e19bbe 1113static void io_cqring_add_event(struct io_kiocb *req, long res)
2b188cc1 1114{
78e19bbe 1115 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1116 unsigned long flags;
1117
1118 spin_lock_irqsave(&ctx->completion_lock, flags);
78e19bbe 1119 io_cqring_fill_event(req, res);
2b188cc1
JA
1120 io_commit_cqring(ctx);
1121 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1122
8c838788 1123 io_cqring_ev_posted(ctx);
2b188cc1
JA
1124}
1125
0ddf92e8
JA
1126static inline bool io_is_fallback_req(struct io_kiocb *req)
1127{
1128 return req == (struct io_kiocb *)
1129 ((unsigned long) req->ctx->fallback_req & ~1UL);
1130}
1131
1132static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1133{
1134 struct io_kiocb *req;
1135
1136 req = ctx->fallback_req;
1137 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1138 return req;
1139
1140 return NULL;
1141}
1142
2579f913
JA
1143static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1144 struct io_submit_state *state)
2b188cc1 1145{
fd6fab2c 1146 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2b188cc1
JA
1147 struct io_kiocb *req;
1148
2579f913 1149 if (!state) {
fd6fab2c 1150 req = kmem_cache_alloc(req_cachep, gfp);
2579f913 1151 if (unlikely(!req))
0ddf92e8 1152 goto fallback;
2579f913
JA
1153 } else if (!state->free_reqs) {
1154 size_t sz;
1155 int ret;
1156
1157 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
fd6fab2c
JA
1158 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1159
1160 /*
1161 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1162 * retry single alloc to be on the safe side.
1163 */
1164 if (unlikely(ret <= 0)) {
1165 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1166 if (!state->reqs[0])
0ddf92e8 1167 goto fallback;
fd6fab2c
JA
1168 ret = 1;
1169 }
2579f913
JA
1170 state->free_reqs = ret - 1;
1171 state->cur_req = 1;
1172 req = state->reqs[0];
1173 } else {
1174 req = state->reqs[state->cur_req];
1175 state->free_reqs--;
1176 state->cur_req++;
2b188cc1
JA
1177 }
1178
0ddf92e8 1179got_it:
1a6b74fc 1180 req->io = NULL;
60c112b0 1181 req->file = NULL;
2579f913
JA
1182 req->ctx = ctx;
1183 req->flags = 0;
e65ef56d
JA
1184 /* one is dropped after submission, the other at completion */
1185 refcount_set(&req->refs, 2);
9e645e11 1186 req->result = 0;
561fb04a 1187 INIT_IO_WORK(&req->work, io_wq_submit_work);
2579f913 1188 return req;
0ddf92e8
JA
1189fallback:
1190 req = io_get_fallback_req(ctx);
1191 if (req)
1192 goto got_it;
6805b32e 1193 percpu_ref_put(&ctx->refs);
2b188cc1
JA
1194 return NULL;
1195}
1196
2b85edfc
PB
1197static void __io_req_do_free(struct io_kiocb *req)
1198{
1199 if (likely(!io_is_fallback_req(req)))
1200 kmem_cache_free(req_cachep, req);
1201 else
1202 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1203}
1204
c6ca97b3 1205static void __io_req_aux_free(struct io_kiocb *req)
2b188cc1 1206{
fcb323cc
JA
1207 struct io_ring_ctx *ctx = req->ctx;
1208
96fd84d8 1209 kfree(req->io);
05f3fb3c
JA
1210 if (req->file) {
1211 if (req->flags & REQ_F_FIXED_FILE)
1212 percpu_ref_put(&ctx->file_data->refs);
1213 else
1214 fput(req->file);
1215 }
c6ca97b3
JA
1216}
1217
1218static void __io_free_req(struct io_kiocb *req)
1219{
1220 __io_req_aux_free(req);
1221
fcb323cc 1222 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3 1223 struct io_ring_ctx *ctx = req->ctx;
fcb323cc
JA
1224 unsigned long flags;
1225
1226 spin_lock_irqsave(&ctx->inflight_lock, flags);
1227 list_del(&req->inflight_entry);
1228 if (waitqueue_active(&ctx->inflight_wait))
1229 wake_up(&ctx->inflight_wait);
1230 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1231 }
2b85edfc
PB
1232
1233 percpu_ref_put(&req->ctx->refs);
1234 __io_req_do_free(req);
e65ef56d
JA
1235}
1236
c6ca97b3
JA
1237struct req_batch {
1238 void *reqs[IO_IOPOLL_BATCH];
1239 int to_free;
1240 int need_iter;
1241};
1242
1243static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1244{
10fef4be
JA
1245 int fixed_refs = rb->to_free;
1246
c6ca97b3
JA
1247 if (!rb->to_free)
1248 return;
1249 if (rb->need_iter) {
1250 int i, inflight = 0;
1251 unsigned long flags;
1252
10fef4be 1253 fixed_refs = 0;
c6ca97b3
JA
1254 for (i = 0; i < rb->to_free; i++) {
1255 struct io_kiocb *req = rb->reqs[i];
1256
10fef4be 1257 if (req->flags & REQ_F_FIXED_FILE) {
c6ca97b3 1258 req->file = NULL;
10fef4be
JA
1259 fixed_refs++;
1260 }
c6ca97b3
JA
1261 if (req->flags & REQ_F_INFLIGHT)
1262 inflight++;
c6ca97b3
JA
1263 __io_req_aux_free(req);
1264 }
1265 if (!inflight)
1266 goto do_free;
1267
1268 spin_lock_irqsave(&ctx->inflight_lock, flags);
1269 for (i = 0; i < rb->to_free; i++) {
1270 struct io_kiocb *req = rb->reqs[i];
1271
10fef4be 1272 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3
JA
1273 list_del(&req->inflight_entry);
1274 if (!--inflight)
1275 break;
1276 }
1277 }
1278 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1279
1280 if (waitqueue_active(&ctx->inflight_wait))
1281 wake_up(&ctx->inflight_wait);
1282 }
1283do_free:
1284 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
10fef4be
JA
1285 if (fixed_refs)
1286 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
c6ca97b3 1287 percpu_ref_put_many(&ctx->refs, rb->to_free);
c6ca97b3
JA
1288 rb->to_free = rb->need_iter = 0;
1289}
1290
a197f664 1291static bool io_link_cancel_timeout(struct io_kiocb *req)
2665abfd 1292{
a197f664 1293 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1294 int ret;
1295
2d28390a 1296 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2665abfd 1297 if (ret != -1) {
78e19bbe 1298 io_cqring_fill_event(req, -ECANCELED);
2665abfd
JA
1299 io_commit_cqring(ctx);
1300 req->flags &= ~REQ_F_LINK;
ec9c02ad 1301 io_put_req(req);
2665abfd
JA
1302 return true;
1303 }
1304
1305 return false;
e65ef56d
JA
1306}
1307
ba816ad6 1308static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
9e645e11 1309{
2665abfd 1310 struct io_ring_ctx *ctx = req->ctx;
2665abfd 1311 bool wake_ev = false;
9e645e11 1312
4d7dd462
JA
1313 /* Already got next link */
1314 if (req->flags & REQ_F_LINK_NEXT)
1315 return;
1316
9e645e11
JA
1317 /*
1318 * The list should never be empty when we are called here. But could
1319 * potentially happen if the chain is messed up, check to be on the
1320 * safe side.
1321 */
4493233e
PB
1322 while (!list_empty(&req->link_list)) {
1323 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1324 struct io_kiocb, link_list);
94ae5e77 1325
4493233e
PB
1326 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1327 (nxt->flags & REQ_F_TIMEOUT))) {
1328 list_del_init(&nxt->link_list);
94ae5e77 1329 wake_ev |= io_link_cancel_timeout(nxt);
94ae5e77
JA
1330 req->flags &= ~REQ_F_LINK_TIMEOUT;
1331 continue;
1332 }
9e645e11 1333
4493233e
PB
1334 list_del_init(&req->link_list);
1335 if (!list_empty(&nxt->link_list))
1336 nxt->flags |= REQ_F_LINK;
b18fdf71 1337 *nxtptr = nxt;
94ae5e77 1338 break;
9e645e11 1339 }
2665abfd 1340
4d7dd462 1341 req->flags |= REQ_F_LINK_NEXT;
2665abfd
JA
1342 if (wake_ev)
1343 io_cqring_ev_posted(ctx);
9e645e11
JA
1344}
1345
1346/*
1347 * Called if REQ_F_LINK is set, and we fail the head request
1348 */
1349static void io_fail_links(struct io_kiocb *req)
1350{
2665abfd 1351 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1352 unsigned long flags;
1353
1354 spin_lock_irqsave(&ctx->completion_lock, flags);
9e645e11
JA
1355
1356 while (!list_empty(&req->link_list)) {
4493233e
PB
1357 struct io_kiocb *link = list_first_entry(&req->link_list,
1358 struct io_kiocb, link_list);
9e645e11 1359
4493233e 1360 list_del_init(&link->link_list);
c826bd7a 1361 trace_io_uring_fail_link(req, link);
2665abfd
JA
1362
1363 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
d625c6ee 1364 link->opcode == IORING_OP_LINK_TIMEOUT) {
a197f664 1365 io_link_cancel_timeout(link);
2665abfd 1366 } else {
78e19bbe 1367 io_cqring_fill_event(link, -ECANCELED);
978db57e 1368 __io_double_put_req(link);
2665abfd 1369 }
5d960724 1370 req->flags &= ~REQ_F_LINK_TIMEOUT;
9e645e11 1371 }
2665abfd
JA
1372
1373 io_commit_cqring(ctx);
1374 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1375 io_cqring_ev_posted(ctx);
9e645e11
JA
1376}
1377
4d7dd462 1378static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
9e645e11 1379{
4d7dd462 1380 if (likely(!(req->flags & REQ_F_LINK)))
2665abfd 1381 return;
2665abfd 1382
9e645e11
JA
1383 /*
1384 * If LINK is set, we have dependent requests in this chain. If we
1385 * didn't fail this request, queue the first one up, moving any other
1386 * dependencies to the next request. In case of failure, fail the rest
1387 * of the chain.
1388 */
2665abfd
JA
1389 if (req->flags & REQ_F_FAIL_LINK) {
1390 io_fail_links(req);
7c9e7f0f
JA
1391 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1392 REQ_F_LINK_TIMEOUT) {
2665abfd
JA
1393 struct io_ring_ctx *ctx = req->ctx;
1394 unsigned long flags;
1395
1396 /*
1397 * If this is a timeout link, we could be racing with the
1398 * timeout timer. Grab the completion lock for this case to
7c9e7f0f 1399 * protect against that.
2665abfd
JA
1400 */
1401 spin_lock_irqsave(&ctx->completion_lock, flags);
1402 io_req_link_next(req, nxt);
1403 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1404 } else {
1405 io_req_link_next(req, nxt);
9e645e11 1406 }
4d7dd462 1407}
9e645e11 1408
c69f8dbe
JL
1409static void io_free_req(struct io_kiocb *req)
1410{
944e58bf
PB
1411 struct io_kiocb *nxt = NULL;
1412
1413 io_req_find_next(req, &nxt);
70cf9f32 1414 __io_free_req(req);
944e58bf
PB
1415
1416 if (nxt)
1417 io_queue_async_work(nxt);
c69f8dbe
JL
1418}
1419
ba816ad6
JA
1420/*
1421 * Drop reference to request, return next in chain (if there is one) if this
1422 * was the last reference to this request.
1423 */
f9bd67f6 1424__attribute__((nonnull))
ec9c02ad 1425static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
e65ef56d 1426{
f9bd67f6 1427 io_req_find_next(req, nxtptr);
4d7dd462 1428
e65ef56d 1429 if (refcount_dec_and_test(&req->refs))
4d7dd462 1430 __io_free_req(req);
2b188cc1
JA
1431}
1432
e65ef56d
JA
1433static void io_put_req(struct io_kiocb *req)
1434{
1435 if (refcount_dec_and_test(&req->refs))
1436 io_free_req(req);
2b188cc1
JA
1437}
1438
978db57e
JA
1439/*
1440 * Must only be used if we don't need to care about links, usually from
1441 * within the completion handling itself.
1442 */
1443static void __io_double_put_req(struct io_kiocb *req)
78e19bbe
JA
1444{
1445 /* drop both submit and complete references */
1446 if (refcount_sub_and_test(2, &req->refs))
1447 __io_free_req(req);
1448}
1449
978db57e
JA
1450static void io_double_put_req(struct io_kiocb *req)
1451{
1452 /* drop both submit and complete references */
1453 if (refcount_sub_and_test(2, &req->refs))
1454 io_free_req(req);
1455}
1456
1d7bb1d5 1457static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
a3a0e43f 1458{
84f97dc2
JA
1459 struct io_rings *rings = ctx->rings;
1460
ad3eb2c8
JA
1461 if (test_bit(0, &ctx->cq_check_overflow)) {
1462 /*
1463 * noflush == true is from the waitqueue handler, just ensure
1464 * we wake up the task, and the next invocation will flush the
1465 * entries. We cannot safely to it from here.
1466 */
1467 if (noflush && !list_empty(&ctx->cq_overflow_list))
1468 return -1U;
1d7bb1d5 1469
ad3eb2c8
JA
1470 io_cqring_overflow_flush(ctx, false);
1471 }
1d7bb1d5 1472
a3a0e43f
JA
1473 /* See comment at the top of this file */
1474 smp_rmb();
ad3eb2c8 1475 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
a3a0e43f
JA
1476}
1477
fb5ccc98
PB
1478static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1479{
1480 struct io_rings *rings = ctx->rings;
1481
1482 /* make sure SQ entry isn't read before tail */
1483 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1484}
1485
8237e045 1486static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
e94f141b 1487{
c6ca97b3
JA
1488 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1489 return false;
e94f141b 1490
c6ca97b3
JA
1491 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1492 rb->need_iter++;
1493
1494 rb->reqs[rb->to_free++] = req;
1495 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1496 io_free_req_many(req->ctx, rb);
1497 return true;
e94f141b
JA
1498}
1499
def596e9
JA
1500/*
1501 * Find and free completed poll iocbs
1502 */
1503static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1504 struct list_head *done)
1505{
8237e045 1506 struct req_batch rb;
def596e9 1507 struct io_kiocb *req;
def596e9 1508
c6ca97b3 1509 rb.to_free = rb.need_iter = 0;
def596e9
JA
1510 while (!list_empty(done)) {
1511 req = list_first_entry(done, struct io_kiocb, list);
1512 list_del(&req->list);
1513
78e19bbe 1514 io_cqring_fill_event(req, req->result);
def596e9
JA
1515 (*nr_events)++;
1516
8237e045
JA
1517 if (refcount_dec_and_test(&req->refs) &&
1518 !io_req_multi_free(&rb, req))
1519 io_free_req(req);
def596e9 1520 }
def596e9 1521
09bb8394 1522 io_commit_cqring(ctx);
8237e045 1523 io_free_req_many(ctx, &rb);
def596e9
JA
1524}
1525
1526static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1527 long min)
1528{
1529 struct io_kiocb *req, *tmp;
1530 LIST_HEAD(done);
1531 bool spin;
1532 int ret;
1533
1534 /*
1535 * Only spin for completions if we don't have multiple devices hanging
1536 * off our complete list, and we're under the requested amount.
1537 */
1538 spin = !ctx->poll_multi_file && *nr_events < min;
1539
1540 ret = 0;
1541 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
9adbd45d 1542 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
1543
1544 /*
1545 * Move completed entries to our local list. If we find a
1546 * request that requires polling, break out and complete
1547 * the done list first, if we have entries there.
1548 */
1549 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1550 list_move_tail(&req->list, &done);
1551 continue;
1552 }
1553 if (!list_empty(&done))
1554 break;
1555
1556 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1557 if (ret < 0)
1558 break;
1559
1560 if (ret && spin)
1561 spin = false;
1562 ret = 0;
1563 }
1564
1565 if (!list_empty(&done))
1566 io_iopoll_complete(ctx, nr_events, &done);
1567
1568 return ret;
1569}
1570
1571/*
d195a66e 1572 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
1573 * non-spinning poll check - we'll still enter the driver poll loop, but only
1574 * as a non-spinning completion check.
1575 */
1576static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1577 long min)
1578{
08f5439f 1579 while (!list_empty(&ctx->poll_list) && !need_resched()) {
def596e9
JA
1580 int ret;
1581
1582 ret = io_do_iopoll(ctx, nr_events, min);
1583 if (ret < 0)
1584 return ret;
1585 if (!min || *nr_events >= min)
1586 return 0;
1587 }
1588
1589 return 1;
1590}
1591
1592/*
1593 * We can't just wait for polled events to come to us, we have to actively
1594 * find and complete them.
1595 */
1596static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1597{
1598 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1599 return;
1600
1601 mutex_lock(&ctx->uring_lock);
1602 while (!list_empty(&ctx->poll_list)) {
1603 unsigned int nr_events = 0;
1604
1605 io_iopoll_getevents(ctx, &nr_events, 1);
08f5439f
JA
1606
1607 /*
1608 * Ensure we allow local-to-the-cpu processing to take place,
1609 * in this case we need to ensure that we reap all events.
1610 */
1611 cond_resched();
def596e9
JA
1612 }
1613 mutex_unlock(&ctx->uring_lock);
1614}
1615
2b2ed975
JA
1616static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1617 long min)
def596e9 1618{
2b2ed975 1619 int iters = 0, ret = 0;
500f9fba 1620
def596e9
JA
1621 do {
1622 int tmin = 0;
1623
a3a0e43f
JA
1624 /*
1625 * Don't enter poll loop if we already have events pending.
1626 * If we do, we can potentially be spinning for commands that
1627 * already triggered a CQE (eg in error).
1628 */
1d7bb1d5 1629 if (io_cqring_events(ctx, false))
a3a0e43f
JA
1630 break;
1631
500f9fba
JA
1632 /*
1633 * If a submit got punted to a workqueue, we can have the
1634 * application entering polling for a command before it gets
1635 * issued. That app will hold the uring_lock for the duration
1636 * of the poll right here, so we need to take a breather every
1637 * now and then to ensure that the issue has a chance to add
1638 * the poll to the issued list. Otherwise we can spin here
1639 * forever, while the workqueue is stuck trying to acquire the
1640 * very same mutex.
1641 */
1642 if (!(++iters & 7)) {
1643 mutex_unlock(&ctx->uring_lock);
1644 mutex_lock(&ctx->uring_lock);
1645 }
1646
def596e9
JA
1647 if (*nr_events < min)
1648 tmin = min - *nr_events;
1649
1650 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1651 if (ret <= 0)
1652 break;
1653 ret = 0;
1654 } while (min && !*nr_events && !need_resched());
1655
2b2ed975
JA
1656 return ret;
1657}
1658
1659static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1660 long min)
1661{
1662 int ret;
1663
1664 /*
1665 * We disallow the app entering submit/complete with polling, but we
1666 * still need to lock the ring to prevent racing with polled issue
1667 * that got punted to a workqueue.
1668 */
1669 mutex_lock(&ctx->uring_lock);
1670 ret = __io_iopoll_check(ctx, nr_events, min);
500f9fba 1671 mutex_unlock(&ctx->uring_lock);
def596e9
JA
1672 return ret;
1673}
1674
491381ce 1675static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 1676{
491381ce
JA
1677 /*
1678 * Tell lockdep we inherited freeze protection from submission
1679 * thread.
1680 */
1681 if (req->flags & REQ_F_ISREG) {
1682 struct inode *inode = file_inode(req->file);
2b188cc1 1683
491381ce 1684 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 1685 }
491381ce 1686 file_end_write(req->file);
2b188cc1
JA
1687}
1688
4e88d6e7
JA
1689static inline void req_set_fail_links(struct io_kiocb *req)
1690{
1691 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1692 req->flags |= REQ_F_FAIL_LINK;
1693}
1694
ba816ad6 1695static void io_complete_rw_common(struct kiocb *kiocb, long res)
2b188cc1 1696{
9adbd45d 1697 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2b188cc1 1698
491381ce
JA
1699 if (kiocb->ki_flags & IOCB_WRITE)
1700 kiocb_end_write(req);
2b188cc1 1701
4e88d6e7
JA
1702 if (res != req->result)
1703 req_set_fail_links(req);
78e19bbe 1704 io_cqring_add_event(req, res);
ba816ad6
JA
1705}
1706
1707static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1708{
9adbd45d 1709 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6
JA
1710
1711 io_complete_rw_common(kiocb, res);
e65ef56d 1712 io_put_req(req);
2b188cc1
JA
1713}
1714
ba816ad6
JA
1715static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1716{
9adbd45d 1717 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ec9c02ad 1718 struct io_kiocb *nxt = NULL;
ba816ad6
JA
1719
1720 io_complete_rw_common(kiocb, res);
ec9c02ad
JL
1721 io_put_req_find_next(req, &nxt);
1722
1723 return nxt;
2b188cc1
JA
1724}
1725
def596e9
JA
1726static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1727{
9adbd45d 1728 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 1729
491381ce
JA
1730 if (kiocb->ki_flags & IOCB_WRITE)
1731 kiocb_end_write(req);
def596e9 1732
4e88d6e7
JA
1733 if (res != req->result)
1734 req_set_fail_links(req);
9e645e11 1735 req->result = res;
def596e9
JA
1736 if (res != -EAGAIN)
1737 req->flags |= REQ_F_IOPOLL_COMPLETED;
1738}
1739
1740/*
1741 * After the iocb has been issued, it's safe to be found on the poll list.
1742 * Adding the kiocb to the list AFTER submission ensures that we don't
1743 * find it from a io_iopoll_getevents() thread before the issuer is done
1744 * accessing the kiocb cookie.
1745 */
1746static void io_iopoll_req_issued(struct io_kiocb *req)
1747{
1748 struct io_ring_ctx *ctx = req->ctx;
1749
1750 /*
1751 * Track whether we have multiple files in our lists. This will impact
1752 * how we do polling eventually, not spinning if we're on potentially
1753 * different devices.
1754 */
1755 if (list_empty(&ctx->poll_list)) {
1756 ctx->poll_multi_file = false;
1757 } else if (!ctx->poll_multi_file) {
1758 struct io_kiocb *list_req;
1759
1760 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1761 list);
9adbd45d 1762 if (list_req->file != req->file)
def596e9
JA
1763 ctx->poll_multi_file = true;
1764 }
1765
1766 /*
1767 * For fast devices, IO may have already completed. If it has, add
1768 * it to the front so we find it first.
1769 */
1770 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1771 list_add(&req->list, &ctx->poll_list);
1772 else
1773 list_add_tail(&req->list, &ctx->poll_list);
1774}
1775
3d6770fb 1776static void io_file_put(struct io_submit_state *state)
9a56a232 1777{
3d6770fb 1778 if (state->file) {
9a56a232
JA
1779 int diff = state->has_refs - state->used_refs;
1780
1781 if (diff)
1782 fput_many(state->file, diff);
1783 state->file = NULL;
1784 }
1785}
1786
1787/*
1788 * Get as many references to a file as we have IOs left in this submission,
1789 * assuming most submissions are for one file, or at least that each file
1790 * has more than one submission.
1791 */
1792static struct file *io_file_get(struct io_submit_state *state, int fd)
1793{
1794 if (!state)
1795 return fget(fd);
1796
1797 if (state->file) {
1798 if (state->fd == fd) {
1799 state->used_refs++;
1800 state->ios_left--;
1801 return state->file;
1802 }
3d6770fb 1803 io_file_put(state);
9a56a232
JA
1804 }
1805 state->file = fget_many(fd, state->ios_left);
1806 if (!state->file)
1807 return NULL;
1808
1809 state->fd = fd;
1810 state->has_refs = state->ios_left;
1811 state->used_refs = 1;
1812 state->ios_left--;
1813 return state->file;
1814}
1815
2b188cc1
JA
1816/*
1817 * If we tracked the file through the SCM inflight mechanism, we could support
1818 * any file. For now, just ensure that anything potentially problematic is done
1819 * inline.
1820 */
1821static bool io_file_supports_async(struct file *file)
1822{
1823 umode_t mode = file_inode(file)->i_mode;
1824
10d59345 1825 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1
JA
1826 return true;
1827 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1828 return true;
1829
1830 return false;
1831}
1832
3529d8c2
JA
1833static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1834 bool force_nonblock)
2b188cc1 1835{
def596e9 1836 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 1837 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
1838 unsigned ioprio;
1839 int ret;
2b188cc1 1840
09bb8394
JA
1841 if (!req->file)
1842 return -EBADF;
2b188cc1 1843
491381ce
JA
1844 if (S_ISREG(file_inode(req->file)->i_mode))
1845 req->flags |= REQ_F_ISREG;
1846
2b188cc1 1847 kiocb->ki_pos = READ_ONCE(sqe->off);
ba04291e
JA
1848 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1849 req->flags |= REQ_F_CUR_POS;
1850 kiocb->ki_pos = req->file->f_pos;
1851 }
2b188cc1
JA
1852 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1853 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1854
1855 ioprio = READ_ONCE(sqe->ioprio);
1856 if (ioprio) {
1857 ret = ioprio_check_cap(ioprio);
1858 if (ret)
09bb8394 1859 return ret;
2b188cc1
JA
1860
1861 kiocb->ki_ioprio = ioprio;
1862 } else
1863 kiocb->ki_ioprio = get_current_ioprio();
1864
1865 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1866 if (unlikely(ret))
09bb8394 1867 return ret;
8449eeda
SB
1868
1869 /* don't allow async punt if RWF_NOWAIT was requested */
491381ce
JA
1870 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1871 (req->file->f_flags & O_NONBLOCK))
8449eeda
SB
1872 req->flags |= REQ_F_NOWAIT;
1873
1874 if (force_nonblock)
2b188cc1 1875 kiocb->ki_flags |= IOCB_NOWAIT;
8449eeda 1876
def596e9 1877 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
1878 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1879 !kiocb->ki_filp->f_op->iopoll)
09bb8394 1880 return -EOPNOTSUPP;
2b188cc1 1881
def596e9
JA
1882 kiocb->ki_flags |= IOCB_HIPRI;
1883 kiocb->ki_complete = io_complete_rw_iopoll;
6873e0bd 1884 req->result = 0;
def596e9 1885 } else {
09bb8394
JA
1886 if (kiocb->ki_flags & IOCB_HIPRI)
1887 return -EINVAL;
def596e9
JA
1888 kiocb->ki_complete = io_complete_rw;
1889 }
9adbd45d 1890
3529d8c2
JA
1891 req->rw.addr = READ_ONCE(sqe->addr);
1892 req->rw.len = READ_ONCE(sqe->len);
9adbd45d
JA
1893 /* we own ->private, reuse it for the buffer index */
1894 req->rw.kiocb.private = (void *) (unsigned long)
3529d8c2 1895 READ_ONCE(sqe->buf_index);
2b188cc1 1896 return 0;
2b188cc1
JA
1897}
1898
1899static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1900{
1901 switch (ret) {
1902 case -EIOCBQUEUED:
1903 break;
1904 case -ERESTARTSYS:
1905 case -ERESTARTNOINTR:
1906 case -ERESTARTNOHAND:
1907 case -ERESTART_RESTARTBLOCK:
1908 /*
1909 * We can't just restart the syscall, since previously
1910 * submitted sqes may already be in progress. Just fail this
1911 * IO with EINTR.
1912 */
1913 ret = -EINTR;
1914 /* fall through */
1915 default:
1916 kiocb->ki_complete(kiocb, ret, 0);
1917 }
1918}
1919
ba816ad6
JA
1920static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1921 bool in_async)
1922{
ba04291e
JA
1923 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1924
1925 if (req->flags & REQ_F_CUR_POS)
1926 req->file->f_pos = kiocb->ki_pos;
f9bd67f6 1927 if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
ba816ad6
JA
1928 *nxt = __io_complete_rw(kiocb, ret);
1929 else
1930 io_rw_done(kiocb, ret);
1931}
1932
9adbd45d 1933static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
7d009165 1934 struct iov_iter *iter)
edafccee 1935{
9adbd45d
JA
1936 struct io_ring_ctx *ctx = req->ctx;
1937 size_t len = req->rw.len;
edafccee
JA
1938 struct io_mapped_ubuf *imu;
1939 unsigned index, buf_index;
1940 size_t offset;
1941 u64 buf_addr;
1942
1943 /* attempt to use fixed buffers without having provided iovecs */
1944 if (unlikely(!ctx->user_bufs))
1945 return -EFAULT;
1946
9adbd45d 1947 buf_index = (unsigned long) req->rw.kiocb.private;
edafccee
JA
1948 if (unlikely(buf_index >= ctx->nr_user_bufs))
1949 return -EFAULT;
1950
1951 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1952 imu = &ctx->user_bufs[index];
9adbd45d 1953 buf_addr = req->rw.addr;
edafccee
JA
1954
1955 /* overflow */
1956 if (buf_addr + len < buf_addr)
1957 return -EFAULT;
1958 /* not inside the mapped region */
1959 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1960 return -EFAULT;
1961
1962 /*
1963 * May not be a start of buffer, set size appropriately
1964 * and advance us to the beginning.
1965 */
1966 offset = buf_addr - imu->ubuf;
1967 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
1968
1969 if (offset) {
1970 /*
1971 * Don't use iov_iter_advance() here, as it's really slow for
1972 * using the latter parts of a big fixed buffer - it iterates
1973 * over each segment manually. We can cheat a bit here, because
1974 * we know that:
1975 *
1976 * 1) it's a BVEC iter, we set it up
1977 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1978 * first and last bvec
1979 *
1980 * So just find our index, and adjust the iterator afterwards.
1981 * If the offset is within the first bvec (or the whole first
1982 * bvec, just use iov_iter_advance(). This makes it easier
1983 * since we can just skip the first segment, which may not
1984 * be PAGE_SIZE aligned.
1985 */
1986 const struct bio_vec *bvec = imu->bvec;
1987
1988 if (offset <= bvec->bv_len) {
1989 iov_iter_advance(iter, offset);
1990 } else {
1991 unsigned long seg_skip;
1992
1993 /* skip first vec */
1994 offset -= bvec->bv_len;
1995 seg_skip = 1 + (offset >> PAGE_SHIFT);
1996
1997 iter->bvec = bvec + seg_skip;
1998 iter->nr_segs -= seg_skip;
99c79f66 1999 iter->count -= bvec->bv_len + offset;
bd11b3a3 2000 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2001 }
2002 }
2003
5e559561 2004 return len;
edafccee
JA
2005}
2006
cf6fd4bd
PB
2007static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2008 struct iovec **iovec, struct iov_iter *iter)
2b188cc1 2009{
9adbd45d
JA
2010 void __user *buf = u64_to_user_ptr(req->rw.addr);
2011 size_t sqe_len = req->rw.len;
edafccee
JA
2012 u8 opcode;
2013
d625c6ee 2014 opcode = req->opcode;
7d009165 2015 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2016 *iovec = NULL;
9adbd45d 2017 return io_import_fixed(req, rw, iter);
edafccee 2018 }
2b188cc1 2019
9adbd45d
JA
2020 /* buffer index only valid with fixed read/write */
2021 if (req->rw.kiocb.private)
2022 return -EINVAL;
2023
3a6820f2
JA
2024 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2025 ssize_t ret;
2026 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2027 *iovec = NULL;
2028 return ret;
2029 }
2030
f67676d1
JA
2031 if (req->io) {
2032 struct io_async_rw *iorw = &req->io->rw;
2033
2034 *iovec = iorw->iov;
2035 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2036 if (iorw->iov == iorw->fast_iov)
2037 *iovec = NULL;
2038 return iorw->size;
2039 }
2040
cf6fd4bd 2041 if (!req->has_user)
2b188cc1
JA
2042 return -EFAULT;
2043
2044#ifdef CONFIG_COMPAT
cf6fd4bd 2045 if (req->ctx->compat)
2b188cc1
JA
2046 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2047 iovec, iter);
2048#endif
2049
2050 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2051}
2052
31b51510 2053/*
32960613
JA
2054 * For files that don't have ->read_iter() and ->write_iter(), handle them
2055 * by looping over ->read() or ->write() manually.
31b51510 2056 */
32960613
JA
2057static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2058 struct iov_iter *iter)
2059{
2060 ssize_t ret = 0;
2061
2062 /*
2063 * Don't support polled IO through this interface, and we can't
2064 * support non-blocking either. For the latter, this just causes
2065 * the kiocb to be handled from an async context.
2066 */
2067 if (kiocb->ki_flags & IOCB_HIPRI)
2068 return -EOPNOTSUPP;
2069 if (kiocb->ki_flags & IOCB_NOWAIT)
2070 return -EAGAIN;
2071
2072 while (iov_iter_count(iter)) {
311ae9e1 2073 struct iovec iovec;
32960613
JA
2074 ssize_t nr;
2075
311ae9e1
PB
2076 if (!iov_iter_is_bvec(iter)) {
2077 iovec = iov_iter_iovec(iter);
2078 } else {
2079 /* fixed buffers import bvec */
2080 iovec.iov_base = kmap(iter->bvec->bv_page)
2081 + iter->iov_offset;
2082 iovec.iov_len = min(iter->count,
2083 iter->bvec->bv_len - iter->iov_offset);
2084 }
2085
32960613
JA
2086 if (rw == READ) {
2087 nr = file->f_op->read(file, iovec.iov_base,
2088 iovec.iov_len, &kiocb->ki_pos);
2089 } else {
2090 nr = file->f_op->write(file, iovec.iov_base,
2091 iovec.iov_len, &kiocb->ki_pos);
2092 }
2093
311ae9e1
PB
2094 if (iov_iter_is_bvec(iter))
2095 kunmap(iter->bvec->bv_page);
2096
32960613
JA
2097 if (nr < 0) {
2098 if (!ret)
2099 ret = nr;
2100 break;
2101 }
2102 ret += nr;
2103 if (nr != iovec.iov_len)
2104 break;
2105 iov_iter_advance(iter, nr);
2106 }
2107
2108 return ret;
2109}
2110
b7bb4f7d 2111static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
f67676d1
JA
2112 struct iovec *iovec, struct iovec *fast_iov,
2113 struct iov_iter *iter)
2114{
2115 req->io->rw.nr_segs = iter->nr_segs;
2116 req->io->rw.size = io_size;
2117 req->io->rw.iov = iovec;
2118 if (!req->io->rw.iov) {
2119 req->io->rw.iov = req->io->rw.fast_iov;
2120 memcpy(req->io->rw.iov, fast_iov,
2121 sizeof(struct iovec) * iter->nr_segs);
2122 }
2123}
2124
b7bb4f7d 2125static int io_alloc_async_ctx(struct io_kiocb *req)
f67676d1 2126{
d3656344
JA
2127 if (!io_op_defs[req->opcode].async_ctx)
2128 return 0;
f67676d1 2129 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
06b76d44 2130 return req->io == NULL;
b7bb4f7d
JA
2131}
2132
2133static void io_rw_async(struct io_wq_work **workptr)
2134{
2135 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2136 struct iovec *iov = NULL;
2137
2138 if (req->io->rw.iov != req->io->rw.fast_iov)
2139 iov = req->io->rw.iov;
2140 io_wq_submit_work(workptr);
2141 kfree(iov);
2142}
2143
2144static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2145 struct iovec *iovec, struct iovec *fast_iov,
2146 struct iov_iter *iter)
2147{
74566df3
JA
2148 if (req->opcode == IORING_OP_READ_FIXED ||
2149 req->opcode == IORING_OP_WRITE_FIXED)
2150 return 0;
b7bb4f7d
JA
2151 if (!req->io && io_alloc_async_ctx(req))
2152 return -ENOMEM;
2153
2154 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2155 req->work.func = io_rw_async;
2156 return 0;
f67676d1
JA
2157}
2158
3529d8c2
JA
2159static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2160 bool force_nonblock)
f67676d1 2161{
3529d8c2
JA
2162 struct io_async_ctx *io;
2163 struct iov_iter iter;
f67676d1
JA
2164 ssize_t ret;
2165
3529d8c2
JA
2166 ret = io_prep_rw(req, sqe, force_nonblock);
2167 if (ret)
2168 return ret;
f67676d1 2169
3529d8c2
JA
2170 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2171 return -EBADF;
f67676d1 2172
3529d8c2
JA
2173 if (!req->io)
2174 return 0;
2175
2176 io = req->io;
2177 io->rw.iov = io->rw.fast_iov;
2178 req->io = NULL;
2179 ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2180 req->io = io;
2181 if (ret < 0)
2182 return ret;
2183
2184 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2185 return 0;
f67676d1
JA
2186}
2187
267bc904 2188static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2189 bool force_nonblock)
2b188cc1
JA
2190{
2191 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2192 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2193 struct iov_iter iter;
31b51510 2194 size_t iov_count;
f67676d1 2195 ssize_t io_size, ret;
2b188cc1 2196
3529d8c2 2197 ret = io_import_iovec(READ, req, &iovec, &iter);
06b76d44
JA
2198 if (ret < 0)
2199 return ret;
2b188cc1 2200
fd6c2e4c
JA
2201 /* Ensure we clear previously set non-block flag */
2202 if (!force_nonblock)
9adbd45d 2203 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2204
797f3f53 2205 req->result = 0;
f67676d1 2206 io_size = ret;
9e645e11 2207 if (req->flags & REQ_F_LINK)
f67676d1
JA
2208 req->result = io_size;
2209
2210 /*
2211 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2212 * we know to async punt it even if it was opened O_NONBLOCK
2213 */
9adbd45d 2214 if (force_nonblock && !io_file_supports_async(req->file)) {
f67676d1
JA
2215 req->flags |= REQ_F_MUST_PUNT;
2216 goto copy_iov;
2217 }
9e645e11 2218
31b51510 2219 iov_count = iov_iter_count(&iter);
9adbd45d 2220 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2b188cc1
JA
2221 if (!ret) {
2222 ssize_t ret2;
2223
9adbd45d
JA
2224 if (req->file->f_op->read_iter)
2225 ret2 = call_read_iter(req->file, kiocb, &iter);
32960613 2226 else
9adbd45d 2227 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
32960613 2228
9d93a3f5 2229 /* Catch -EAGAIN return for forced non-blocking submission */
f67676d1 2230 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2231 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2232 } else {
2233copy_iov:
b7bb4f7d 2234 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2235 inline_vecs, &iter);
2236 if (ret)
2237 goto out_free;
2238 return -EAGAIN;
2239 }
2b188cc1 2240 }
f67676d1 2241out_free:
b7bb4f7d
JA
2242 if (!io_wq_current_is_worker())
2243 kfree(iovec);
2b188cc1
JA
2244 return ret;
2245}
2246
3529d8c2
JA
2247static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2248 bool force_nonblock)
f67676d1 2249{
3529d8c2
JA
2250 struct io_async_ctx *io;
2251 struct iov_iter iter;
f67676d1
JA
2252 ssize_t ret;
2253
3529d8c2
JA
2254 ret = io_prep_rw(req, sqe, force_nonblock);
2255 if (ret)
2256 return ret;
f67676d1 2257
3529d8c2
JA
2258 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2259 return -EBADF;
f67676d1 2260
3529d8c2
JA
2261 if (!req->io)
2262 return 0;
2263
2264 io = req->io;
2265 io->rw.iov = io->rw.fast_iov;
2266 req->io = NULL;
2267 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2268 req->io = io;
2269 if (ret < 0)
2270 return ret;
2271
2272 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2273 return 0;
f67676d1
JA
2274}
2275
267bc904 2276static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2277 bool force_nonblock)
2b188cc1
JA
2278{
2279 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2280 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2281 struct iov_iter iter;
31b51510 2282 size_t iov_count;
f67676d1 2283 ssize_t ret, io_size;
2b188cc1 2284
3529d8c2 2285 ret = io_import_iovec(WRITE, req, &iovec, &iter);
06b76d44
JA
2286 if (ret < 0)
2287 return ret;
2b188cc1 2288
fd6c2e4c
JA
2289 /* Ensure we clear previously set non-block flag */
2290 if (!force_nonblock)
9adbd45d 2291 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2292
797f3f53 2293 req->result = 0;
f67676d1 2294 io_size = ret;
9e645e11 2295 if (req->flags & REQ_F_LINK)
f67676d1 2296 req->result = io_size;
9e645e11 2297
f67676d1
JA
2298 /*
2299 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2300 * we know to async punt it even if it was opened O_NONBLOCK
2301 */
2302 if (force_nonblock && !io_file_supports_async(req->file)) {
2303 req->flags |= REQ_F_MUST_PUNT;
2304 goto copy_iov;
2305 }
31b51510 2306
10d59345
JA
2307 /* file path doesn't support NOWAIT for non-direct_IO */
2308 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2309 (req->flags & REQ_F_ISREG))
f67676d1 2310 goto copy_iov;
31b51510 2311
f67676d1 2312 iov_count = iov_iter_count(&iter);
9adbd45d 2313 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2b188cc1 2314 if (!ret) {
9bf7933f
RP
2315 ssize_t ret2;
2316
2b188cc1
JA
2317 /*
2318 * Open-code file_start_write here to grab freeze protection,
2319 * which will be released by another thread in
2320 * io_complete_rw(). Fool lockdep by telling it the lock got
2321 * released so that it doesn't complain about the held lock when
2322 * we return to userspace.
2323 */
491381ce 2324 if (req->flags & REQ_F_ISREG) {
9adbd45d 2325 __sb_start_write(file_inode(req->file)->i_sb,
2b188cc1 2326 SB_FREEZE_WRITE, true);
9adbd45d 2327 __sb_writers_release(file_inode(req->file)->i_sb,
2b188cc1
JA
2328 SB_FREEZE_WRITE);
2329 }
2330 kiocb->ki_flags |= IOCB_WRITE;
9bf7933f 2331
9adbd45d
JA
2332 if (req->file->f_op->write_iter)
2333 ret2 = call_write_iter(req->file, kiocb, &iter);
32960613 2334 else
9adbd45d 2335 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
f67676d1 2336 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2337 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2338 } else {
2339copy_iov:
b7bb4f7d 2340 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2341 inline_vecs, &iter);
2342 if (ret)
2343 goto out_free;
2344 return -EAGAIN;
2345 }
2b188cc1 2346 }
31b51510 2347out_free:
b7bb4f7d
JA
2348 if (!io_wq_current_is_worker())
2349 kfree(iovec);
2b188cc1
JA
2350 return ret;
2351}
2352
2353/*
2354 * IORING_OP_NOP just posts a completion event, nothing else.
2355 */
78e19bbe 2356static int io_nop(struct io_kiocb *req)
2b188cc1
JA
2357{
2358 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2359
def596e9
JA
2360 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2361 return -EINVAL;
2362
78e19bbe 2363 io_cqring_add_event(req, 0);
e65ef56d 2364 io_put_req(req);
2b188cc1
JA
2365 return 0;
2366}
2367
3529d8c2 2368static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 2369{
6b06314c 2370 struct io_ring_ctx *ctx = req->ctx;
c992fe29 2371
09bb8394
JA
2372 if (!req->file)
2373 return -EBADF;
c992fe29 2374
6b06314c 2375 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 2376 return -EINVAL;
edafccee 2377 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
2378 return -EINVAL;
2379
8ed8d3c3
JA
2380 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2381 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2382 return -EINVAL;
2383
2384 req->sync.off = READ_ONCE(sqe->off);
2385 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
2386 return 0;
2387}
2388
8ed8d3c3
JA
2389static bool io_req_cancelled(struct io_kiocb *req)
2390{
2391 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2392 req_set_fail_links(req);
2393 io_cqring_add_event(req, -ECANCELED);
2394 io_put_req(req);
2395 return true;
2396 }
2397
2398 return false;
2399}
2400
78912934
JA
2401static void io_link_work_cb(struct io_wq_work **workptr)
2402{
2403 struct io_wq_work *work = *workptr;
2404 struct io_kiocb *link = work->data;
2405
2406 io_queue_linked_timeout(link);
2407 work->func = io_wq_submit_work;
2408}
2409
2410static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
2411{
2412 struct io_kiocb *link;
2413
2414 io_prep_async_work(nxt, &link);
2415 *workptr = &nxt->work;
2416 if (link) {
2417 nxt->work.flags |= IO_WQ_WORK_CB;
2418 nxt->work.func = io_link_work_cb;
2419 nxt->work.data = link;
2420 }
2421}
2422
8ed8d3c3
JA
2423static void io_fsync_finish(struct io_wq_work **workptr)
2424{
2425 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2426 loff_t end = req->sync.off + req->sync.len;
2427 struct io_kiocb *nxt = NULL;
2428 int ret;
2429
2430 if (io_req_cancelled(req))
2431 return;
2432
9adbd45d 2433 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
2434 end > 0 ? end : LLONG_MAX,
2435 req->sync.flags & IORING_FSYNC_DATASYNC);
2436 if (ret < 0)
2437 req_set_fail_links(req);
2438 io_cqring_add_event(req, ret);
2439 io_put_req_find_next(req, &nxt);
2440 if (nxt)
78912934 2441 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
2442}
2443
fc4df999
JA
2444static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2445 bool force_nonblock)
c992fe29 2446{
8ed8d3c3 2447 struct io_wq_work *work, *old_work;
c992fe29
CH
2448
2449 /* fsync always requires a blocking context */
8ed8d3c3
JA
2450 if (force_nonblock) {
2451 io_put_req(req);
2452 req->work.func = io_fsync_finish;
c992fe29 2453 return -EAGAIN;
8ed8d3c3 2454 }
c992fe29 2455
8ed8d3c3
JA
2456 work = old_work = &req->work;
2457 io_fsync_finish(&work);
2458 if (work && work != old_work)
2459 *nxt = container_of(work, struct io_kiocb, work);
c992fe29
CH
2460 return 0;
2461}
2462
d63d1b5e
JA
2463static void io_fallocate_finish(struct io_wq_work **workptr)
2464{
2465 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2466 struct io_kiocb *nxt = NULL;
2467 int ret;
2468
2469 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2470 req->sync.len);
2471 if (ret < 0)
2472 req_set_fail_links(req);
2473 io_cqring_add_event(req, ret);
2474 io_put_req_find_next(req, &nxt);
2475 if (nxt)
2476 io_wq_assign_next(workptr, nxt);
2477}
2478
2479static int io_fallocate_prep(struct io_kiocb *req,
2480 const struct io_uring_sqe *sqe)
2481{
2482 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2483 return -EINVAL;
2484
2485 req->sync.off = READ_ONCE(sqe->off);
2486 req->sync.len = READ_ONCE(sqe->addr);
2487 req->sync.mode = READ_ONCE(sqe->len);
2488 return 0;
2489}
2490
2491static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
2492 bool force_nonblock)
2493{
2494 struct io_wq_work *work, *old_work;
2495
2496 /* fallocate always requiring blocking context */
2497 if (force_nonblock) {
2498 io_put_req(req);
2499 req->work.func = io_fallocate_finish;
2500 return -EAGAIN;
2501 }
2502
2503 work = old_work = &req->work;
2504 io_fallocate_finish(&work);
2505 if (work && work != old_work)
2506 *nxt = container_of(work, struct io_kiocb, work);
2507
2508 return 0;
2509}
2510
15b71abe
JA
2511static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2512{
f8748881 2513 const char __user *fname;
15b71abe
JA
2514 int ret;
2515
2516 if (sqe->ioprio || sqe->buf_index)
2517 return -EINVAL;
2518
2519 req->open.dfd = READ_ONCE(sqe->fd);
c12cedf2 2520 req->open.how.mode = READ_ONCE(sqe->len);
f8748881 2521 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
c12cedf2 2522 req->open.how.flags = READ_ONCE(sqe->open_flags);
15b71abe 2523
f8748881 2524 req->open.filename = getname(fname);
15b71abe
JA
2525 if (IS_ERR(req->open.filename)) {
2526 ret = PTR_ERR(req->open.filename);
2527 req->open.filename = NULL;
2528 return ret;
2529 }
2530
2531 return 0;
2532}
2533
cebdb986
JA
2534static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2535{
2536 struct open_how __user *how;
2537 const char __user *fname;
2538 size_t len;
2539 int ret;
2540
2541 if (sqe->ioprio || sqe->buf_index)
2542 return -EINVAL;
2543
2544 req->open.dfd = READ_ONCE(sqe->fd);
2545 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2546 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2547 len = READ_ONCE(sqe->len);
2548
2549 if (len < OPEN_HOW_SIZE_VER0)
2550 return -EINVAL;
2551
2552 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2553 len);
2554 if (ret)
2555 return ret;
2556
2557 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2558 req->open.how.flags |= O_LARGEFILE;
2559
2560 req->open.filename = getname(fname);
2561 if (IS_ERR(req->open.filename)) {
2562 ret = PTR_ERR(req->open.filename);
2563 req->open.filename = NULL;
2564 return ret;
2565 }
2566
2567 return 0;
2568}
2569
2570static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
2571 bool force_nonblock)
15b71abe
JA
2572{
2573 struct open_flags op;
15b71abe
JA
2574 struct file *file;
2575 int ret;
2576
2577 if (force_nonblock) {
2578 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2579 return -EAGAIN;
2580 }
2581
cebdb986 2582 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
2583 if (ret)
2584 goto err;
2585
cebdb986 2586 ret = get_unused_fd_flags(req->open.how.flags);
15b71abe
JA
2587 if (ret < 0)
2588 goto err;
2589
2590 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2591 if (IS_ERR(file)) {
2592 put_unused_fd(ret);
2593 ret = PTR_ERR(file);
2594 } else {
2595 fsnotify_open(file);
2596 fd_install(ret, file);
2597 }
2598err:
2599 putname(req->open.filename);
2600 if (ret < 0)
2601 req_set_fail_links(req);
2602 io_cqring_add_event(req, ret);
2603 io_put_req_find_next(req, nxt);
2604 return 0;
2605}
2606
cebdb986
JA
2607static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
2608 bool force_nonblock)
2609{
2610 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
2611 return io_openat2(req, nxt, force_nonblock);
2612}
2613
c1ca757b
JA
2614static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2615{
2616#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2617 if (sqe->ioprio || sqe->buf_index || sqe->off)
2618 return -EINVAL;
2619
2620 req->madvise.addr = READ_ONCE(sqe->addr);
2621 req->madvise.len = READ_ONCE(sqe->len);
2622 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2623 return 0;
2624#else
2625 return -EOPNOTSUPP;
2626#endif
2627}
2628
2629static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
2630 bool force_nonblock)
2631{
2632#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2633 struct io_madvise *ma = &req->madvise;
2634 int ret;
2635
2636 if (force_nonblock)
2637 return -EAGAIN;
2638
2639 ret = do_madvise(ma->addr, ma->len, ma->advice);
2640 if (ret < 0)
2641 req_set_fail_links(req);
2642 io_cqring_add_event(req, ret);
2643 io_put_req_find_next(req, nxt);
2644 return 0;
2645#else
2646 return -EOPNOTSUPP;
2647#endif
2648}
2649
4840e418
JA
2650static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2651{
2652 if (sqe->ioprio || sqe->buf_index || sqe->addr)
2653 return -EINVAL;
2654
2655 req->fadvise.offset = READ_ONCE(sqe->off);
2656 req->fadvise.len = READ_ONCE(sqe->len);
2657 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
2658 return 0;
2659}
2660
2661static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
2662 bool force_nonblock)
2663{
2664 struct io_fadvise *fa = &req->fadvise;
2665 int ret;
2666
2667 /* DONTNEED may block, others _should_ not */
2668 if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
2669 return -EAGAIN;
2670
2671 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
2672 if (ret < 0)
2673 req_set_fail_links(req);
2674 io_cqring_add_event(req, ret);
2675 io_put_req_find_next(req, nxt);
2676 return 0;
2677}
2678
eddc7ef5
JA
2679static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2680{
f8748881 2681 const char __user *fname;
eddc7ef5
JA
2682 unsigned lookup_flags;
2683 int ret;
2684
2685 if (sqe->ioprio || sqe->buf_index)
2686 return -EINVAL;
2687
2688 req->open.dfd = READ_ONCE(sqe->fd);
2689 req->open.mask = READ_ONCE(sqe->len);
f8748881 2690 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
eddc7ef5 2691 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
c12cedf2 2692 req->open.how.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 2693
c12cedf2 2694 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
eddc7ef5
JA
2695 return -EINVAL;
2696
f8748881 2697 req->open.filename = getname_flags(fname, lookup_flags, NULL);
eddc7ef5
JA
2698 if (IS_ERR(req->open.filename)) {
2699 ret = PTR_ERR(req->open.filename);
2700 req->open.filename = NULL;
2701 return ret;
2702 }
2703
2704 return 0;
2705}
2706
2707static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
2708 bool force_nonblock)
2709{
2710 struct io_open *ctx = &req->open;
2711 unsigned lookup_flags;
2712 struct path path;
2713 struct kstat stat;
2714 int ret;
2715
2716 if (force_nonblock)
2717 return -EAGAIN;
2718
c12cedf2 2719 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
eddc7ef5
JA
2720 return -EINVAL;
2721
2722retry:
2723 /* filename_lookup() drops it, keep a reference */
2724 ctx->filename->refcnt++;
2725
2726 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
2727 NULL);
2728 if (ret)
2729 goto err;
2730
c12cedf2 2731 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
eddc7ef5
JA
2732 path_put(&path);
2733 if (retry_estale(ret, lookup_flags)) {
2734 lookup_flags |= LOOKUP_REVAL;
2735 goto retry;
2736 }
2737 if (!ret)
2738 ret = cp_statx(&stat, ctx->buffer);
2739err:
2740 putname(ctx->filename);
2741 if (ret < 0)
2742 req_set_fail_links(req);
2743 io_cqring_add_event(req, ret);
2744 io_put_req_find_next(req, nxt);
2745 return 0;
2746}
2747
b5dba59e
JA
2748static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2749{
2750 /*
2751 * If we queue this for async, it must not be cancellable. That would
2752 * leave the 'file' in an undeterminate state.
2753 */
2754 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
2755
2756 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
2757 sqe->rw_flags || sqe->buf_index)
2758 return -EINVAL;
2759 if (sqe->flags & IOSQE_FIXED_FILE)
2760 return -EINVAL;
2761
2762 req->close.fd = READ_ONCE(sqe->fd);
2763 if (req->file->f_op == &io_uring_fops ||
b14cca0c 2764 req->close.fd == req->ctx->ring_fd)
b5dba59e
JA
2765 return -EBADF;
2766
2767 return 0;
2768}
2769
2770static void io_close_finish(struct io_wq_work **workptr)
2771{
2772 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2773 struct io_kiocb *nxt = NULL;
2774
2775 /* Invoked with files, we need to do the close */
2776 if (req->work.files) {
2777 int ret;
2778
2779 ret = filp_close(req->close.put_file, req->work.files);
2780 if (ret < 0) {
2781 req_set_fail_links(req);
2782 }
2783 io_cqring_add_event(req, ret);
2784 }
2785
2786 fput(req->close.put_file);
2787
2788 /* we bypassed the re-issue, drop the submission reference */
2789 io_put_req(req);
2790 io_put_req_find_next(req, &nxt);
2791 if (nxt)
2792 io_wq_assign_next(workptr, nxt);
2793}
2794
2795static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
2796 bool force_nonblock)
2797{
2798 int ret;
2799
2800 req->close.put_file = NULL;
2801 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
2802 if (ret < 0)
2803 return ret;
2804
2805 /* if the file has a flush method, be safe and punt to async */
2806 if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) {
2807 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2808 goto eagain;
2809 }
2810
2811 /*
2812 * No ->flush(), safely close from here and just punt the
2813 * fput() to async context.
2814 */
2815 ret = filp_close(req->close.put_file, current->files);
2816
2817 if (ret < 0)
2818 req_set_fail_links(req);
2819 io_cqring_add_event(req, ret);
2820
2821 if (io_wq_current_is_worker()) {
2822 struct io_wq_work *old_work, *work;
2823
2824 old_work = work = &req->work;
2825 io_close_finish(&work);
2826 if (work && work != old_work)
2827 *nxt = container_of(work, struct io_kiocb, work);
2828 return 0;
2829 }
2830
2831eagain:
2832 req->work.func = io_close_finish;
2833 return -EAGAIN;
2834}
2835
3529d8c2 2836static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
2837{
2838 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4
JA
2839
2840 if (!req->file)
2841 return -EBADF;
5d17b4a4
JA
2842
2843 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2844 return -EINVAL;
2845 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2846 return -EINVAL;
2847
8ed8d3c3
JA
2848 req->sync.off = READ_ONCE(sqe->off);
2849 req->sync.len = READ_ONCE(sqe->len);
2850 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
2851 return 0;
2852}
2853
2854static void io_sync_file_range_finish(struct io_wq_work **workptr)
2855{
2856 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2857 struct io_kiocb *nxt = NULL;
2858 int ret;
2859
2860 if (io_req_cancelled(req))
2861 return;
2862
9adbd45d 2863 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
2864 req->sync.flags);
2865 if (ret < 0)
2866 req_set_fail_links(req);
2867 io_cqring_add_event(req, ret);
2868 io_put_req_find_next(req, &nxt);
2869 if (nxt)
78912934 2870 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
2871}
2872
fc4df999 2873static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
5d17b4a4
JA
2874 bool force_nonblock)
2875{
8ed8d3c3 2876 struct io_wq_work *work, *old_work;
5d17b4a4
JA
2877
2878 /* sync_file_range always requires a blocking context */
8ed8d3c3
JA
2879 if (force_nonblock) {
2880 io_put_req(req);
2881 req->work.func = io_sync_file_range_finish;
5d17b4a4 2882 return -EAGAIN;
8ed8d3c3 2883 }
5d17b4a4 2884
8ed8d3c3
JA
2885 work = old_work = &req->work;
2886 io_sync_file_range_finish(&work);
2887 if (work && work != old_work)
2888 *nxt = container_of(work, struct io_kiocb, work);
5d17b4a4
JA
2889 return 0;
2890}
2891
b7bb4f7d
JA
2892#if defined(CONFIG_NET)
2893static void io_sendrecv_async(struct io_wq_work **workptr)
2894{
2895 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2896 struct iovec *iov = NULL;
2897
2898 if (req->io->rw.iov != req->io->rw.fast_iov)
2899 iov = req->io->msg.iov;
2900 io_wq_submit_work(workptr);
2901 kfree(iov);
2902}
2903#endif
2904
3529d8c2 2905static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 2906{
0fa03c62 2907#if defined(CONFIG_NET)
e47293fd 2908 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 2909 struct io_async_ctx *io = req->io;
03b1230c 2910
e47293fd
JA
2911 sr->msg_flags = READ_ONCE(sqe->msg_flags);
2912 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 2913 sr->len = READ_ONCE(sqe->len);
3529d8c2 2914
fddaface 2915 if (!io || req->opcode == IORING_OP_SEND)
3529d8c2
JA
2916 return 0;
2917
d9688565 2918 io->msg.iov = io->msg.fast_iov;
3529d8c2 2919 return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 2920 &io->msg.iov);
03b1230c 2921#else
e47293fd 2922 return -EOPNOTSUPP;
03b1230c
JA
2923#endif
2924}
2925
fc4df999
JA
2926static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2927 bool force_nonblock)
aa1fa28f 2928{
03b1230c 2929#if defined(CONFIG_NET)
0b416c3e 2930 struct io_async_msghdr *kmsg = NULL;
0fa03c62
JA
2931 struct socket *sock;
2932 int ret;
2933
2934 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2935 return -EINVAL;
2936
2937 sock = sock_from_file(req->file, &ret);
2938 if (sock) {
b7bb4f7d 2939 struct io_async_ctx io;
03b1230c 2940 struct sockaddr_storage addr;
0fa03c62
JA
2941 unsigned flags;
2942
03b1230c 2943 if (req->io) {
0b416c3e
JA
2944 kmsg = &req->io->msg;
2945 kmsg->msg.msg_name = &addr;
2946 /* if iov is set, it's allocated already */
2947 if (!kmsg->iov)
2948 kmsg->iov = kmsg->fast_iov;
2949 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 2950 } else {
3529d8c2
JA
2951 struct io_sr_msg *sr = &req->sr_msg;
2952
0b416c3e
JA
2953 kmsg = &io.msg;
2954 kmsg->msg.msg_name = &addr;
3529d8c2
JA
2955
2956 io.msg.iov = io.msg.fast_iov;
2957 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
2958 sr->msg_flags, &io.msg.iov);
03b1230c 2959 if (ret)
3529d8c2 2960 return ret;
03b1230c 2961 }
0fa03c62 2962
e47293fd
JA
2963 flags = req->sr_msg.msg_flags;
2964 if (flags & MSG_DONTWAIT)
2965 req->flags |= REQ_F_NOWAIT;
2966 else if (force_nonblock)
2967 flags |= MSG_DONTWAIT;
2968
0b416c3e 2969 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
03b1230c 2970 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
2971 if (req->io)
2972 return -EAGAIN;
2973 if (io_alloc_async_ctx(req))
2974 return -ENOMEM;
2975 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2976 req->work.func = io_sendrecv_async;
0b416c3e 2977 return -EAGAIN;
03b1230c 2978 }
441cdbd5
JA
2979 if (ret == -ERESTARTSYS)
2980 ret = -EINTR;
0fa03c62
JA
2981 }
2982
b7bb4f7d 2983 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 2984 kfree(kmsg->iov);
78e19bbe 2985 io_cqring_add_event(req, ret);
4e88d6e7
JA
2986 if (ret < 0)
2987 req_set_fail_links(req);
ec9c02ad 2988 io_put_req_find_next(req, nxt);
5d17b4a4 2989 return 0;
03b1230c
JA
2990#else
2991 return -EOPNOTSUPP;
aa1fa28f 2992#endif
03b1230c 2993}
aa1fa28f 2994
fddaface
JA
2995static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
2996 bool force_nonblock)
2997{
2998#if defined(CONFIG_NET)
2999 struct socket *sock;
3000 int ret;
3001
3002 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3003 return -EINVAL;
3004
3005 sock = sock_from_file(req->file, &ret);
3006 if (sock) {
3007 struct io_sr_msg *sr = &req->sr_msg;
3008 struct msghdr msg;
3009 struct iovec iov;
3010 unsigned flags;
3011
3012 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3013 &msg.msg_iter);
3014 if (ret)
3015 return ret;
3016
3017 msg.msg_name = NULL;
3018 msg.msg_control = NULL;
3019 msg.msg_controllen = 0;
3020 msg.msg_namelen = 0;
3021
3022 flags = req->sr_msg.msg_flags;
3023 if (flags & MSG_DONTWAIT)
3024 req->flags |= REQ_F_NOWAIT;
3025 else if (force_nonblock)
3026 flags |= MSG_DONTWAIT;
3027
3028 ret = __sys_sendmsg_sock(sock, &msg, flags);
3029 if (force_nonblock && ret == -EAGAIN)
3030 return -EAGAIN;
3031 if (ret == -ERESTARTSYS)
3032 ret = -EINTR;
3033 }
3034
3035 io_cqring_add_event(req, ret);
3036 if (ret < 0)
3037 req_set_fail_links(req);
3038 io_put_req_find_next(req, nxt);
3039 return 0;
3040#else
3041 return -EOPNOTSUPP;
3042#endif
3043}
3044
3529d8c2
JA
3045static int io_recvmsg_prep(struct io_kiocb *req,
3046 const struct io_uring_sqe *sqe)
aa1fa28f
JA
3047{
3048#if defined(CONFIG_NET)
e47293fd 3049 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2
JA
3050 struct io_async_ctx *io = req->io;
3051
3052 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3053 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
06b76d44 3054
fddaface 3055 if (!io || req->opcode == IORING_OP_RECV)
06b76d44 3056 return 0;
03b1230c 3057
d9688565 3058 io->msg.iov = io->msg.fast_iov;
3529d8c2 3059 return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3060 &io->msg.uaddr, &io->msg.iov);
aa1fa28f 3061#else
e47293fd 3062 return -EOPNOTSUPP;
aa1fa28f
JA
3063#endif
3064}
3065
fc4df999
JA
3066static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
3067 bool force_nonblock)
aa1fa28f
JA
3068{
3069#if defined(CONFIG_NET)
0b416c3e 3070 struct io_async_msghdr *kmsg = NULL;
03b1230c
JA
3071 struct socket *sock;
3072 int ret;
3073
3074 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3075 return -EINVAL;
3076
3077 sock = sock_from_file(req->file, &ret);
3078 if (sock) {
b7bb4f7d 3079 struct io_async_ctx io;
03b1230c 3080 struct sockaddr_storage addr;
03b1230c
JA
3081 unsigned flags;
3082
03b1230c 3083 if (req->io) {
0b416c3e
JA
3084 kmsg = &req->io->msg;
3085 kmsg->msg.msg_name = &addr;
3086 /* if iov is set, it's allocated already */
3087 if (!kmsg->iov)
3088 kmsg->iov = kmsg->fast_iov;
3089 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3090 } else {
3529d8c2
JA
3091 struct io_sr_msg *sr = &req->sr_msg;
3092
0b416c3e
JA
3093 kmsg = &io.msg;
3094 kmsg->msg.msg_name = &addr;
3529d8c2
JA
3095
3096 io.msg.iov = io.msg.fast_iov;
3097 ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
3098 sr->msg_flags, &io.msg.uaddr,
3099 &io.msg.iov);
03b1230c 3100 if (ret)
3529d8c2 3101 return ret;
03b1230c
JA
3102 }
3103
e47293fd
JA
3104 flags = req->sr_msg.msg_flags;
3105 if (flags & MSG_DONTWAIT)
3106 req->flags |= REQ_F_NOWAIT;
3107 else if (force_nonblock)
3108 flags |= MSG_DONTWAIT;
3109
3110 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3111 kmsg->uaddr, flags);
03b1230c 3112 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
3113 if (req->io)
3114 return -EAGAIN;
3115 if (io_alloc_async_ctx(req))
3116 return -ENOMEM;
3117 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
3118 req->work.func = io_sendrecv_async;
0b416c3e 3119 return -EAGAIN;
03b1230c
JA
3120 }
3121 if (ret == -ERESTARTSYS)
3122 ret = -EINTR;
3123 }
3124
b7bb4f7d 3125 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3126 kfree(kmsg->iov);
03b1230c 3127 io_cqring_add_event(req, ret);
4e88d6e7
JA
3128 if (ret < 0)
3129 req_set_fail_links(req);
03b1230c
JA
3130 io_put_req_find_next(req, nxt);
3131 return 0;
0fa03c62
JA
3132#else
3133 return -EOPNOTSUPP;
3134#endif
3135}
5d17b4a4 3136
fddaface
JA
3137static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
3138 bool force_nonblock)
3139{
3140#if defined(CONFIG_NET)
3141 struct socket *sock;
3142 int ret;
3143
3144 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3145 return -EINVAL;
3146
3147 sock = sock_from_file(req->file, &ret);
3148 if (sock) {
3149 struct io_sr_msg *sr = &req->sr_msg;
3150 struct msghdr msg;
3151 struct iovec iov;
3152 unsigned flags;
3153
3154 ret = import_single_range(READ, sr->buf, sr->len, &iov,
3155 &msg.msg_iter);
3156 if (ret)
3157 return ret;
3158
3159 msg.msg_name = NULL;
3160 msg.msg_control = NULL;
3161 msg.msg_controllen = 0;
3162 msg.msg_namelen = 0;
3163 msg.msg_iocb = NULL;
3164 msg.msg_flags = 0;
3165
3166 flags = req->sr_msg.msg_flags;
3167 if (flags & MSG_DONTWAIT)
3168 req->flags |= REQ_F_NOWAIT;
3169 else if (force_nonblock)
3170 flags |= MSG_DONTWAIT;
3171
3172 ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
3173 if (force_nonblock && ret == -EAGAIN)
3174 return -EAGAIN;
3175 if (ret == -ERESTARTSYS)
3176 ret = -EINTR;
3177 }
3178
3179 io_cqring_add_event(req, ret);
3180 if (ret < 0)
3181 req_set_fail_links(req);
3182 io_put_req_find_next(req, nxt);
3183 return 0;
3184#else
3185 return -EOPNOTSUPP;
3186#endif
3187}
3188
3189
3529d8c2 3190static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35
JA
3191{
3192#if defined(CONFIG_NET)
8ed8d3c3
JA
3193 struct io_accept *accept = &req->accept;
3194
17f2fe35
JA
3195 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3196 return -EINVAL;
8042d6ce 3197 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
3198 return -EINVAL;
3199
d55e5f5b
JA
3200 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3201 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 3202 accept->flags = READ_ONCE(sqe->accept_flags);
8ed8d3c3
JA
3203 return 0;
3204#else
3205 return -EOPNOTSUPP;
3206#endif
3207}
17f2fe35 3208
8ed8d3c3
JA
3209#if defined(CONFIG_NET)
3210static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3211 bool force_nonblock)
3212{
3213 struct io_accept *accept = &req->accept;
3214 unsigned file_flags;
3215 int ret;
3216
3217 file_flags = force_nonblock ? O_NONBLOCK : 0;
3218 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3219 accept->addr_len, accept->flags);
3220 if (ret == -EAGAIN && force_nonblock)
17f2fe35 3221 return -EAGAIN;
8e3cca12
JA
3222 if (ret == -ERESTARTSYS)
3223 ret = -EINTR;
4e88d6e7
JA
3224 if (ret < 0)
3225 req_set_fail_links(req);
78e19bbe 3226 io_cqring_add_event(req, ret);
ec9c02ad 3227 io_put_req_find_next(req, nxt);
17f2fe35 3228 return 0;
8ed8d3c3
JA
3229}
3230
3231static void io_accept_finish(struct io_wq_work **workptr)
3232{
3233 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3234 struct io_kiocb *nxt = NULL;
3235
3236 if (io_req_cancelled(req))
3237 return;
3238 __io_accept(req, &nxt, false);
3239 if (nxt)
78912934 3240 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
3241}
3242#endif
3243
3244static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3245 bool force_nonblock)
3246{
3247#if defined(CONFIG_NET)
3248 int ret;
3249
8ed8d3c3
JA
3250 ret = __io_accept(req, nxt, force_nonblock);
3251 if (ret == -EAGAIN && force_nonblock) {
3252 req->work.func = io_accept_finish;
3253 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
3254 io_put_req(req);
3255 return -EAGAIN;
3256 }
3257 return 0;
0fa03c62
JA
3258#else
3259 return -EOPNOTSUPP;
3260#endif
3261}
5d17b4a4 3262
3529d8c2 3263static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021
JA
3264{
3265#if defined(CONFIG_NET)
3529d8c2
JA
3266 struct io_connect *conn = &req->connect;
3267 struct io_async_ctx *io = req->io;
f499a021 3268
3fbb51c1
JA
3269 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3270 return -EINVAL;
3271 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3272 return -EINVAL;
3273
3529d8c2
JA
3274 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3275 conn->addr_len = READ_ONCE(sqe->addr2);
3276
3277 if (!io)
3278 return 0;
3279
3280 return move_addr_to_kernel(conn->addr, conn->addr_len,
3fbb51c1 3281 &io->connect.address);
f499a021 3282#else
3fbb51c1 3283 return -EOPNOTSUPP;
f499a021
JA
3284#endif
3285}
3286
fc4df999
JA
3287static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
3288 bool force_nonblock)
f8e85cf2
JA
3289{
3290#if defined(CONFIG_NET)
f499a021 3291 struct io_async_ctx __io, *io;
f8e85cf2 3292 unsigned file_flags;
3fbb51c1 3293 int ret;
f8e85cf2 3294
f499a021
JA
3295 if (req->io) {
3296 io = req->io;
3297 } else {
3529d8c2
JA
3298 ret = move_addr_to_kernel(req->connect.addr,
3299 req->connect.addr_len,
3300 &__io.connect.address);
f499a021
JA
3301 if (ret)
3302 goto out;
3303 io = &__io;
3304 }
3305
3fbb51c1
JA
3306 file_flags = force_nonblock ? O_NONBLOCK : 0;
3307
3308 ret = __sys_connect_file(req->file, &io->connect.address,
3309 req->connect.addr_len, file_flags);
87f80d62 3310 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
b7bb4f7d
JA
3311 if (req->io)
3312 return -EAGAIN;
3313 if (io_alloc_async_ctx(req)) {
f499a021
JA
3314 ret = -ENOMEM;
3315 goto out;
3316 }
b7bb4f7d 3317 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
f8e85cf2 3318 return -EAGAIN;
f499a021 3319 }
f8e85cf2
JA
3320 if (ret == -ERESTARTSYS)
3321 ret = -EINTR;
f499a021 3322out:
4e88d6e7
JA
3323 if (ret < 0)
3324 req_set_fail_links(req);
f8e85cf2
JA
3325 io_cqring_add_event(req, ret);
3326 io_put_req_find_next(req, nxt);
3327 return 0;
3328#else
3329 return -EOPNOTSUPP;
3330#endif
3331}
3332
221c5eb2
JA
3333static void io_poll_remove_one(struct io_kiocb *req)
3334{
3335 struct io_poll_iocb *poll = &req->poll;
3336
3337 spin_lock(&poll->head->lock);
3338 WRITE_ONCE(poll->canceled, true);
392edb45
JA
3339 if (!list_empty(&poll->wait.entry)) {
3340 list_del_init(&poll->wait.entry);
a197f664 3341 io_queue_async_work(req);
221c5eb2
JA
3342 }
3343 spin_unlock(&poll->head->lock);
78076bb6 3344 hash_del(&req->hash_node);
221c5eb2
JA
3345}
3346
3347static void io_poll_remove_all(struct io_ring_ctx *ctx)
3348{
78076bb6 3349 struct hlist_node *tmp;
221c5eb2 3350 struct io_kiocb *req;
78076bb6 3351 int i;
221c5eb2
JA
3352
3353 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
3354 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3355 struct hlist_head *list;
3356
3357 list = &ctx->cancel_hash[i];
3358 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3359 io_poll_remove_one(req);
221c5eb2
JA
3360 }
3361 spin_unlock_irq(&ctx->completion_lock);
3362}
3363
47f46768
JA
3364static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3365{
78076bb6 3366 struct hlist_head *list;
47f46768
JA
3367 struct io_kiocb *req;
3368
78076bb6
JA
3369 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3370 hlist_for_each_entry(req, list, hash_node) {
3371 if (sqe_addr == req->user_data) {
eac406c6
JA
3372 io_poll_remove_one(req);
3373 return 0;
3374 }
47f46768
JA
3375 }
3376
3377 return -ENOENT;
3378}
3379
3529d8c2
JA
3380static int io_poll_remove_prep(struct io_kiocb *req,
3381 const struct io_uring_sqe *sqe)
0969e783 3382{
0969e783
JA
3383 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3384 return -EINVAL;
3385 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3386 sqe->poll_events)
3387 return -EINVAL;
3388
3389 req->poll.addr = READ_ONCE(sqe->addr);
0969e783
JA
3390 return 0;
3391}
3392
221c5eb2
JA
3393/*
3394 * Find a running poll command that matches one specified in sqe->addr,
3395 * and remove it if found.
3396 */
fc4df999 3397static int io_poll_remove(struct io_kiocb *req)
221c5eb2
JA
3398{
3399 struct io_ring_ctx *ctx = req->ctx;
0969e783 3400 u64 addr;
47f46768 3401 int ret;
221c5eb2 3402
0969e783 3403 addr = req->poll.addr;
221c5eb2 3404 spin_lock_irq(&ctx->completion_lock);
0969e783 3405 ret = io_poll_cancel(ctx, addr);
221c5eb2
JA
3406 spin_unlock_irq(&ctx->completion_lock);
3407
78e19bbe 3408 io_cqring_add_event(req, ret);
4e88d6e7
JA
3409 if (ret < 0)
3410 req_set_fail_links(req);
e65ef56d 3411 io_put_req(req);
221c5eb2
JA
3412 return 0;
3413}
3414
b0dd8a41 3415static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
221c5eb2 3416{
a197f664
JL
3417 struct io_ring_ctx *ctx = req->ctx;
3418
8c838788 3419 req->poll.done = true;
b0dd8a41
JA
3420 if (error)
3421 io_cqring_fill_event(req, error);
3422 else
3423 io_cqring_fill_event(req, mangle_poll(mask));
8c838788 3424 io_commit_cqring(ctx);
221c5eb2
JA
3425}
3426
561fb04a 3427static void io_poll_complete_work(struct io_wq_work **workptr)
221c5eb2 3428{
561fb04a 3429 struct io_wq_work *work = *workptr;
221c5eb2
JA
3430 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3431 struct io_poll_iocb *poll = &req->poll;
3432 struct poll_table_struct pt = { ._key = poll->events };
3433 struct io_ring_ctx *ctx = req->ctx;
89723d0b 3434 struct io_kiocb *nxt = NULL;
221c5eb2 3435 __poll_t mask = 0;
b0dd8a41 3436 int ret = 0;
221c5eb2 3437
b0dd8a41 3438 if (work->flags & IO_WQ_WORK_CANCEL) {
561fb04a 3439 WRITE_ONCE(poll->canceled, true);
b0dd8a41
JA
3440 ret = -ECANCELED;
3441 } else if (READ_ONCE(poll->canceled)) {
3442 ret = -ECANCELED;
3443 }
561fb04a 3444
b0dd8a41 3445 if (ret != -ECANCELED)
221c5eb2
JA
3446 mask = vfs_poll(poll->file, &pt) & poll->events;
3447
3448 /*
3449 * Note that ->ki_cancel callers also delete iocb from active_reqs after
3450 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
3451 * synchronize with them. In the cancellation case the list_del_init
3452 * itself is not actually needed, but harmless so we keep it in to
3453 * avoid further branches in the fast path.
3454 */
3455 spin_lock_irq(&ctx->completion_lock);
b0dd8a41 3456 if (!mask && ret != -ECANCELED) {
392edb45 3457 add_wait_queue(poll->head, &poll->wait);
221c5eb2
JA
3458 spin_unlock_irq(&ctx->completion_lock);
3459 return;
3460 }
78076bb6 3461 hash_del(&req->hash_node);
b0dd8a41 3462 io_poll_complete(req, mask, ret);
221c5eb2
JA
3463 spin_unlock_irq(&ctx->completion_lock);
3464
8c838788 3465 io_cqring_ev_posted(ctx);
89723d0b 3466
4e88d6e7
JA
3467 if (ret < 0)
3468 req_set_fail_links(req);
ec9c02ad 3469 io_put_req_find_next(req, &nxt);
89723d0b 3470 if (nxt)
78912934 3471 io_wq_assign_next(workptr, nxt);
221c5eb2
JA
3472}
3473
e94f141b
JA
3474static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
3475{
e94f141b 3476 struct io_kiocb *req, *tmp;
8237e045 3477 struct req_batch rb;
e94f141b 3478
c6ca97b3 3479 rb.to_free = rb.need_iter = 0;
e94f141b
JA
3480 spin_lock_irq(&ctx->completion_lock);
3481 llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
3482 hash_del(&req->hash_node);
3483 io_poll_complete(req, req->result, 0);
3484
8237e045
JA
3485 if (refcount_dec_and_test(&req->refs) &&
3486 !io_req_multi_free(&rb, req)) {
3487 req->flags |= REQ_F_COMP_LOCKED;
3488 io_free_req(req);
e94f141b
JA
3489 }
3490 }
3491 spin_unlock_irq(&ctx->completion_lock);
3492
3493 io_cqring_ev_posted(ctx);
8237e045 3494 io_free_req_many(ctx, &rb);
e94f141b
JA
3495}
3496
3497static void io_poll_flush(struct io_wq_work **workptr)
3498{
3499 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3500 struct llist_node *nodes;
3501
3502 nodes = llist_del_all(&req->ctx->poll_llist);
3503 if (nodes)
3504 __io_poll_flush(req->ctx, nodes);
3505}
3506
221c5eb2
JA
3507static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3508 void *key)
3509{
e944475e 3510 struct io_poll_iocb *poll = wait->private;
221c5eb2
JA
3511 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
3512 struct io_ring_ctx *ctx = req->ctx;
3513 __poll_t mask = key_to_poll(key);
221c5eb2
JA
3514
3515 /* for instances that support it check for an event match first: */
8c838788
JA
3516 if (mask && !(mask & poll->events))
3517 return 0;
221c5eb2 3518
392edb45 3519 list_del_init(&poll->wait.entry);
221c5eb2 3520
7c9e7f0f
JA
3521 /*
3522 * Run completion inline if we can. We're using trylock here because
3523 * we are violating the completion_lock -> poll wq lock ordering.
3524 * If we have a link timeout we're going to need the completion_lock
3525 * for finalizing the request, mark us as having grabbed that already.
3526 */
e94f141b
JA
3527 if (mask) {
3528 unsigned long flags;
221c5eb2 3529
e94f141b
JA
3530 if (llist_empty(&ctx->poll_llist) &&
3531 spin_trylock_irqsave(&ctx->completion_lock, flags)) {
3532 hash_del(&req->hash_node);
3533 io_poll_complete(req, mask, 0);
3534 req->flags |= REQ_F_COMP_LOCKED;
3535 io_put_req(req);
3536 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3537
3538 io_cqring_ev_posted(ctx);
3539 req = NULL;
3540 } else {
3541 req->result = mask;
3542 req->llist_node.next = NULL;
3543 /* if the list wasn't empty, we're done */
3544 if (!llist_add(&req->llist_node, &ctx->poll_llist))
3545 req = NULL;
3546 else
3547 req->work.func = io_poll_flush;
3548 }
221c5eb2 3549 }
e94f141b
JA
3550 if (req)
3551 io_queue_async_work(req);
221c5eb2 3552
221c5eb2
JA
3553 return 1;
3554}
3555
3556struct io_poll_table {
3557 struct poll_table_struct pt;
3558 struct io_kiocb *req;
3559 int error;
3560};
3561
3562static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
3563 struct poll_table_struct *p)
3564{
3565 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3566
3567 if (unlikely(pt->req->poll.head)) {
3568 pt->error = -EINVAL;
3569 return;
3570 }
3571
3572 pt->error = 0;
3573 pt->req->poll.head = head;
392edb45 3574 add_wait_queue(head, &pt->req->poll.wait);
221c5eb2
JA
3575}
3576
eac406c6
JA
3577static void io_poll_req_insert(struct io_kiocb *req)
3578{
3579 struct io_ring_ctx *ctx = req->ctx;
78076bb6
JA
3580 struct hlist_head *list;
3581
3582 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3583 hlist_add_head(&req->hash_node, list);
eac406c6
JA
3584}
3585
3529d8c2 3586static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
3587{
3588 struct io_poll_iocb *poll = &req->poll;
221c5eb2 3589 u16 events;
221c5eb2
JA
3590
3591 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3592 return -EINVAL;
3593 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3594 return -EINVAL;
09bb8394
JA
3595 if (!poll->file)
3596 return -EBADF;
221c5eb2 3597
221c5eb2
JA
3598 events = READ_ONCE(sqe->poll_events);
3599 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
0969e783
JA
3600 return 0;
3601}
3602
3603static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
3604{
3605 struct io_poll_iocb *poll = &req->poll;
3606 struct io_ring_ctx *ctx = req->ctx;
3607 struct io_poll_table ipt;
3608 bool cancel = false;
3609 __poll_t mask;
0969e783
JA
3610
3611 INIT_IO_WORK(&req->work, io_poll_complete_work);
78076bb6 3612 INIT_HLIST_NODE(&req->hash_node);
221c5eb2 3613
221c5eb2 3614 poll->head = NULL;
8c838788 3615 poll->done = false;
221c5eb2
JA
3616 poll->canceled = false;
3617
3618 ipt.pt._qproc = io_poll_queue_proc;
3619 ipt.pt._key = poll->events;
3620 ipt.req = req;
3621 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
3622
3623 /* initialized the list so that we can do list_empty checks */
392edb45
JA
3624 INIT_LIST_HEAD(&poll->wait.entry);
3625 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
3626 poll->wait.private = poll;
221c5eb2 3627
36703247
JA
3628 INIT_LIST_HEAD(&req->list);
3629
221c5eb2 3630 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
221c5eb2
JA
3631
3632 spin_lock_irq(&ctx->completion_lock);
8c838788
JA
3633 if (likely(poll->head)) {
3634 spin_lock(&poll->head->lock);
392edb45 3635 if (unlikely(list_empty(&poll->wait.entry))) {
8c838788
JA
3636 if (ipt.error)
3637 cancel = true;
3638 ipt.error = 0;
3639 mask = 0;
3640 }
3641 if (mask || ipt.error)
392edb45 3642 list_del_init(&poll->wait.entry);
8c838788
JA
3643 else if (cancel)
3644 WRITE_ONCE(poll->canceled, true);
3645 else if (!poll->done) /* actually waiting for an event */
eac406c6 3646 io_poll_req_insert(req);
8c838788
JA
3647 spin_unlock(&poll->head->lock);
3648 }
3649 if (mask) { /* no async, we'd stolen it */
221c5eb2 3650 ipt.error = 0;
b0dd8a41 3651 io_poll_complete(req, mask, 0);
221c5eb2 3652 }
221c5eb2
JA
3653 spin_unlock_irq(&ctx->completion_lock);
3654
8c838788
JA
3655 if (mask) {
3656 io_cqring_ev_posted(ctx);
ec9c02ad 3657 io_put_req_find_next(req, nxt);
221c5eb2 3658 }
8c838788 3659 return ipt.error;
221c5eb2
JA
3660}
3661
5262f567
JA
3662static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
3663{
ad8a48ac
JA
3664 struct io_timeout_data *data = container_of(timer,
3665 struct io_timeout_data, timer);
3666 struct io_kiocb *req = data->req;
3667 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
3668 unsigned long flags;
3669
5262f567
JA
3670 atomic_inc(&ctx->cq_timeouts);
3671
3672 spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681a 3673 /*
11365043
JA
3674 * We could be racing with timeout deletion. If the list is empty,
3675 * then timeout lookup already found it and will be handling it.
ef03681a 3676 */
842f9612 3677 if (!list_empty(&req->list)) {
11365043 3678 struct io_kiocb *prev;
5262f567 3679
11365043
JA
3680 /*
3681 * Adjust the reqs sequence before the current one because it
d195a66e 3682 * will consume a slot in the cq_ring and the cq_tail
11365043
JA
3683 * pointer will be increased, otherwise other timeout reqs may
3684 * return in advance without waiting for enough wait_nr.
3685 */
3686 prev = req;
3687 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
3688 prev->sequence++;
11365043 3689 list_del_init(&req->list);
11365043 3690 }
5262f567 3691
78e19bbe 3692 io_cqring_fill_event(req, -ETIME);
5262f567
JA
3693 io_commit_cqring(ctx);
3694 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3695
3696 io_cqring_ev_posted(ctx);
4e88d6e7 3697 req_set_fail_links(req);
5262f567
JA
3698 io_put_req(req);
3699 return HRTIMER_NORESTART;
3700}
3701
47f46768
JA
3702static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
3703{
3704 struct io_kiocb *req;
3705 int ret = -ENOENT;
3706
3707 list_for_each_entry(req, &ctx->timeout_list, list) {
3708 if (user_data == req->user_data) {
3709 list_del_init(&req->list);
3710 ret = 0;
3711 break;
3712 }
3713 }
3714
3715 if (ret == -ENOENT)
3716 return ret;
3717
2d28390a 3718 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
47f46768
JA
3719 if (ret == -1)
3720 return -EALREADY;
3721
4e88d6e7 3722 req_set_fail_links(req);
47f46768
JA
3723 io_cqring_fill_event(req, -ECANCELED);
3724 io_put_req(req);
3725 return 0;
3726}
3727
3529d8c2
JA
3728static int io_timeout_remove_prep(struct io_kiocb *req,
3729 const struct io_uring_sqe *sqe)
b29472ee 3730{
b29472ee
JA
3731 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3732 return -EINVAL;
3733 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
3734 return -EINVAL;
3735
3736 req->timeout.addr = READ_ONCE(sqe->addr);
3737 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
3738 if (req->timeout.flags)
3739 return -EINVAL;
3740
b29472ee
JA
3741 return 0;
3742}
3743
11365043
JA
3744/*
3745 * Remove or update an existing timeout command
3746 */
fc4df999 3747static int io_timeout_remove(struct io_kiocb *req)
11365043
JA
3748{
3749 struct io_ring_ctx *ctx = req->ctx;
47f46768 3750 int ret;
11365043 3751
11365043 3752 spin_lock_irq(&ctx->completion_lock);
b29472ee 3753 ret = io_timeout_cancel(ctx, req->timeout.addr);
11365043 3754
47f46768 3755 io_cqring_fill_event(req, ret);
11365043
JA
3756 io_commit_cqring(ctx);
3757 spin_unlock_irq(&ctx->completion_lock);
5262f567 3758 io_cqring_ev_posted(ctx);
4e88d6e7
JA
3759 if (ret < 0)
3760 req_set_fail_links(req);
ec9c02ad 3761 io_put_req(req);
11365043 3762 return 0;
5262f567
JA
3763}
3764
3529d8c2 3765static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 3766 bool is_timeout_link)
5262f567 3767{
ad8a48ac 3768 struct io_timeout_data *data;
a41525ab 3769 unsigned flags;
5262f567 3770
ad8a48ac 3771 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 3772 return -EINVAL;
ad8a48ac 3773 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 3774 return -EINVAL;
2d28390a
JA
3775 if (sqe->off && is_timeout_link)
3776 return -EINVAL;
a41525ab
JA
3777 flags = READ_ONCE(sqe->timeout_flags);
3778 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 3779 return -EINVAL;
bdf20073 3780
26a61679
JA
3781 req->timeout.count = READ_ONCE(sqe->off);
3782
3529d8c2 3783 if (!req->io && io_alloc_async_ctx(req))
26a61679
JA
3784 return -ENOMEM;
3785
3786 data = &req->io->timeout;
ad8a48ac 3787 data->req = req;
ad8a48ac
JA
3788 req->flags |= REQ_F_TIMEOUT;
3789
3790 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
3791 return -EFAULT;
3792
11365043 3793 if (flags & IORING_TIMEOUT_ABS)
ad8a48ac 3794 data->mode = HRTIMER_MODE_ABS;
11365043 3795 else
ad8a48ac 3796 data->mode = HRTIMER_MODE_REL;
11365043 3797
ad8a48ac
JA
3798 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
3799 return 0;
3800}
3801
fc4df999 3802static int io_timeout(struct io_kiocb *req)
ad8a48ac
JA
3803{
3804 unsigned count;
3805 struct io_ring_ctx *ctx = req->ctx;
3806 struct io_timeout_data *data;
3807 struct list_head *entry;
3808 unsigned span = 0;
ad8a48ac 3809
2d28390a 3810 data = &req->io->timeout;
93bd25bb 3811
5262f567
JA
3812 /*
3813 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
3814 * timeout event to be satisfied. If it isn't set, then this is
3815 * a pure timeout request, sequence isn't used.
5262f567 3816 */
26a61679 3817 count = req->timeout.count;
93bd25bb
JA
3818 if (!count) {
3819 req->flags |= REQ_F_TIMEOUT_NOSEQ;
3820 spin_lock_irq(&ctx->completion_lock);
3821 entry = ctx->timeout_list.prev;
3822 goto add;
3823 }
5262f567
JA
3824
3825 req->sequence = ctx->cached_sq_head + count - 1;
2d28390a 3826 data->seq_offset = count;
5262f567
JA
3827
3828 /*
3829 * Insertion sort, ensuring the first entry in the list is always
3830 * the one we need first.
3831 */
5262f567
JA
3832 spin_lock_irq(&ctx->completion_lock);
3833 list_for_each_prev(entry, &ctx->timeout_list) {
3834 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1a 3835 unsigned nxt_sq_head;
3836 long long tmp, tmp_nxt;
2d28390a 3837 u32 nxt_offset = nxt->io->timeout.seq_offset;
5262f567 3838
93bd25bb
JA
3839 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
3840 continue;
3841
5da0fb1a 3842 /*
3843 * Since cached_sq_head + count - 1 can overflow, use type long
3844 * long to store it.
3845 */
3846 tmp = (long long)ctx->cached_sq_head + count - 1;
cc42e0ac
PB
3847 nxt_sq_head = nxt->sequence - nxt_offset + 1;
3848 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
5da0fb1a 3849
3850 /*
3851 * cached_sq_head may overflow, and it will never overflow twice
3852 * once there is some timeout req still be valid.
3853 */
3854 if (ctx->cached_sq_head < nxt_sq_head)
8b07a65a 3855 tmp += UINT_MAX;
5da0fb1a 3856
a1f58ba4 3857 if (tmp > tmp_nxt)
5262f567 3858 break;
a1f58ba4 3859
3860 /*
3861 * Sequence of reqs after the insert one and itself should
3862 * be adjusted because each timeout req consumes a slot.
3863 */
3864 span++;
3865 nxt->sequence++;
5262f567 3866 }
a1f58ba4 3867 req->sequence -= span;
93bd25bb 3868add:
5262f567 3869 list_add(&req->list, entry);
ad8a48ac
JA
3870 data->timer.function = io_timeout_fn;
3871 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 3872 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
3873 return 0;
3874}
5262f567 3875
62755e35
JA
3876static bool io_cancel_cb(struct io_wq_work *work, void *data)
3877{
3878 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3879
3880 return req->user_data == (unsigned long) data;
3881}
3882
e977d6d3 3883static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 3884{
62755e35 3885 enum io_wq_cancel cancel_ret;
62755e35
JA
3886 int ret = 0;
3887
62755e35
JA
3888 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
3889 switch (cancel_ret) {
3890 case IO_WQ_CANCEL_OK:
3891 ret = 0;
3892 break;
3893 case IO_WQ_CANCEL_RUNNING:
3894 ret = -EALREADY;
3895 break;
3896 case IO_WQ_CANCEL_NOTFOUND:
3897 ret = -ENOENT;
3898 break;
3899 }
3900
e977d6d3
JA
3901 return ret;
3902}
3903
47f46768
JA
3904static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
3905 struct io_kiocb *req, __u64 sqe_addr,
b0dd8a41 3906 struct io_kiocb **nxt, int success_ret)
47f46768
JA
3907{
3908 unsigned long flags;
3909 int ret;
3910
3911 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3912 if (ret != -ENOENT) {
3913 spin_lock_irqsave(&ctx->completion_lock, flags);
3914 goto done;
3915 }
3916
3917 spin_lock_irqsave(&ctx->completion_lock, flags);
3918 ret = io_timeout_cancel(ctx, sqe_addr);
3919 if (ret != -ENOENT)
3920 goto done;
3921 ret = io_poll_cancel(ctx, sqe_addr);
3922done:
b0dd8a41
JA
3923 if (!ret)
3924 ret = success_ret;
47f46768
JA
3925 io_cqring_fill_event(req, ret);
3926 io_commit_cqring(ctx);
3927 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3928 io_cqring_ev_posted(ctx);
3929
4e88d6e7
JA
3930 if (ret < 0)
3931 req_set_fail_links(req);
47f46768
JA
3932 io_put_req_find_next(req, nxt);
3933}
3934
3529d8c2
JA
3935static int io_async_cancel_prep(struct io_kiocb *req,
3936 const struct io_uring_sqe *sqe)
e977d6d3 3937{
fbf23849 3938 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3
JA
3939 return -EINVAL;
3940 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
3941 sqe->cancel_flags)
3942 return -EINVAL;
3943
fbf23849
JA
3944 req->cancel.addr = READ_ONCE(sqe->addr);
3945 return 0;
3946}
3947
3948static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
3949{
3950 struct io_ring_ctx *ctx = req->ctx;
fbf23849
JA
3951
3952 io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
5262f567
JA
3953 return 0;
3954}
3955
05f3fb3c
JA
3956static int io_files_update_prep(struct io_kiocb *req,
3957 const struct io_uring_sqe *sqe)
3958{
3959 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
3960 return -EINVAL;
3961
3962 req->files_update.offset = READ_ONCE(sqe->off);
3963 req->files_update.nr_args = READ_ONCE(sqe->len);
3964 if (!req->files_update.nr_args)
3965 return -EINVAL;
3966 req->files_update.arg = READ_ONCE(sqe->addr);
3967 return 0;
3968}
3969
3970static int io_files_update(struct io_kiocb *req, bool force_nonblock)
3971{
3972 struct io_ring_ctx *ctx = req->ctx;
3973 struct io_uring_files_update up;
3974 int ret;
3975
3976 if (force_nonblock) {
3977 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
3978 return -EAGAIN;
3979 }
3980
3981 up.offset = req->files_update.offset;
3982 up.fds = req->files_update.arg;
3983
3984 mutex_lock(&ctx->uring_lock);
3985 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
3986 mutex_unlock(&ctx->uring_lock);
3987
3988 if (ret < 0)
3989 req_set_fail_links(req);
3990 io_cqring_add_event(req, ret);
3991 io_put_req(req);
3992 return 0;
3993}
3994
3529d8c2
JA
3995static int io_req_defer_prep(struct io_kiocb *req,
3996 const struct io_uring_sqe *sqe)
f67676d1 3997{
e781573e 3998 ssize_t ret = 0;
f67676d1 3999
d625c6ee 4000 switch (req->opcode) {
e781573e
JA
4001 case IORING_OP_NOP:
4002 break;
f67676d1
JA
4003 case IORING_OP_READV:
4004 case IORING_OP_READ_FIXED:
3a6820f2 4005 case IORING_OP_READ:
3529d8c2 4006 ret = io_read_prep(req, sqe, true);
f67676d1
JA
4007 break;
4008 case IORING_OP_WRITEV:
4009 case IORING_OP_WRITE_FIXED:
3a6820f2 4010 case IORING_OP_WRITE:
3529d8c2 4011 ret = io_write_prep(req, sqe, true);
f67676d1 4012 break;
0969e783 4013 case IORING_OP_POLL_ADD:
3529d8c2 4014 ret = io_poll_add_prep(req, sqe);
0969e783
JA
4015 break;
4016 case IORING_OP_POLL_REMOVE:
3529d8c2 4017 ret = io_poll_remove_prep(req, sqe);
0969e783 4018 break;
8ed8d3c3 4019 case IORING_OP_FSYNC:
3529d8c2 4020 ret = io_prep_fsync(req, sqe);
8ed8d3c3
JA
4021 break;
4022 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2 4023 ret = io_prep_sfr(req, sqe);
8ed8d3c3 4024 break;
03b1230c 4025 case IORING_OP_SENDMSG:
fddaface 4026 case IORING_OP_SEND:
3529d8c2 4027 ret = io_sendmsg_prep(req, sqe);
03b1230c
JA
4028 break;
4029 case IORING_OP_RECVMSG:
fddaface 4030 case IORING_OP_RECV:
3529d8c2 4031 ret = io_recvmsg_prep(req, sqe);
03b1230c 4032 break;
f499a021 4033 case IORING_OP_CONNECT:
3529d8c2 4034 ret = io_connect_prep(req, sqe);
f499a021 4035 break;
2d28390a 4036 case IORING_OP_TIMEOUT:
3529d8c2 4037 ret = io_timeout_prep(req, sqe, false);
b7bb4f7d 4038 break;
b29472ee 4039 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2 4040 ret = io_timeout_remove_prep(req, sqe);
b29472ee 4041 break;
fbf23849 4042 case IORING_OP_ASYNC_CANCEL:
3529d8c2 4043 ret = io_async_cancel_prep(req, sqe);
fbf23849 4044 break;
2d28390a 4045 case IORING_OP_LINK_TIMEOUT:
3529d8c2 4046 ret = io_timeout_prep(req, sqe, true);
b7bb4f7d 4047 break;
8ed8d3c3 4048 case IORING_OP_ACCEPT:
3529d8c2 4049 ret = io_accept_prep(req, sqe);
8ed8d3c3 4050 break;
d63d1b5e
JA
4051 case IORING_OP_FALLOCATE:
4052 ret = io_fallocate_prep(req, sqe);
4053 break;
15b71abe
JA
4054 case IORING_OP_OPENAT:
4055 ret = io_openat_prep(req, sqe);
4056 break;
b5dba59e
JA
4057 case IORING_OP_CLOSE:
4058 ret = io_close_prep(req, sqe);
4059 break;
05f3fb3c
JA
4060 case IORING_OP_FILES_UPDATE:
4061 ret = io_files_update_prep(req, sqe);
4062 break;
eddc7ef5
JA
4063 case IORING_OP_STATX:
4064 ret = io_statx_prep(req, sqe);
4065 break;
4840e418
JA
4066 case IORING_OP_FADVISE:
4067 ret = io_fadvise_prep(req, sqe);
4068 break;
c1ca757b
JA
4069 case IORING_OP_MADVISE:
4070 ret = io_madvise_prep(req, sqe);
4071 break;
cebdb986
JA
4072 case IORING_OP_OPENAT2:
4073 ret = io_openat2_prep(req, sqe);
4074 break;
f67676d1 4075 default:
e781573e
JA
4076 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4077 req->opcode);
4078 ret = -EINVAL;
b7bb4f7d 4079 break;
f67676d1
JA
4080 }
4081
b7bb4f7d 4082 return ret;
f67676d1
JA
4083}
4084
3529d8c2 4085static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
de0617e4 4086{
a197f664 4087 struct io_ring_ctx *ctx = req->ctx;
f67676d1 4088 int ret;
de0617e4 4089
9d858b21
BL
4090 /* Still need defer if there is pending req in defer list. */
4091 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
de0617e4
JA
4092 return 0;
4093
3529d8c2 4094 if (!req->io && io_alloc_async_ctx(req))
de0617e4
JA
4095 return -EAGAIN;
4096
3529d8c2 4097 ret = io_req_defer_prep(req, sqe);
b7bb4f7d 4098 if (ret < 0)
2d28390a 4099 return ret;
2d28390a 4100
de0617e4 4101 spin_lock_irq(&ctx->completion_lock);
9d858b21 4102 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
de0617e4 4103 spin_unlock_irq(&ctx->completion_lock);
de0617e4
JA
4104 return 0;
4105 }
4106
915967f6 4107 trace_io_uring_defer(ctx, req, req->user_data);
de0617e4
JA
4108 list_add_tail(&req->list, &ctx->defer_list);
4109 spin_unlock_irq(&ctx->completion_lock);
4110 return -EIOCBQUEUED;
4111}
4112
3529d8c2
JA
4113static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4114 struct io_kiocb **nxt, bool force_nonblock)
2b188cc1 4115{
a197f664 4116 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 4117 int ret;
2b188cc1 4118
d625c6ee 4119 switch (req->opcode) {
2b188cc1 4120 case IORING_OP_NOP:
78e19bbe 4121 ret = io_nop(req);
2b188cc1
JA
4122 break;
4123 case IORING_OP_READV:
edafccee 4124 case IORING_OP_READ_FIXED:
3a6820f2 4125 case IORING_OP_READ:
3529d8c2
JA
4126 if (sqe) {
4127 ret = io_read_prep(req, sqe, force_nonblock);
4128 if (ret < 0)
4129 break;
4130 }
267bc904 4131 ret = io_read(req, nxt, force_nonblock);
edafccee 4132 break;
3529d8c2 4133 case IORING_OP_WRITEV:
edafccee 4134 case IORING_OP_WRITE_FIXED:
3a6820f2 4135 case IORING_OP_WRITE:
3529d8c2
JA
4136 if (sqe) {
4137 ret = io_write_prep(req, sqe, force_nonblock);
4138 if (ret < 0)
4139 break;
4140 }
267bc904 4141 ret = io_write(req, nxt, force_nonblock);
2b188cc1 4142 break;
c992fe29 4143 case IORING_OP_FSYNC:
3529d8c2
JA
4144 if (sqe) {
4145 ret = io_prep_fsync(req, sqe);
4146 if (ret < 0)
4147 break;
4148 }
fc4df999 4149 ret = io_fsync(req, nxt, force_nonblock);
c992fe29 4150 break;
221c5eb2 4151 case IORING_OP_POLL_ADD:
3529d8c2
JA
4152 if (sqe) {
4153 ret = io_poll_add_prep(req, sqe);
4154 if (ret)
4155 break;
4156 }
fc4df999 4157 ret = io_poll_add(req, nxt);
221c5eb2
JA
4158 break;
4159 case IORING_OP_POLL_REMOVE:
3529d8c2
JA
4160 if (sqe) {
4161 ret = io_poll_remove_prep(req, sqe);
4162 if (ret < 0)
4163 break;
4164 }
fc4df999 4165 ret = io_poll_remove(req);
221c5eb2 4166 break;
5d17b4a4 4167 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2
JA
4168 if (sqe) {
4169 ret = io_prep_sfr(req, sqe);
4170 if (ret < 0)
4171 break;
4172 }
fc4df999 4173 ret = io_sync_file_range(req, nxt, force_nonblock);
5d17b4a4 4174 break;
0fa03c62 4175 case IORING_OP_SENDMSG:
fddaface 4176 case IORING_OP_SEND:
3529d8c2
JA
4177 if (sqe) {
4178 ret = io_sendmsg_prep(req, sqe);
4179 if (ret < 0)
4180 break;
4181 }
fddaface
JA
4182 if (req->opcode == IORING_OP_SENDMSG)
4183 ret = io_sendmsg(req, nxt, force_nonblock);
4184 else
4185 ret = io_send(req, nxt, force_nonblock);
0fa03c62 4186 break;
aa1fa28f 4187 case IORING_OP_RECVMSG:
fddaface 4188 case IORING_OP_RECV:
3529d8c2
JA
4189 if (sqe) {
4190 ret = io_recvmsg_prep(req, sqe);
4191 if (ret)
4192 break;
4193 }
fddaface
JA
4194 if (req->opcode == IORING_OP_RECVMSG)
4195 ret = io_recvmsg(req, nxt, force_nonblock);
4196 else
4197 ret = io_recv(req, nxt, force_nonblock);
aa1fa28f 4198 break;
5262f567 4199 case IORING_OP_TIMEOUT:
3529d8c2
JA
4200 if (sqe) {
4201 ret = io_timeout_prep(req, sqe, false);
4202 if (ret)
4203 break;
4204 }
fc4df999 4205 ret = io_timeout(req);
5262f567 4206 break;
11365043 4207 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2
JA
4208 if (sqe) {
4209 ret = io_timeout_remove_prep(req, sqe);
4210 if (ret)
4211 break;
4212 }
fc4df999 4213 ret = io_timeout_remove(req);
11365043 4214 break;
17f2fe35 4215 case IORING_OP_ACCEPT:
3529d8c2
JA
4216 if (sqe) {
4217 ret = io_accept_prep(req, sqe);
4218 if (ret)
4219 break;
4220 }
fc4df999 4221 ret = io_accept(req, nxt, force_nonblock);
17f2fe35 4222 break;
f8e85cf2 4223 case IORING_OP_CONNECT:
3529d8c2
JA
4224 if (sqe) {
4225 ret = io_connect_prep(req, sqe);
4226 if (ret)
4227 break;
4228 }
fc4df999 4229 ret = io_connect(req, nxt, force_nonblock);
f8e85cf2 4230 break;
62755e35 4231 case IORING_OP_ASYNC_CANCEL:
3529d8c2
JA
4232 if (sqe) {
4233 ret = io_async_cancel_prep(req, sqe);
4234 if (ret)
4235 break;
4236 }
fc4df999 4237 ret = io_async_cancel(req, nxt);
62755e35 4238 break;
d63d1b5e
JA
4239 case IORING_OP_FALLOCATE:
4240 if (sqe) {
4241 ret = io_fallocate_prep(req, sqe);
4242 if (ret)
4243 break;
4244 }
4245 ret = io_fallocate(req, nxt, force_nonblock);
4246 break;
15b71abe
JA
4247 case IORING_OP_OPENAT:
4248 if (sqe) {
4249 ret = io_openat_prep(req, sqe);
4250 if (ret)
4251 break;
4252 }
4253 ret = io_openat(req, nxt, force_nonblock);
4254 break;
b5dba59e
JA
4255 case IORING_OP_CLOSE:
4256 if (sqe) {
4257 ret = io_close_prep(req, sqe);
4258 if (ret)
4259 break;
4260 }
4261 ret = io_close(req, nxt, force_nonblock);
4262 break;
05f3fb3c
JA
4263 case IORING_OP_FILES_UPDATE:
4264 if (sqe) {
4265 ret = io_files_update_prep(req, sqe);
4266 if (ret)
4267 break;
4268 }
4269 ret = io_files_update(req, force_nonblock);
4270 break;
eddc7ef5
JA
4271 case IORING_OP_STATX:
4272 if (sqe) {
4273 ret = io_statx_prep(req, sqe);
4274 if (ret)
4275 break;
4276 }
4277 ret = io_statx(req, nxt, force_nonblock);
4278 break;
4840e418
JA
4279 case IORING_OP_FADVISE:
4280 if (sqe) {
4281 ret = io_fadvise_prep(req, sqe);
4282 if (ret)
4283 break;
4284 }
4285 ret = io_fadvise(req, nxt, force_nonblock);
4286 break;
c1ca757b
JA
4287 case IORING_OP_MADVISE:
4288 if (sqe) {
4289 ret = io_madvise_prep(req, sqe);
4290 if (ret)
4291 break;
4292 }
4293 ret = io_madvise(req, nxt, force_nonblock);
4294 break;
cebdb986
JA
4295 case IORING_OP_OPENAT2:
4296 if (sqe) {
4297 ret = io_openat2_prep(req, sqe);
4298 if (ret)
4299 break;
4300 }
4301 ret = io_openat2(req, nxt, force_nonblock);
4302 break;
2b188cc1
JA
4303 default:
4304 ret = -EINVAL;
4305 break;
4306 }
4307
def596e9
JA
4308 if (ret)
4309 return ret;
4310
4311 if (ctx->flags & IORING_SETUP_IOPOLL) {
11ba820b
JA
4312 const bool in_async = io_wq_current_is_worker();
4313
9e645e11 4314 if (req->result == -EAGAIN)
def596e9
JA
4315 return -EAGAIN;
4316
11ba820b
JA
4317 /* workqueue context doesn't hold uring_lock, grab it now */
4318 if (in_async)
4319 mutex_lock(&ctx->uring_lock);
4320
def596e9 4321 io_iopoll_req_issued(req);
11ba820b
JA
4322
4323 if (in_async)
4324 mutex_unlock(&ctx->uring_lock);
def596e9
JA
4325 }
4326
4327 return 0;
2b188cc1
JA
4328}
4329
561fb04a 4330static void io_wq_submit_work(struct io_wq_work **workptr)
2b188cc1 4331{
561fb04a 4332 struct io_wq_work *work = *workptr;
2b188cc1 4333 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
561fb04a
JA
4334 struct io_kiocb *nxt = NULL;
4335 int ret = 0;
2b188cc1 4336
0c9d5ccd
JA
4337 /* if NO_CANCEL is set, we must still run the work */
4338 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4339 IO_WQ_WORK_CANCEL) {
561fb04a 4340 ret = -ECANCELED;
0c9d5ccd 4341 }
31b51510 4342
561fb04a 4343 if (!ret) {
cf6fd4bd
PB
4344 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
4345 req->in_async = true;
561fb04a 4346 do {
3529d8c2 4347 ret = io_issue_sqe(req, NULL, &nxt, false);
561fb04a
JA
4348 /*
4349 * We can get EAGAIN for polled IO even though we're
4350 * forcing a sync submission from here, since we can't
4351 * wait for request slots on the block side.
4352 */
4353 if (ret != -EAGAIN)
4354 break;
4355 cond_resched();
4356 } while (1);
4357 }
31b51510 4358
561fb04a 4359 /* drop submission reference */
ec9c02ad 4360 io_put_req(req);
817869d2 4361
561fb04a 4362 if (ret) {
4e88d6e7 4363 req_set_fail_links(req);
78e19bbe 4364 io_cqring_add_event(req, ret);
817869d2 4365 io_put_req(req);
edafccee 4366 }
2b188cc1 4367
561fb04a 4368 /* if a dependent link is ready, pass it back */
78912934
JA
4369 if (!ret && nxt)
4370 io_wq_assign_next(workptr, nxt);
2b188cc1
JA
4371}
4372
15b71abe 4373static int io_req_needs_file(struct io_kiocb *req, int fd)
09bb8394 4374{
d3656344 4375 if (!io_op_defs[req->opcode].needs_file)
9e3aa61a 4376 return 0;
d3656344
JA
4377 if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
4378 return 0;
4379 return 1;
09bb8394
JA
4380}
4381
65e19f54
JA
4382static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4383 int index)
4384{
4385 struct fixed_file_table *table;
4386
05f3fb3c
JA
4387 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
4388 return table->files[index & IORING_FILE_TABLE_MASK];;
65e19f54
JA
4389}
4390
3529d8c2
JA
4391static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4392 const struct io_uring_sqe *sqe)
09bb8394 4393{
a197f664 4394 struct io_ring_ctx *ctx = req->ctx;
09bb8394 4395 unsigned flags;
d3656344 4396 int fd;
09bb8394 4397
3529d8c2
JA
4398 flags = READ_ONCE(sqe->flags);
4399 fd = READ_ONCE(sqe->fd);
09bb8394 4400
d3656344
JA
4401 if (!io_req_needs_file(req, fd))
4402 return 0;
09bb8394
JA
4403
4404 if (flags & IOSQE_FIXED_FILE) {
05f3fb3c 4405 if (unlikely(!ctx->file_data ||
09bb8394
JA
4406 (unsigned) fd >= ctx->nr_user_files))
4407 return -EBADF;
b7620121 4408 fd = array_index_nospec(fd, ctx->nr_user_files);
65e19f54
JA
4409 req->file = io_file_from_index(ctx, fd);
4410 if (!req->file)
08a45173 4411 return -EBADF;
09bb8394 4412 req->flags |= REQ_F_FIXED_FILE;
05f3fb3c 4413 percpu_ref_get(&ctx->file_data->refs);
09bb8394 4414 } else {
cf6fd4bd 4415 if (req->needs_fixed_file)
09bb8394 4416 return -EBADF;
c826bd7a 4417 trace_io_uring_file_get(ctx, fd);
09bb8394
JA
4418 req->file = io_file_get(state, fd);
4419 if (unlikely(!req->file))
4420 return -EBADF;
4421 }
4422
4423 return 0;
4424}
4425
a197f664 4426static int io_grab_files(struct io_kiocb *req)
fcb323cc
JA
4427{
4428 int ret = -EBADF;
a197f664 4429 struct io_ring_ctx *ctx = req->ctx;
fcb323cc 4430
b14cca0c 4431 if (!ctx->ring_file)
b5dba59e
JA
4432 return -EBADF;
4433
fcb323cc
JA
4434 rcu_read_lock();
4435 spin_lock_irq(&ctx->inflight_lock);
4436 /*
4437 * We use the f_ops->flush() handler to ensure that we can flush
4438 * out work accessing these files if the fd is closed. Check if
4439 * the fd has changed since we started down this path, and disallow
4440 * this operation if it has.
4441 */
b14cca0c 4442 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
fcb323cc
JA
4443 list_add(&req->inflight_entry, &ctx->inflight_list);
4444 req->flags |= REQ_F_INFLIGHT;
4445 req->work.files = current->files;
4446 ret = 0;
4447 }
4448 spin_unlock_irq(&ctx->inflight_lock);
4449 rcu_read_unlock();
4450
4451 return ret;
4452}
4453
2665abfd 4454static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 4455{
ad8a48ac
JA
4456 struct io_timeout_data *data = container_of(timer,
4457 struct io_timeout_data, timer);
4458 struct io_kiocb *req = data->req;
2665abfd
JA
4459 struct io_ring_ctx *ctx = req->ctx;
4460 struct io_kiocb *prev = NULL;
4461 unsigned long flags;
2665abfd
JA
4462
4463 spin_lock_irqsave(&ctx->completion_lock, flags);
4464
4465 /*
4466 * We don't expect the list to be empty, that will only happen if we
4467 * race with the completion of the linked work.
4468 */
4493233e
PB
4469 if (!list_empty(&req->link_list)) {
4470 prev = list_entry(req->link_list.prev, struct io_kiocb,
4471 link_list);
5d960724 4472 if (refcount_inc_not_zero(&prev->refs)) {
4493233e 4473 list_del_init(&req->link_list);
5d960724
JA
4474 prev->flags &= ~REQ_F_LINK_TIMEOUT;
4475 } else
76a46e06 4476 prev = NULL;
2665abfd
JA
4477 }
4478
4479 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4480
4481 if (prev) {
4e88d6e7 4482 req_set_fail_links(prev);
b0dd8a41
JA
4483 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
4484 -ETIME);
76a46e06 4485 io_put_req(prev);
47f46768
JA
4486 } else {
4487 io_cqring_add_event(req, -ETIME);
4488 io_put_req(req);
2665abfd 4489 }
2665abfd
JA
4490 return HRTIMER_NORESTART;
4491}
4492
ad8a48ac 4493static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 4494{
76a46e06 4495 struct io_ring_ctx *ctx = req->ctx;
2665abfd 4496
76a46e06
JA
4497 /*
4498 * If the list is now empty, then our linked request finished before
4499 * we got a chance to setup the timer
4500 */
4501 spin_lock_irq(&ctx->completion_lock);
4493233e 4502 if (!list_empty(&req->link_list)) {
2d28390a 4503 struct io_timeout_data *data = &req->io->timeout;
94ae5e77 4504
ad8a48ac
JA
4505 data->timer.function = io_link_timeout_fn;
4506 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
4507 data->mode);
2665abfd 4508 }
76a46e06 4509 spin_unlock_irq(&ctx->completion_lock);
2665abfd 4510
2665abfd 4511 /* drop submission reference */
76a46e06
JA
4512 io_put_req(req);
4513}
2665abfd 4514
ad8a48ac 4515static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd
JA
4516{
4517 struct io_kiocb *nxt;
4518
4519 if (!(req->flags & REQ_F_LINK))
4520 return NULL;
4521
4493233e
PB
4522 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
4523 link_list);
d625c6ee 4524 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 4525 return NULL;
2665abfd 4526
76a46e06 4527 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 4528 return nxt;
2665abfd
JA
4529}
4530
3529d8c2 4531static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 4532{
4a0a7a18 4533 struct io_kiocb *linked_timeout;
f9bd67f6 4534 struct io_kiocb *nxt = NULL;
e0c5c576 4535 int ret;
2b188cc1 4536
4a0a7a18
JA
4537again:
4538 linked_timeout = io_prep_linked_timeout(req);
4539
3529d8c2 4540 ret = io_issue_sqe(req, sqe, &nxt, true);
491381ce
JA
4541
4542 /*
4543 * We async punt it if the file wasn't marked NOWAIT, or if the file
4544 * doesn't support non-blocking read/write attempts
4545 */
4546 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
4547 (req->flags & REQ_F_MUST_PUNT))) {
bbad27b2
PB
4548 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
4549 ret = io_grab_files(req);
4550 if (ret)
4551 goto err;
2b188cc1 4552 }
bbad27b2
PB
4553
4554 /*
4555 * Queued up for async execution, worker will release
4556 * submit reference when the iocb is actually submitted.
4557 */
4558 io_queue_async_work(req);
4a0a7a18 4559 goto done_req;
2b188cc1 4560 }
e65ef56d 4561
fcb323cc 4562err:
76a46e06 4563 /* drop submission reference */
ec9c02ad 4564 io_put_req(req);
e65ef56d 4565
f9bd67f6 4566 if (linked_timeout) {
76a46e06 4567 if (!ret)
f9bd67f6 4568 io_queue_linked_timeout(linked_timeout);
76a46e06 4569 else
f9bd67f6 4570 io_put_req(linked_timeout);
76a46e06
JA
4571 }
4572
e65ef56d 4573 /* and drop final reference, if we failed */
9e645e11 4574 if (ret) {
78e19bbe 4575 io_cqring_add_event(req, ret);
4e88d6e7 4576 req_set_fail_links(req);
e65ef56d 4577 io_put_req(req);
9e645e11 4578 }
4a0a7a18
JA
4579done_req:
4580 if (nxt) {
4581 req = nxt;
4582 nxt = NULL;
4583 goto again;
4584 }
2b188cc1
JA
4585}
4586
3529d8c2 4587static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4fe2c963
JL
4588{
4589 int ret;
4590
3529d8c2 4591 ret = io_req_defer(req, sqe);
4fe2c963
JL
4592 if (ret) {
4593 if (ret != -EIOCBQUEUED) {
78e19bbe 4594 io_cqring_add_event(req, ret);
4e88d6e7 4595 req_set_fail_links(req);
78e19bbe 4596 io_double_put_req(req);
4fe2c963 4597 }
2550878f 4598 } else if (req->flags & REQ_F_FORCE_ASYNC) {
ce35a47a
JA
4599 /*
4600 * Never try inline submit of IOSQE_ASYNC is set, go straight
4601 * to async execution.
4602 */
4603 req->work.flags |= IO_WQ_WORK_CONCURRENT;
4604 io_queue_async_work(req);
4605 } else {
3529d8c2 4606 __io_queue_sqe(req, sqe);
ce35a47a 4607 }
4fe2c963
JL
4608}
4609
1b4a51b6 4610static inline void io_queue_link_head(struct io_kiocb *req)
4fe2c963 4611{
94ae5e77 4612 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1b4a51b6
PB
4613 io_cqring_add_event(req, -ECANCELED);
4614 io_double_put_req(req);
4615 } else
3529d8c2 4616 io_queue_sqe(req, NULL);
4fe2c963
JL
4617}
4618
4e88d6e7 4619#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
ce35a47a 4620 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
9e645e11 4621
3529d8c2
JA
4622static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4623 struct io_submit_state *state, struct io_kiocb **link)
9e645e11 4624{
a197f664 4625 struct io_ring_ctx *ctx = req->ctx;
32fe525b 4626 unsigned int sqe_flags;
9e645e11
JA
4627 int ret;
4628
32fe525b
PB
4629 sqe_flags = READ_ONCE(sqe->flags);
4630
9e645e11 4631 /* enforce forwards compatibility on users */
32fe525b 4632 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
9e645e11 4633 ret = -EINVAL;
196be95c 4634 goto err_req;
9e645e11 4635 }
6b47ee6e
PB
4636 /* same numerical values with corresponding REQ_F_*, safe to copy */
4637 req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
4638 IOSQE_ASYNC);
9e645e11 4639
3529d8c2 4640 ret = io_req_set_file(state, req, sqe);
9e645e11
JA
4641 if (unlikely(ret)) {
4642err_req:
78e19bbe
JA
4643 io_cqring_add_event(req, ret);
4644 io_double_put_req(req);
2e6e1fde 4645 return false;
9e645e11
JA
4646 }
4647
9e645e11
JA
4648 /*
4649 * If we already have a head request, queue this one for async
4650 * submittal once the head completes. If we don't have a head but
4651 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
4652 * submitted sync once the chain is complete. If none of those
4653 * conditions are true (normal request), then just queue it.
4654 */
4655 if (*link) {
9d76377f 4656 struct io_kiocb *head = *link;
9e645e11 4657
711be031
PB
4658 if (sqe_flags & IOSQE_IO_DRAIN) {
4659 head->flags |= REQ_F_IO_DRAIN;
4660 ctx->drain_next = 1;
4661 }
b7bb4f7d 4662 if (io_alloc_async_ctx(req)) {
9e645e11
JA
4663 ret = -EAGAIN;
4664 goto err_req;
4665 }
4666
3529d8c2 4667 ret = io_req_defer_prep(req, sqe);
2d28390a 4668 if (ret) {
4e88d6e7 4669 /* fail even hard links since we don't submit */
9d76377f 4670 head->flags |= REQ_F_FAIL_LINK;
f67676d1 4671 goto err_req;
2d28390a 4672 }
9d76377f
PB
4673 trace_io_uring_link(ctx, req, head);
4674 list_add_tail(&req->link_list, &head->link_list);
32fe525b
PB
4675
4676 /* last request of a link, enqueue the link */
4677 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
4678 io_queue_link_head(head);
4679 *link = NULL;
4680 }
9e645e11 4681 } else {
711be031
PB
4682 if (unlikely(ctx->drain_next)) {
4683 req->flags |= REQ_F_IO_DRAIN;
4684 req->ctx->drain_next = 0;
4685 }
4686 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
4687 req->flags |= REQ_F_LINK;
711be031
PB
4688 INIT_LIST_HEAD(&req->link_list);
4689 ret = io_req_defer_prep(req, sqe);
4690 if (ret)
4691 req->flags |= REQ_F_FAIL_LINK;
4692 *link = req;
4693 } else {
4694 io_queue_sqe(req, sqe);
4695 }
9e645e11 4696 }
2e6e1fde
PB
4697
4698 return true;
9e645e11
JA
4699}
4700
9a56a232
JA
4701/*
4702 * Batched submission is done, ensure local IO is flushed out.
4703 */
4704static void io_submit_state_end(struct io_submit_state *state)
4705{
4706 blk_finish_plug(&state->plug);
3d6770fb 4707 io_file_put(state);
2579f913
JA
4708 if (state->free_reqs)
4709 kmem_cache_free_bulk(req_cachep, state->free_reqs,
4710 &state->reqs[state->cur_req]);
9a56a232
JA
4711}
4712
4713/*
4714 * Start submission side cache.
4715 */
4716static void io_submit_state_start(struct io_submit_state *state,
22efde59 4717 unsigned int max_ios)
9a56a232
JA
4718{
4719 blk_start_plug(&state->plug);
2579f913 4720 state->free_reqs = 0;
9a56a232
JA
4721 state->file = NULL;
4722 state->ios_left = max_ios;
4723}
4724
2b188cc1
JA
4725static void io_commit_sqring(struct io_ring_ctx *ctx)
4726{
75b28aff 4727 struct io_rings *rings = ctx->rings;
2b188cc1 4728
caf582c6
PB
4729 /*
4730 * Ensure any loads from the SQEs are done at this point,
4731 * since once we write the new head, the application could
4732 * write new data to them.
4733 */
4734 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
4735}
4736
2b188cc1 4737/*
3529d8c2 4738 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
4739 * that is mapped by userspace. This means that care needs to be taken to
4740 * ensure that reads are stable, as we cannot rely on userspace always
4741 * being a good citizen. If members of the sqe are validated and then later
4742 * used, it's important that those reads are done through READ_ONCE() to
4743 * prevent a re-load down the line.
4744 */
3529d8c2
JA
4745static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
4746 const struct io_uring_sqe **sqe_ptr)
2b188cc1 4747{
75b28aff 4748 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
4749 unsigned head;
4750
4751 /*
4752 * The cached sq head (or cq tail) serves two purposes:
4753 *
4754 * 1) allows us to batch the cost of updating the user visible
4755 * head updates.
4756 * 2) allows the kernel side to track the head on its own, even
4757 * though the application is the one updating it.
4758 */
ee7d46d9 4759 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
9835d6fa 4760 if (likely(head < ctx->sq_entries)) {
cf6fd4bd
PB
4761 /*
4762 * All io need record the previous position, if LINK vs DARIN,
4763 * it can be used to mark the position of the first IO in the
4764 * link list.
4765 */
4766 req->sequence = ctx->cached_sq_head;
3529d8c2
JA
4767 *sqe_ptr = &ctx->sq_sqes[head];
4768 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
4769 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
2b188cc1
JA
4770 ctx->cached_sq_head++;
4771 return true;
4772 }
4773
4774 /* drop invalid entries */
4775 ctx->cached_sq_head++;
498ccd9e 4776 ctx->cached_sq_dropped++;
ee7d46d9 4777 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1
JA
4778 return false;
4779}
4780
fb5ccc98 4781static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
ae9428ca
PB
4782 struct file *ring_file, int ring_fd,
4783 struct mm_struct **mm, bool async)
6c271ce2
JA
4784{
4785 struct io_submit_state state, *statep = NULL;
9e645e11 4786 struct io_kiocb *link = NULL;
9e645e11 4787 int i, submitted = 0;
95a1b3ff 4788 bool mm_fault = false;
6c271ce2 4789
c4a2ed72 4790 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8
JA
4791 if (test_bit(0, &ctx->sq_check_overflow)) {
4792 if (!list_empty(&ctx->cq_overflow_list) &&
4793 !io_cqring_overflow_flush(ctx, false))
4794 return -EBUSY;
4795 }
6c271ce2 4796
ee7d46d9
PB
4797 /* make sure SQ entry isn't read before tail */
4798 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 4799
2b85edfc
PB
4800 if (!percpu_ref_tryget_many(&ctx->refs, nr))
4801 return -EAGAIN;
4802
6c271ce2 4803 if (nr > IO_PLUG_THRESHOLD) {
22efde59 4804 io_submit_state_start(&state, nr);
6c271ce2
JA
4805 statep = &state;
4806 }
4807
b14cca0c
PB
4808 ctx->ring_fd = ring_fd;
4809 ctx->ring_file = ring_file;
4810
6c271ce2 4811 for (i = 0; i < nr; i++) {
3529d8c2 4812 const struct io_uring_sqe *sqe;
196be95c 4813 struct io_kiocb *req;
fb5ccc98 4814
196be95c
PB
4815 req = io_get_req(ctx, statep);
4816 if (unlikely(!req)) {
4817 if (!submitted)
4818 submitted = -EAGAIN;
fb5ccc98 4819 break;
196be95c 4820 }
3529d8c2 4821 if (!io_get_sqring(ctx, req, &sqe)) {
2b85edfc 4822 __io_req_do_free(req);
196be95c
PB
4823 break;
4824 }
fb5ccc98 4825
d3656344
JA
4826 /* will complete beyond this point, count as submitted */
4827 submitted++;
4828
4829 if (unlikely(req->opcode >= IORING_OP_LAST)) {
4830 io_cqring_add_event(req, -EINVAL);
4831 io_double_put_req(req);
4832 break;
4833 }
4834
4835 if (io_op_defs[req->opcode].needs_mm && !*mm) {
95a1b3ff
PB
4836 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
4837 if (!mm_fault) {
4838 use_mm(ctx->sqo_mm);
4839 *mm = ctx->sqo_mm;
4840 }
9e645e11 4841 }
9e645e11 4842
cf6fd4bd
PB
4843 req->has_user = *mm != NULL;
4844 req->in_async = async;
4845 req->needs_fixed_file = async;
354420f7
JA
4846 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
4847 true, async);
3529d8c2 4848 if (!io_submit_sqe(req, sqe, statep, &link))
2e6e1fde 4849 break;
6c271ce2
JA
4850 }
4851
2b85edfc
PB
4852 if (submitted != nr)
4853 percpu_ref_put_many(&ctx->refs, nr - submitted);
9e645e11 4854 if (link)
1b4a51b6 4855 io_queue_link_head(link);
6c271ce2
JA
4856 if (statep)
4857 io_submit_state_end(&state);
4858
ae9428ca
PB
4859 /* Commit SQ ring head once we've consumed and submitted all SQEs */
4860 io_commit_sqring(ctx);
4861
6c271ce2
JA
4862 return submitted;
4863}
4864
4865static int io_sq_thread(void *data)
4866{
6c271ce2
JA
4867 struct io_ring_ctx *ctx = data;
4868 struct mm_struct *cur_mm = NULL;
181e448d 4869 const struct cred *old_cred;
6c271ce2
JA
4870 mm_segment_t old_fs;
4871 DEFINE_WAIT(wait);
4872 unsigned inflight;
4873 unsigned long timeout;
c1edbf5f 4874 int ret;
6c271ce2 4875
206aefde 4876 complete(&ctx->completions[1]);
a4c0b3de 4877
6c271ce2
JA
4878 old_fs = get_fs();
4879 set_fs(USER_DS);
181e448d 4880 old_cred = override_creds(ctx->creds);
6c271ce2 4881
c1edbf5f 4882 ret = timeout = inflight = 0;
2bbcd6d3 4883 while (!kthread_should_park()) {
fb5ccc98 4884 unsigned int to_submit;
6c271ce2
JA
4885
4886 if (inflight) {
4887 unsigned nr_events = 0;
4888
4889 if (ctx->flags & IORING_SETUP_IOPOLL) {
2b2ed975
JA
4890 /*
4891 * inflight is the count of the maximum possible
4892 * entries we submitted, but it can be smaller
4893 * if we dropped some of them. If we don't have
4894 * poll entries available, then we know that we
4895 * have nothing left to poll for. Reset the
4896 * inflight count to zero in that case.
4897 */
4898 mutex_lock(&ctx->uring_lock);
4899 if (!list_empty(&ctx->poll_list))
4900 __io_iopoll_check(ctx, &nr_events, 0);
4901 else
4902 inflight = 0;
4903 mutex_unlock(&ctx->uring_lock);
6c271ce2
JA
4904 } else {
4905 /*
4906 * Normal IO, just pretend everything completed.
4907 * We don't have to poll completions for that.
4908 */
4909 nr_events = inflight;
4910 }
4911
4912 inflight -= nr_events;
4913 if (!inflight)
4914 timeout = jiffies + ctx->sq_thread_idle;
4915 }
4916
fb5ccc98 4917 to_submit = io_sqring_entries(ctx);
c1edbf5f
JA
4918
4919 /*
4920 * If submit got -EBUSY, flag us as needing the application
4921 * to enter the kernel to reap and flush events.
4922 */
4923 if (!to_submit || ret == -EBUSY) {
6c271ce2
JA
4924 /*
4925 * We're polling. If we're within the defined idle
4926 * period, then let us spin without work before going
c1edbf5f
JA
4927 * to sleep. The exception is if we got EBUSY doing
4928 * more IO, we should wait for the application to
4929 * reap events and wake us up.
6c271ce2 4930 */
c1edbf5f
JA
4931 if (inflight ||
4932 (!time_after(jiffies, timeout) && ret != -EBUSY)) {
9831a90c 4933 cond_resched();
6c271ce2
JA
4934 continue;
4935 }
4936
4937 /*
4938 * Drop cur_mm before scheduling, we can't hold it for
4939 * long periods (or over schedule()). Do this before
4940 * adding ourselves to the waitqueue, as the unuse/drop
4941 * may sleep.
4942 */
4943 if (cur_mm) {
4944 unuse_mm(cur_mm);
4945 mmput(cur_mm);
4946 cur_mm = NULL;
4947 }
4948
4949 prepare_to_wait(&ctx->sqo_wait, &wait,
4950 TASK_INTERRUPTIBLE);
4951
4952 /* Tell userspace we may need a wakeup call */
75b28aff 4953 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69
SB
4954 /* make sure to read SQ tail after writing flags */
4955 smp_mb();
6c271ce2 4956
fb5ccc98 4957 to_submit = io_sqring_entries(ctx);
c1edbf5f 4958 if (!to_submit || ret == -EBUSY) {
2bbcd6d3 4959 if (kthread_should_park()) {
6c271ce2
JA
4960 finish_wait(&ctx->sqo_wait, &wait);
4961 break;
4962 }
4963 if (signal_pending(current))
4964 flush_signals(current);
4965 schedule();
4966 finish_wait(&ctx->sqo_wait, &wait);
4967
75b28aff 4968 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
4969 continue;
4970 }
4971 finish_wait(&ctx->sqo_wait, &wait);
4972
75b28aff 4973 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
4974 }
4975
8a4955ff 4976 mutex_lock(&ctx->uring_lock);
1d7bb1d5 4977 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
8a4955ff 4978 mutex_unlock(&ctx->uring_lock);
1d7bb1d5
JA
4979 if (ret > 0)
4980 inflight += ret;
6c271ce2
JA
4981 }
4982
4983 set_fs(old_fs);
4984 if (cur_mm) {
4985 unuse_mm(cur_mm);
4986 mmput(cur_mm);
4987 }
181e448d 4988 revert_creds(old_cred);
06058632 4989
2bbcd6d3 4990 kthread_parkme();
06058632 4991
6c271ce2
JA
4992 return 0;
4993}
4994
bda52162
JA
4995struct io_wait_queue {
4996 struct wait_queue_entry wq;
4997 struct io_ring_ctx *ctx;
4998 unsigned to_wait;
4999 unsigned nr_timeouts;
5000};
5001
1d7bb1d5 5002static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
bda52162
JA
5003{
5004 struct io_ring_ctx *ctx = iowq->ctx;
5005
5006 /*
d195a66e 5007 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
5008 * started waiting. For timeouts, we always want to return to userspace,
5009 * regardless of event count.
5010 */
1d7bb1d5 5011 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
bda52162
JA
5012 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5013}
5014
5015static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5016 int wake_flags, void *key)
5017{
5018 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5019 wq);
5020
1d7bb1d5
JA
5021 /* use noflush == true, as we can't safely rely on locking context */
5022 if (!io_should_wake(iowq, true))
bda52162
JA
5023 return -1;
5024
5025 return autoremove_wake_function(curr, mode, wake_flags, key);
5026}
5027
2b188cc1
JA
5028/*
5029 * Wait until events become available, if we don't already have some. The
5030 * application must reap them itself, as they reside on the shared cq ring.
5031 */
5032static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
5033 const sigset_t __user *sig, size_t sigsz)
5034{
bda52162
JA
5035 struct io_wait_queue iowq = {
5036 .wq = {
5037 .private = current,
5038 .func = io_wake_function,
5039 .entry = LIST_HEAD_INIT(iowq.wq.entry),
5040 },
5041 .ctx = ctx,
5042 .to_wait = min_events,
5043 };
75b28aff 5044 struct io_rings *rings = ctx->rings;
e9ffa5c2 5045 int ret = 0;
2b188cc1 5046
1d7bb1d5 5047 if (io_cqring_events(ctx, false) >= min_events)
2b188cc1
JA
5048 return 0;
5049
5050 if (sig) {
9e75ad5d
AB
5051#ifdef CONFIG_COMPAT
5052 if (in_compat_syscall())
5053 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 5054 sigsz);
9e75ad5d
AB
5055 else
5056#endif
b772434b 5057 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 5058
2b188cc1
JA
5059 if (ret)
5060 return ret;
5061 }
5062
bda52162 5063 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 5064 trace_io_uring_cqring_wait(ctx, min_events);
bda52162
JA
5065 do {
5066 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
5067 TASK_INTERRUPTIBLE);
1d7bb1d5 5068 if (io_should_wake(&iowq, false))
bda52162
JA
5069 break;
5070 schedule();
5071 if (signal_pending(current)) {
e9ffa5c2 5072 ret = -EINTR;
bda52162
JA
5073 break;
5074 }
5075 } while (1);
5076 finish_wait(&ctx->wait, &iowq.wq);
5077
e9ffa5c2 5078 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 5079
75b28aff 5080 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
5081}
5082
6b06314c
JA
5083static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5084{
5085#if defined(CONFIG_UNIX)
5086 if (ctx->ring_sock) {
5087 struct sock *sock = ctx->ring_sock->sk;
5088 struct sk_buff *skb;
5089
5090 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
5091 kfree_skb(skb);
5092 }
5093#else
5094 int i;
5095
65e19f54
JA
5096 for (i = 0; i < ctx->nr_user_files; i++) {
5097 struct file *file;
5098
5099 file = io_file_from_index(ctx, i);
5100 if (file)
5101 fput(file);
5102 }
6b06314c
JA
5103#endif
5104}
5105
05f3fb3c
JA
5106static void io_file_ref_kill(struct percpu_ref *ref)
5107{
5108 struct fixed_file_data *data;
5109
5110 data = container_of(ref, struct fixed_file_data, refs);
5111 complete(&data->done);
5112}
5113
6b06314c
JA
5114static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
5115{
05f3fb3c 5116 struct fixed_file_data *data = ctx->file_data;
65e19f54
JA
5117 unsigned nr_tables, i;
5118
05f3fb3c 5119 if (!data)
6b06314c
JA
5120 return -ENXIO;
5121
05f3fb3c 5122 /* protect against inflight atomic switch, which drops the ref */
05f3fb3c 5123 percpu_ref_get(&data->refs);
e46a7950
JA
5124 /* wait for existing switches */
5125 flush_work(&data->ref_work);
05f3fb3c
JA
5126 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
5127 wait_for_completion(&data->done);
5128 percpu_ref_put(&data->refs);
e46a7950
JA
5129 /* flush potential new switch */
5130 flush_work(&data->ref_work);
05f3fb3c
JA
5131 percpu_ref_exit(&data->refs);
5132
6b06314c 5133 __io_sqe_files_unregister(ctx);
65e19f54
JA
5134 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
5135 for (i = 0; i < nr_tables; i++)
05f3fb3c
JA
5136 kfree(data->table[i].files);
5137 kfree(data->table);
5138 kfree(data);
5139 ctx->file_data = NULL;
6b06314c
JA
5140 ctx->nr_user_files = 0;
5141 return 0;
5142}
5143
6c271ce2
JA
5144static void io_sq_thread_stop(struct io_ring_ctx *ctx)
5145{
5146 if (ctx->sqo_thread) {
206aefde 5147 wait_for_completion(&ctx->completions[1]);
2bbcd6d3
RP
5148 /*
5149 * The park is a bit of a work-around, without it we get
5150 * warning spews on shutdown with SQPOLL set and affinity
5151 * set to a single CPU.
5152 */
06058632 5153 kthread_park(ctx->sqo_thread);
6c271ce2
JA
5154 kthread_stop(ctx->sqo_thread);
5155 ctx->sqo_thread = NULL;
5156 }
5157}
5158
6b06314c
JA
5159static void io_finish_async(struct io_ring_ctx *ctx)
5160{
6c271ce2
JA
5161 io_sq_thread_stop(ctx);
5162
561fb04a
JA
5163 if (ctx->io_wq) {
5164 io_wq_destroy(ctx->io_wq);
5165 ctx->io_wq = NULL;
6b06314c
JA
5166 }
5167}
5168
5169#if defined(CONFIG_UNIX)
6b06314c
JA
5170/*
5171 * Ensure the UNIX gc is aware of our file set, so we are certain that
5172 * the io_uring can be safely unregistered on process exit, even if we have
5173 * loops in the file referencing.
5174 */
5175static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
5176{
5177 struct sock *sk = ctx->ring_sock->sk;
5178 struct scm_fp_list *fpl;
5179 struct sk_buff *skb;
08a45173 5180 int i, nr_files;
6b06314c
JA
5181
5182 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
5183 unsigned long inflight = ctx->user->unix_inflight + nr;
5184
5185 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
5186 return -EMFILE;
5187 }
5188
5189 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
5190 if (!fpl)
5191 return -ENOMEM;
5192
5193 skb = alloc_skb(0, GFP_KERNEL);
5194 if (!skb) {
5195 kfree(fpl);
5196 return -ENOMEM;
5197 }
5198
5199 skb->sk = sk;
6b06314c 5200
08a45173 5201 nr_files = 0;
6b06314c
JA
5202 fpl->user = get_uid(ctx->user);
5203 for (i = 0; i < nr; i++) {
65e19f54
JA
5204 struct file *file = io_file_from_index(ctx, i + offset);
5205
5206 if (!file)
08a45173 5207 continue;
65e19f54 5208 fpl->fp[nr_files] = get_file(file);
08a45173
JA
5209 unix_inflight(fpl->user, fpl->fp[nr_files]);
5210 nr_files++;
6b06314c
JA
5211 }
5212
08a45173
JA
5213 if (nr_files) {
5214 fpl->max = SCM_MAX_FD;
5215 fpl->count = nr_files;
5216 UNIXCB(skb).fp = fpl;
05f3fb3c 5217 skb->destructor = unix_destruct_scm;
08a45173
JA
5218 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
5219 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 5220
08a45173
JA
5221 for (i = 0; i < nr_files; i++)
5222 fput(fpl->fp[i]);
5223 } else {
5224 kfree_skb(skb);
5225 kfree(fpl);
5226 }
6b06314c
JA
5227
5228 return 0;
5229}
5230
5231/*
5232 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
5233 * causes regular reference counting to break down. We rely on the UNIX
5234 * garbage collection to take care of this problem for us.
5235 */
5236static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5237{
5238 unsigned left, total;
5239 int ret = 0;
5240
5241 total = 0;
5242 left = ctx->nr_user_files;
5243 while (left) {
5244 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
5245
5246 ret = __io_sqe_files_scm(ctx, this_files, total);
5247 if (ret)
5248 break;
5249 left -= this_files;
5250 total += this_files;
5251 }
5252
5253 if (!ret)
5254 return 0;
5255
5256 while (total < ctx->nr_user_files) {
65e19f54
JA
5257 struct file *file = io_file_from_index(ctx, total);
5258
5259 if (file)
5260 fput(file);
6b06314c
JA
5261 total++;
5262 }
5263
5264 return ret;
5265}
5266#else
5267static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5268{
5269 return 0;
5270}
5271#endif
5272
65e19f54
JA
5273static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5274 unsigned nr_files)
5275{
5276 int i;
5277
5278 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5279 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5280 unsigned this_files;
5281
5282 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5283 table->files = kcalloc(this_files, sizeof(struct file *),
5284 GFP_KERNEL);
5285 if (!table->files)
5286 break;
5287 nr_files -= this_files;
5288 }
5289
5290 if (i == nr_tables)
5291 return 0;
5292
5293 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5294 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5295 kfree(table->files);
5296 }
5297 return 1;
5298}
5299
05f3fb3c
JA
5300static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
5301{
5302#if defined(CONFIG_UNIX)
5303 struct sock *sock = ctx->ring_sock->sk;
5304 struct sk_buff_head list, *head = &sock->sk_receive_queue;
5305 struct sk_buff *skb;
5306 int i;
5307
5308 __skb_queue_head_init(&list);
5309
5310 /*
5311 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5312 * remove this entry and rearrange the file array.
5313 */
5314 skb = skb_dequeue(head);
5315 while (skb) {
5316 struct scm_fp_list *fp;
5317
5318 fp = UNIXCB(skb).fp;
5319 for (i = 0; i < fp->count; i++) {
5320 int left;
5321
5322 if (fp->fp[i] != file)
5323 continue;
5324
5325 unix_notinflight(fp->user, fp->fp[i]);
5326 left = fp->count - 1 - i;
5327 if (left) {
5328 memmove(&fp->fp[i], &fp->fp[i + 1],
5329 left * sizeof(struct file *));
5330 }
5331 fp->count--;
5332 if (!fp->count) {
5333 kfree_skb(skb);
5334 skb = NULL;
5335 } else {
5336 __skb_queue_tail(&list, skb);
5337 }
5338 fput(file);
5339 file = NULL;
5340 break;
5341 }
5342
5343 if (!file)
5344 break;
5345
5346 __skb_queue_tail(&list, skb);
5347
5348 skb = skb_dequeue(head);
5349 }
5350
5351 if (skb_peek(&list)) {
5352 spin_lock_irq(&head->lock);
5353 while ((skb = __skb_dequeue(&list)) != NULL)
5354 __skb_queue_tail(head, skb);
5355 spin_unlock_irq(&head->lock);
5356 }
5357#else
5358 fput(file);
5359#endif
5360}
5361
5362struct io_file_put {
5363 struct llist_node llist;
5364 struct file *file;
5365 struct completion *done;
5366};
5367
5368static void io_ring_file_ref_switch(struct work_struct *work)
5369{
5370 struct io_file_put *pfile, *tmp;
5371 struct fixed_file_data *data;
5372 struct llist_node *node;
5373
5374 data = container_of(work, struct fixed_file_data, ref_work);
5375
5376 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5377 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5378 io_ring_file_put(data->ctx, pfile->file);
5379 if (pfile->done)
5380 complete(pfile->done);
5381 else
5382 kfree(pfile);
5383 }
5384 }
5385
5386 percpu_ref_get(&data->refs);
5387 percpu_ref_switch_to_percpu(&data->refs);
5388}
5389
5390static void io_file_data_ref_zero(struct percpu_ref *ref)
5391{
5392 struct fixed_file_data *data;
5393
5394 data = container_of(ref, struct fixed_file_data, refs);
5395
5396 /* we can't safely switch from inside this context, punt to wq */
5397 queue_work(system_wq, &data->ref_work);
5398}
5399
6b06314c
JA
5400static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5401 unsigned nr_args)
5402{
5403 __s32 __user *fds = (__s32 __user *) arg;
65e19f54 5404 unsigned nr_tables;
05f3fb3c 5405 struct file *file;
6b06314c
JA
5406 int fd, ret = 0;
5407 unsigned i;
5408
05f3fb3c 5409 if (ctx->file_data)
6b06314c
JA
5410 return -EBUSY;
5411 if (!nr_args)
5412 return -EINVAL;
5413 if (nr_args > IORING_MAX_FIXED_FILES)
5414 return -EMFILE;
5415
05f3fb3c
JA
5416 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5417 if (!ctx->file_data)
5418 return -ENOMEM;
5419 ctx->file_data->ctx = ctx;
5420 init_completion(&ctx->file_data->done);
5421
65e19f54 5422 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
05f3fb3c
JA
5423 ctx->file_data->table = kcalloc(nr_tables,
5424 sizeof(struct fixed_file_table),
65e19f54 5425 GFP_KERNEL);
05f3fb3c
JA
5426 if (!ctx->file_data->table) {
5427 kfree(ctx->file_data);
5428 ctx->file_data = NULL;
6b06314c 5429 return -ENOMEM;
05f3fb3c
JA
5430 }
5431
5432 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
5433 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
5434 kfree(ctx->file_data->table);
5435 kfree(ctx->file_data);
5436 ctx->file_data = NULL;
5437 return -ENOMEM;
5438 }
5439 ctx->file_data->put_llist.first = NULL;
5440 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6b06314c 5441
65e19f54 5442 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
05f3fb3c
JA
5443 percpu_ref_exit(&ctx->file_data->refs);
5444 kfree(ctx->file_data->table);
5445 kfree(ctx->file_data);
5446 ctx->file_data = NULL;
65e19f54
JA
5447 return -ENOMEM;
5448 }
5449
08a45173 5450 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
65e19f54
JA
5451 struct fixed_file_table *table;
5452 unsigned index;
5453
6b06314c
JA
5454 ret = -EFAULT;
5455 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
5456 break;
08a45173
JA
5457 /* allow sparse sets */
5458 if (fd == -1) {
5459 ret = 0;
5460 continue;
5461 }
6b06314c 5462
05f3fb3c 5463 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54 5464 index = i & IORING_FILE_TABLE_MASK;
05f3fb3c 5465 file = fget(fd);
6b06314c
JA
5466
5467 ret = -EBADF;
05f3fb3c 5468 if (!file)
6b06314c 5469 break;
05f3fb3c 5470
6b06314c
JA
5471 /*
5472 * Don't allow io_uring instances to be registered. If UNIX
5473 * isn't enabled, then this causes a reference cycle and this
5474 * instance can never get freed. If UNIX is enabled we'll
5475 * handle it just fine, but there's still no point in allowing
5476 * a ring fd as it doesn't support regular read/write anyway.
5477 */
05f3fb3c
JA
5478 if (file->f_op == &io_uring_fops) {
5479 fput(file);
6b06314c
JA
5480 break;
5481 }
6b06314c 5482 ret = 0;
05f3fb3c 5483 table->files[index] = file;
6b06314c
JA
5484 }
5485
5486 if (ret) {
65e19f54 5487 for (i = 0; i < ctx->nr_user_files; i++) {
65e19f54
JA
5488 file = io_file_from_index(ctx, i);
5489 if (file)
5490 fput(file);
5491 }
5492 for (i = 0; i < nr_tables; i++)
05f3fb3c 5493 kfree(ctx->file_data->table[i].files);
6b06314c 5494
05f3fb3c
JA
5495 kfree(ctx->file_data->table);
5496 kfree(ctx->file_data);
5497 ctx->file_data = NULL;
6b06314c
JA
5498 ctx->nr_user_files = 0;
5499 return ret;
5500 }
5501
5502 ret = io_sqe_files_scm(ctx);
5503 if (ret)
5504 io_sqe_files_unregister(ctx);
5505
5506 return ret;
5507}
5508
c3a31e60
JA
5509static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
5510 int index)
5511{
5512#if defined(CONFIG_UNIX)
5513 struct sock *sock = ctx->ring_sock->sk;
5514 struct sk_buff_head *head = &sock->sk_receive_queue;
5515 struct sk_buff *skb;
5516
5517 /*
5518 * See if we can merge this file into an existing skb SCM_RIGHTS
5519 * file set. If there's no room, fall back to allocating a new skb
5520 * and filling it in.
5521 */
5522 spin_lock_irq(&head->lock);
5523 skb = skb_peek(head);
5524 if (skb) {
5525 struct scm_fp_list *fpl = UNIXCB(skb).fp;
5526
5527 if (fpl->count < SCM_MAX_FD) {
5528 __skb_unlink(skb, head);
5529 spin_unlock_irq(&head->lock);
5530 fpl->fp[fpl->count] = get_file(file);
5531 unix_inflight(fpl->user, fpl->fp[fpl->count]);
5532 fpl->count++;
5533 spin_lock_irq(&head->lock);
5534 __skb_queue_head(head, skb);
5535 } else {
5536 skb = NULL;
5537 }
5538 }
5539 spin_unlock_irq(&head->lock);
5540
5541 if (skb) {
5542 fput(file);
5543 return 0;
5544 }
5545
5546 return __io_sqe_files_scm(ctx, 1, index);
5547#else
5548 return 0;
5549#endif
5550}
5551
05f3fb3c 5552static void io_atomic_switch(struct percpu_ref *ref)
c3a31e60 5553{
05f3fb3c
JA
5554 struct fixed_file_data *data;
5555
5556 data = container_of(ref, struct fixed_file_data, refs);
5557 clear_bit(FFD_F_ATOMIC, &data->state);
5558}
5559
5560static bool io_queue_file_removal(struct fixed_file_data *data,
5561 struct file *file)
5562{
5563 struct io_file_put *pfile, pfile_stack;
5564 DECLARE_COMPLETION_ONSTACK(done);
5565
5566 /*
5567 * If we fail allocating the struct we need for doing async reomval
5568 * of this file, just punt to sync and wait for it.
5569 */
5570 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
5571 if (!pfile) {
5572 pfile = &pfile_stack;
5573 pfile->done = &done;
5574 }
5575
5576 pfile->file = file;
5577 llist_add(&pfile->llist, &data->put_llist);
5578
5579 if (pfile == &pfile_stack) {
5580 if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5581 percpu_ref_put(&data->refs);
5582 percpu_ref_switch_to_atomic(&data->refs,
5583 io_atomic_switch);
5584 }
5585 wait_for_completion(&done);
5586 flush_work(&data->ref_work);
5587 return false;
5588 }
5589
5590 return true;
5591}
5592
5593static int __io_sqe_files_update(struct io_ring_ctx *ctx,
5594 struct io_uring_files_update *up,
5595 unsigned nr_args)
5596{
5597 struct fixed_file_data *data = ctx->file_data;
5598 bool ref_switch = false;
5599 struct file *file;
c3a31e60
JA
5600 __s32 __user *fds;
5601 int fd, i, err;
5602 __u32 done;
5603
05f3fb3c 5604 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
5605 return -EOVERFLOW;
5606 if (done > ctx->nr_user_files)
5607 return -EINVAL;
5608
5609 done = 0;
05f3fb3c 5610 fds = u64_to_user_ptr(up->fds);
c3a31e60 5611 while (nr_args) {
65e19f54
JA
5612 struct fixed_file_table *table;
5613 unsigned index;
5614
c3a31e60
JA
5615 err = 0;
5616 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
5617 err = -EFAULT;
5618 break;
5619 }
05f3fb3c
JA
5620 i = array_index_nospec(up->offset, ctx->nr_user_files);
5621 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54
JA
5622 index = i & IORING_FILE_TABLE_MASK;
5623 if (table->files[index]) {
05f3fb3c 5624 file = io_file_from_index(ctx, index);
65e19f54 5625 table->files[index] = NULL;
05f3fb3c
JA
5626 if (io_queue_file_removal(data, file))
5627 ref_switch = true;
c3a31e60
JA
5628 }
5629 if (fd != -1) {
c3a31e60
JA
5630 file = fget(fd);
5631 if (!file) {
5632 err = -EBADF;
5633 break;
5634 }
5635 /*
5636 * Don't allow io_uring instances to be registered. If
5637 * UNIX isn't enabled, then this causes a reference
5638 * cycle and this instance can never get freed. If UNIX
5639 * is enabled we'll handle it just fine, but there's
5640 * still no point in allowing a ring fd as it doesn't
5641 * support regular read/write anyway.
5642 */
5643 if (file->f_op == &io_uring_fops) {
5644 fput(file);
5645 err = -EBADF;
5646 break;
5647 }
65e19f54 5648 table->files[index] = file;
c3a31e60
JA
5649 err = io_sqe_file_register(ctx, file, i);
5650 if (err)
5651 break;
5652 }
5653 nr_args--;
5654 done++;
05f3fb3c
JA
5655 up->offset++;
5656 }
5657
5658 if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5659 percpu_ref_put(&data->refs);
5660 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
c3a31e60
JA
5661 }
5662
5663 return done ? done : err;
5664}
05f3fb3c
JA
5665static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
5666 unsigned nr_args)
5667{
5668 struct io_uring_files_update up;
5669
5670 if (!ctx->file_data)
5671 return -ENXIO;
5672 if (!nr_args)
5673 return -EINVAL;
5674 if (copy_from_user(&up, arg, sizeof(up)))
5675 return -EFAULT;
5676 if (up.resv)
5677 return -EINVAL;
5678
5679 return __io_sqe_files_update(ctx, &up, nr_args);
5680}
c3a31e60 5681
7d723065
JA
5682static void io_put_work(struct io_wq_work *work)
5683{
5684 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5685
5686 io_put_req(req);
5687}
5688
5689static void io_get_work(struct io_wq_work *work)
5690{
5691 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5692
5693 refcount_inc(&req->refs);
5694}
5695
6c271ce2
JA
5696static int io_sq_offload_start(struct io_ring_ctx *ctx,
5697 struct io_uring_params *p)
2b188cc1 5698{
576a347b 5699 struct io_wq_data data;
561fb04a 5700 unsigned concurrency;
2b188cc1
JA
5701 int ret;
5702
6c271ce2 5703 init_waitqueue_head(&ctx->sqo_wait);
2b188cc1
JA
5704 mmgrab(current->mm);
5705 ctx->sqo_mm = current->mm;
5706
6c271ce2 5707 if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d1
JA
5708 ret = -EPERM;
5709 if (!capable(CAP_SYS_ADMIN))
5710 goto err;
5711
917257da
JA
5712 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
5713 if (!ctx->sq_thread_idle)
5714 ctx->sq_thread_idle = HZ;
5715
6c271ce2 5716 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 5717 int cpu = p->sq_thread_cpu;
6c271ce2 5718
917257da 5719 ret = -EINVAL;
44a9bd18
JA
5720 if (cpu >= nr_cpu_ids)
5721 goto err;
7889f44d 5722 if (!cpu_online(cpu))
917257da
JA
5723 goto err;
5724
6c271ce2
JA
5725 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
5726 ctx, cpu,
5727 "io_uring-sq");
5728 } else {
5729 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
5730 "io_uring-sq");
5731 }
5732 if (IS_ERR(ctx->sqo_thread)) {
5733 ret = PTR_ERR(ctx->sqo_thread);
5734 ctx->sqo_thread = NULL;
5735 goto err;
5736 }
5737 wake_up_process(ctx->sqo_thread);
5738 } else if (p->flags & IORING_SETUP_SQ_AFF) {
5739 /* Can't have SQ_AFF without SQPOLL */
5740 ret = -EINVAL;
5741 goto err;
5742 }
5743
576a347b
JA
5744 data.mm = ctx->sqo_mm;
5745 data.user = ctx->user;
181e448d 5746 data.creds = ctx->creds;
576a347b
JA
5747 data.get_work = io_get_work;
5748 data.put_work = io_put_work;
5749
561fb04a
JA
5750 /* Do QD, or 4 * CPUS, whatever is smallest */
5751 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
576a347b 5752 ctx->io_wq = io_wq_create(concurrency, &data);
975c99a5
JA
5753 if (IS_ERR(ctx->io_wq)) {
5754 ret = PTR_ERR(ctx->io_wq);
5755 ctx->io_wq = NULL;
2b188cc1
JA
5756 goto err;
5757 }
5758
5759 return 0;
5760err:
54a91f3b 5761 io_finish_async(ctx);
2b188cc1
JA
5762 mmdrop(ctx->sqo_mm);
5763 ctx->sqo_mm = NULL;
5764 return ret;
5765}
5766
5767static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
5768{
5769 atomic_long_sub(nr_pages, &user->locked_vm);
5770}
5771
5772static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
5773{
5774 unsigned long page_limit, cur_pages, new_pages;
5775
5776 /* Don't allow more pages than we can safely lock */
5777 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
5778
5779 do {
5780 cur_pages = atomic_long_read(&user->locked_vm);
5781 new_pages = cur_pages + nr_pages;
5782 if (new_pages > page_limit)
5783 return -ENOMEM;
5784 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
5785 new_pages) != cur_pages);
5786
5787 return 0;
5788}
5789
5790static void io_mem_free(void *ptr)
5791{
52e04ef4
MR
5792 struct page *page;
5793
5794 if (!ptr)
5795 return;
2b188cc1 5796
52e04ef4 5797 page = virt_to_head_page(ptr);
2b188cc1
JA
5798 if (put_page_testzero(page))
5799 free_compound_page(page);
5800}
5801
5802static void *io_mem_alloc(size_t size)
5803{
5804 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
5805 __GFP_NORETRY;
5806
5807 return (void *) __get_free_pages(gfp_flags, get_order(size));
5808}
5809
75b28aff
HV
5810static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
5811 size_t *sq_offset)
5812{
5813 struct io_rings *rings;
5814 size_t off, sq_array_size;
5815
5816 off = struct_size(rings, cqes, cq_entries);
5817 if (off == SIZE_MAX)
5818 return SIZE_MAX;
5819
5820#ifdef CONFIG_SMP
5821 off = ALIGN(off, SMP_CACHE_BYTES);
5822 if (off == 0)
5823 return SIZE_MAX;
5824#endif
5825
5826 sq_array_size = array_size(sizeof(u32), sq_entries);
5827 if (sq_array_size == SIZE_MAX)
5828 return SIZE_MAX;
5829
5830 if (check_add_overflow(off, sq_array_size, &off))
5831 return SIZE_MAX;
5832
5833 if (sq_offset)
5834 *sq_offset = off;
5835
5836 return off;
5837}
5838
2b188cc1
JA
5839static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
5840{
75b28aff 5841 size_t pages;
2b188cc1 5842
75b28aff
HV
5843 pages = (size_t)1 << get_order(
5844 rings_size(sq_entries, cq_entries, NULL));
5845 pages += (size_t)1 << get_order(
5846 array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1 5847
75b28aff 5848 return pages;
2b188cc1
JA
5849}
5850
edafccee
JA
5851static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
5852{
5853 int i, j;
5854
5855 if (!ctx->user_bufs)
5856 return -ENXIO;
5857
5858 for (i = 0; i < ctx->nr_user_bufs; i++) {
5859 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
5860
5861 for (j = 0; j < imu->nr_bvecs; j++)
27c4d3a3 5862 put_user_page(imu->bvec[j].bv_page);
edafccee
JA
5863
5864 if (ctx->account_mem)
5865 io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef6475 5866 kvfree(imu->bvec);
edafccee
JA
5867 imu->nr_bvecs = 0;
5868 }
5869
5870 kfree(ctx->user_bufs);
5871 ctx->user_bufs = NULL;
5872 ctx->nr_user_bufs = 0;
5873 return 0;
5874}
5875
5876static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
5877 void __user *arg, unsigned index)
5878{
5879 struct iovec __user *src;
5880
5881#ifdef CONFIG_COMPAT
5882 if (ctx->compat) {
5883 struct compat_iovec __user *ciovs;
5884 struct compat_iovec ciov;
5885
5886 ciovs = (struct compat_iovec __user *) arg;
5887 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
5888 return -EFAULT;
5889
d55e5f5b 5890 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
5891 dst->iov_len = ciov.iov_len;
5892 return 0;
5893 }
5894#endif
5895 src = (struct iovec __user *) arg;
5896 if (copy_from_user(dst, &src[index], sizeof(*dst)))
5897 return -EFAULT;
5898 return 0;
5899}
5900
5901static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
5902 unsigned nr_args)
5903{
5904 struct vm_area_struct **vmas = NULL;
5905 struct page **pages = NULL;
5906 int i, j, got_pages = 0;
5907 int ret = -EINVAL;
5908
5909 if (ctx->user_bufs)
5910 return -EBUSY;
5911 if (!nr_args || nr_args > UIO_MAXIOV)
5912 return -EINVAL;
5913
5914 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
5915 GFP_KERNEL);
5916 if (!ctx->user_bufs)
5917 return -ENOMEM;
5918
5919 for (i = 0; i < nr_args; i++) {
5920 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
5921 unsigned long off, start, end, ubuf;
5922 int pret, nr_pages;
5923 struct iovec iov;
5924 size_t size;
5925
5926 ret = io_copy_iov(ctx, &iov, arg, i);
5927 if (ret)
a278682d 5928 goto err;
edafccee
JA
5929
5930 /*
5931 * Don't impose further limits on the size and buffer
5932 * constraints here, we'll -EINVAL later when IO is
5933 * submitted if they are wrong.
5934 */
5935 ret = -EFAULT;
5936 if (!iov.iov_base || !iov.iov_len)
5937 goto err;
5938
5939 /* arbitrary limit, but we need something */
5940 if (iov.iov_len > SZ_1G)
5941 goto err;
5942
5943 ubuf = (unsigned long) iov.iov_base;
5944 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5945 start = ubuf >> PAGE_SHIFT;
5946 nr_pages = end - start;
5947
5948 if (ctx->account_mem) {
5949 ret = io_account_mem(ctx->user, nr_pages);
5950 if (ret)
5951 goto err;
5952 }
5953
5954 ret = 0;
5955 if (!pages || nr_pages > got_pages) {
5956 kfree(vmas);
5957 kfree(pages);
d4ef6475 5958 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee 5959 GFP_KERNEL);
d4ef6475 5960 vmas = kvmalloc_array(nr_pages,
edafccee
JA
5961 sizeof(struct vm_area_struct *),
5962 GFP_KERNEL);
5963 if (!pages || !vmas) {
5964 ret = -ENOMEM;
5965 if (ctx->account_mem)
5966 io_unaccount_mem(ctx->user, nr_pages);
5967 goto err;
5968 }
5969 got_pages = nr_pages;
5970 }
5971
d4ef6475 5972 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee
JA
5973 GFP_KERNEL);
5974 ret = -ENOMEM;
5975 if (!imu->bvec) {
5976 if (ctx->account_mem)
5977 io_unaccount_mem(ctx->user, nr_pages);
5978 goto err;
5979 }
5980
5981 ret = 0;
5982 down_read(&current->mm->mmap_sem);
932f4a63
IW
5983 pret = get_user_pages(ubuf, nr_pages,
5984 FOLL_WRITE | FOLL_LONGTERM,
5985 pages, vmas);
edafccee
JA
5986 if (pret == nr_pages) {
5987 /* don't support file backed memory */
5988 for (j = 0; j < nr_pages; j++) {
5989 struct vm_area_struct *vma = vmas[j];
5990
5991 if (vma->vm_file &&
5992 !is_file_hugepages(vma->vm_file)) {
5993 ret = -EOPNOTSUPP;
5994 break;
5995 }
5996 }
5997 } else {
5998 ret = pret < 0 ? pret : -EFAULT;
5999 }
6000 up_read(&current->mm->mmap_sem);
6001 if (ret) {
6002 /*
6003 * if we did partial map, or found file backed vmas,
6004 * release any pages we did get
6005 */
27c4d3a3
JH
6006 if (pret > 0)
6007 put_user_pages(pages, pret);
edafccee
JA
6008 if (ctx->account_mem)
6009 io_unaccount_mem(ctx->user, nr_pages);
d4ef6475 6010 kvfree(imu->bvec);
edafccee
JA
6011 goto err;
6012 }
6013
6014 off = ubuf & ~PAGE_MASK;
6015 size = iov.iov_len;
6016 for (j = 0; j < nr_pages; j++) {
6017 size_t vec_len;
6018
6019 vec_len = min_t(size_t, size, PAGE_SIZE - off);
6020 imu->bvec[j].bv_page = pages[j];
6021 imu->bvec[j].bv_len = vec_len;
6022 imu->bvec[j].bv_offset = off;
6023 off = 0;
6024 size -= vec_len;
6025 }
6026 /* store original address for later verification */
6027 imu->ubuf = ubuf;
6028 imu->len = iov.iov_len;
6029 imu->nr_bvecs = nr_pages;
6030
6031 ctx->nr_user_bufs++;
6032 }
d4ef6475
MR
6033 kvfree(pages);
6034 kvfree(vmas);
edafccee
JA
6035 return 0;
6036err:
d4ef6475
MR
6037 kvfree(pages);
6038 kvfree(vmas);
edafccee
JA
6039 io_sqe_buffer_unregister(ctx);
6040 return ret;
6041}
6042
9b402849
JA
6043static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
6044{
6045 __s32 __user *fds = arg;
6046 int fd;
6047
6048 if (ctx->cq_ev_fd)
6049 return -EBUSY;
6050
6051 if (copy_from_user(&fd, fds, sizeof(*fds)))
6052 return -EFAULT;
6053
6054 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
6055 if (IS_ERR(ctx->cq_ev_fd)) {
6056 int ret = PTR_ERR(ctx->cq_ev_fd);
6057 ctx->cq_ev_fd = NULL;
6058 return ret;
6059 }
6060
6061 return 0;
6062}
6063
6064static int io_eventfd_unregister(struct io_ring_ctx *ctx)
6065{
6066 if (ctx->cq_ev_fd) {
6067 eventfd_ctx_put(ctx->cq_ev_fd);
6068 ctx->cq_ev_fd = NULL;
6069 return 0;
6070 }
6071
6072 return -ENXIO;
6073}
6074
2b188cc1
JA
6075static void io_ring_ctx_free(struct io_ring_ctx *ctx)
6076{
6b06314c 6077 io_finish_async(ctx);
2b188cc1
JA
6078 if (ctx->sqo_mm)
6079 mmdrop(ctx->sqo_mm);
def596e9
JA
6080
6081 io_iopoll_reap_events(ctx);
edafccee 6082 io_sqe_buffer_unregister(ctx);
6b06314c 6083 io_sqe_files_unregister(ctx);
9b402849 6084 io_eventfd_unregister(ctx);
def596e9 6085
2b188cc1 6086#if defined(CONFIG_UNIX)
355e8d26
EB
6087 if (ctx->ring_sock) {
6088 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 6089 sock_release(ctx->ring_sock);
355e8d26 6090 }
2b188cc1
JA
6091#endif
6092
75b28aff 6093 io_mem_free(ctx->rings);
2b188cc1 6094 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
6095
6096 percpu_ref_exit(&ctx->refs);
6097 if (ctx->account_mem)
6098 io_unaccount_mem(ctx->user,
6099 ring_pages(ctx->sq_entries, ctx->cq_entries));
6100 free_uid(ctx->user);
181e448d 6101 put_cred(ctx->creds);
206aefde 6102 kfree(ctx->completions);
78076bb6 6103 kfree(ctx->cancel_hash);
0ddf92e8 6104 kmem_cache_free(req_cachep, ctx->fallback_req);
2b188cc1
JA
6105 kfree(ctx);
6106}
6107
6108static __poll_t io_uring_poll(struct file *file, poll_table *wait)
6109{
6110 struct io_ring_ctx *ctx = file->private_data;
6111 __poll_t mask = 0;
6112
6113 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
6114 /*
6115 * synchronizes with barrier from wq_has_sleeper call in
6116 * io_commit_cqring
6117 */
2b188cc1 6118 smp_rmb();
75b28aff
HV
6119 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
6120 ctx->rings->sq_ring_entries)
2b188cc1 6121 mask |= EPOLLOUT | EPOLLWRNORM;
daa5de54 6122 if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
2b188cc1
JA
6123 mask |= EPOLLIN | EPOLLRDNORM;
6124
6125 return mask;
6126}
6127
6128static int io_uring_fasync(int fd, struct file *file, int on)
6129{
6130 struct io_ring_ctx *ctx = file->private_data;
6131
6132 return fasync_helper(fd, file, on, &ctx->cq_fasync);
6133}
6134
6135static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
6136{
6137 mutex_lock(&ctx->uring_lock);
6138 percpu_ref_kill(&ctx->refs);
6139 mutex_unlock(&ctx->uring_lock);
6140
5262f567 6141 io_kill_timeouts(ctx);
221c5eb2 6142 io_poll_remove_all(ctx);
561fb04a
JA
6143
6144 if (ctx->io_wq)
6145 io_wq_cancel_all(ctx->io_wq);
6146
def596e9 6147 io_iopoll_reap_events(ctx);
15dff286
JA
6148 /* if we failed setting up the ctx, we might not have any rings */
6149 if (ctx->rings)
6150 io_cqring_overflow_flush(ctx, true);
206aefde 6151 wait_for_completion(&ctx->completions[0]);
2b188cc1
JA
6152 io_ring_ctx_free(ctx);
6153}
6154
6155static int io_uring_release(struct inode *inode, struct file *file)
6156{
6157 struct io_ring_ctx *ctx = file->private_data;
6158
6159 file->private_data = NULL;
6160 io_ring_ctx_wait_and_kill(ctx);
6161 return 0;
6162}
6163
fcb323cc
JA
6164static void io_uring_cancel_files(struct io_ring_ctx *ctx,
6165 struct files_struct *files)
6166{
6167 struct io_kiocb *req;
6168 DEFINE_WAIT(wait);
6169
6170 while (!list_empty_careful(&ctx->inflight_list)) {
768134d4 6171 struct io_kiocb *cancel_req = NULL;
fcb323cc
JA
6172
6173 spin_lock_irq(&ctx->inflight_lock);
6174 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
768134d4
JA
6175 if (req->work.files != files)
6176 continue;
6177 /* req is being completed, ignore */
6178 if (!refcount_inc_not_zero(&req->refs))
6179 continue;
6180 cancel_req = req;
6181 break;
fcb323cc 6182 }
768134d4 6183 if (cancel_req)
fcb323cc 6184 prepare_to_wait(&ctx->inflight_wait, &wait,
768134d4 6185 TASK_UNINTERRUPTIBLE);
fcb323cc
JA
6186 spin_unlock_irq(&ctx->inflight_lock);
6187
768134d4
JA
6188 /* We need to keep going until we don't find a matching req */
6189 if (!cancel_req)
fcb323cc 6190 break;
2f6d9b9d
BL
6191
6192 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
6193 io_put_req(cancel_req);
fcb323cc
JA
6194 schedule();
6195 }
768134d4 6196 finish_wait(&ctx->inflight_wait, &wait);
fcb323cc
JA
6197}
6198
6199static int io_uring_flush(struct file *file, void *data)
6200{
6201 struct io_ring_ctx *ctx = file->private_data;
6202
6203 io_uring_cancel_files(ctx, data);
1d7bb1d5
JA
6204 if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
6205 io_cqring_overflow_flush(ctx, true);
fcb323cc 6206 io_wq_cancel_all(ctx->io_wq);
1d7bb1d5 6207 }
fcb323cc
JA
6208 return 0;
6209}
6210
6c5c240e
RP
6211static void *io_uring_validate_mmap_request(struct file *file,
6212 loff_t pgoff, size_t sz)
2b188cc1 6213{
2b188cc1 6214 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 6215 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
6216 struct page *page;
6217 void *ptr;
6218
6219 switch (offset) {
6220 case IORING_OFF_SQ_RING:
75b28aff
HV
6221 case IORING_OFF_CQ_RING:
6222 ptr = ctx->rings;
2b188cc1
JA
6223 break;
6224 case IORING_OFF_SQES:
6225 ptr = ctx->sq_sqes;
6226 break;
2b188cc1 6227 default:
6c5c240e 6228 return ERR_PTR(-EINVAL);
2b188cc1
JA
6229 }
6230
6231 page = virt_to_head_page(ptr);
a50b854e 6232 if (sz > page_size(page))
6c5c240e
RP
6233 return ERR_PTR(-EINVAL);
6234
6235 return ptr;
6236}
6237
6238#ifdef CONFIG_MMU
6239
6240static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6241{
6242 size_t sz = vma->vm_end - vma->vm_start;
6243 unsigned long pfn;
6244 void *ptr;
6245
6246 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
6247 if (IS_ERR(ptr))
6248 return PTR_ERR(ptr);
2b188cc1
JA
6249
6250 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
6251 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
6252}
6253
6c5c240e
RP
6254#else /* !CONFIG_MMU */
6255
6256static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6257{
6258 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6259}
6260
6261static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6262{
6263 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6264}
6265
6266static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6267 unsigned long addr, unsigned long len,
6268 unsigned long pgoff, unsigned long flags)
6269{
6270 void *ptr;
6271
6272 ptr = io_uring_validate_mmap_request(file, pgoff, len);
6273 if (IS_ERR(ptr))
6274 return PTR_ERR(ptr);
6275
6276 return (unsigned long) ptr;
6277}
6278
6279#endif /* !CONFIG_MMU */
6280
2b188cc1
JA
6281SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6282 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6283 size_t, sigsz)
6284{
6285 struct io_ring_ctx *ctx;
6286 long ret = -EBADF;
6287 int submitted = 0;
6288 struct fd f;
6289
6c271ce2 6290 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1
JA
6291 return -EINVAL;
6292
6293 f = fdget(fd);
6294 if (!f.file)
6295 return -EBADF;
6296
6297 ret = -EOPNOTSUPP;
6298 if (f.file->f_op != &io_uring_fops)
6299 goto out_fput;
6300
6301 ret = -ENXIO;
6302 ctx = f.file->private_data;
6303 if (!percpu_ref_tryget(&ctx->refs))
6304 goto out_fput;
6305
6c271ce2
JA
6306 /*
6307 * For SQ polling, the thread will do all submissions and completions.
6308 * Just return the requested submit count, and wake the thread if
6309 * we were asked to.
6310 */
b2a9eada 6311 ret = 0;
6c271ce2 6312 if (ctx->flags & IORING_SETUP_SQPOLL) {
c1edbf5f
JA
6313 if (!list_empty_careful(&ctx->cq_overflow_list))
6314 io_cqring_overflow_flush(ctx, false);
6c271ce2
JA
6315 if (flags & IORING_ENTER_SQ_WAKEUP)
6316 wake_up(&ctx->sqo_wait);
6317 submitted = to_submit;
b2a9eada 6318 } else if (to_submit) {
ae9428ca 6319 struct mm_struct *cur_mm;
2b188cc1 6320
44d28279
JA
6321 if (current->mm != ctx->sqo_mm ||
6322 current_cred() != ctx->creds) {
6323 ret = -EPERM;
6324 goto out;
6325 }
6326
2b188cc1 6327 mutex_lock(&ctx->uring_lock);
ae9428ca
PB
6328 /* already have mm, so io_submit_sqes() won't try to grab it */
6329 cur_mm = ctx->sqo_mm;
6330 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
6331 &cur_mm, false);
2b188cc1 6332 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
6333
6334 if (submitted != to_submit)
6335 goto out;
2b188cc1
JA
6336 }
6337 if (flags & IORING_ENTER_GETEVENTS) {
def596e9
JA
6338 unsigned nr_events = 0;
6339
2b188cc1
JA
6340 min_complete = min(min_complete, ctx->cq_entries);
6341
def596e9 6342 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9 6343 ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e9
JA
6344 } else {
6345 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
6346 }
2b188cc1
JA
6347 }
6348
7c504e65 6349out:
6805b32e 6350 percpu_ref_put(&ctx->refs);
2b188cc1
JA
6351out_fput:
6352 fdput(f);
6353 return submitted ? submitted : ret;
6354}
6355
6356static const struct file_operations io_uring_fops = {
6357 .release = io_uring_release,
fcb323cc 6358 .flush = io_uring_flush,
2b188cc1 6359 .mmap = io_uring_mmap,
6c5c240e
RP
6360#ifndef CONFIG_MMU
6361 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
6362 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
6363#endif
2b188cc1
JA
6364 .poll = io_uring_poll,
6365 .fasync = io_uring_fasync,
6366};
6367
6368static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
6369 struct io_uring_params *p)
6370{
75b28aff
HV
6371 struct io_rings *rings;
6372 size_t size, sq_array_offset;
2b188cc1 6373
75b28aff
HV
6374 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
6375 if (size == SIZE_MAX)
6376 return -EOVERFLOW;
6377
6378 rings = io_mem_alloc(size);
6379 if (!rings)
2b188cc1
JA
6380 return -ENOMEM;
6381
75b28aff
HV
6382 ctx->rings = rings;
6383 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
6384 rings->sq_ring_mask = p->sq_entries - 1;
6385 rings->cq_ring_mask = p->cq_entries - 1;
6386 rings->sq_ring_entries = p->sq_entries;
6387 rings->cq_ring_entries = p->cq_entries;
6388 ctx->sq_mask = rings->sq_ring_mask;
6389 ctx->cq_mask = rings->cq_ring_mask;
6390 ctx->sq_entries = rings->sq_ring_entries;
6391 ctx->cq_entries = rings->cq_ring_entries;
2b188cc1
JA
6392
6393 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
6394 if (size == SIZE_MAX) {
6395 io_mem_free(ctx->rings);
6396 ctx->rings = NULL;
2b188cc1 6397 return -EOVERFLOW;
eb065d30 6398 }
2b188cc1
JA
6399
6400 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
6401 if (!ctx->sq_sqes) {
6402 io_mem_free(ctx->rings);
6403 ctx->rings = NULL;
2b188cc1 6404 return -ENOMEM;
eb065d30 6405 }
2b188cc1 6406
2b188cc1
JA
6407 return 0;
6408}
6409
6410/*
6411 * Allocate an anonymous fd, this is what constitutes the application
6412 * visible backing of an io_uring instance. The application mmaps this
6413 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
6414 * we have to tie this fd to a socket for file garbage collection purposes.
6415 */
6416static int io_uring_get_fd(struct io_ring_ctx *ctx)
6417{
6418 struct file *file;
6419 int ret;
6420
6421#if defined(CONFIG_UNIX)
6422 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
6423 &ctx->ring_sock);
6424 if (ret)
6425 return ret;
6426#endif
6427
6428 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
6429 if (ret < 0)
6430 goto err;
6431
6432 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
6433 O_RDWR | O_CLOEXEC);
6434 if (IS_ERR(file)) {
6435 put_unused_fd(ret);
6436 ret = PTR_ERR(file);
6437 goto err;
6438 }
6439
6440#if defined(CONFIG_UNIX)
6441 ctx->ring_sock->file = file;
6442#endif
6443 fd_install(ret, file);
6444 return ret;
6445err:
6446#if defined(CONFIG_UNIX)
6447 sock_release(ctx->ring_sock);
6448 ctx->ring_sock = NULL;
6449#endif
6450 return ret;
6451}
6452
6453static int io_uring_create(unsigned entries, struct io_uring_params *p)
6454{
6455 struct user_struct *user = NULL;
6456 struct io_ring_ctx *ctx;
6457 bool account_mem;
6458 int ret;
6459
8110c1a6 6460 if (!entries)
2b188cc1 6461 return -EINVAL;
8110c1a6
JA
6462 if (entries > IORING_MAX_ENTRIES) {
6463 if (!(p->flags & IORING_SETUP_CLAMP))
6464 return -EINVAL;
6465 entries = IORING_MAX_ENTRIES;
6466 }
2b188cc1
JA
6467
6468 /*
6469 * Use twice as many entries for the CQ ring. It's possible for the
6470 * application to drive a higher depth than the size of the SQ ring,
6471 * since the sqes are only used at submission time. This allows for
33a107f0
JA
6472 * some flexibility in overcommitting a bit. If the application has
6473 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
6474 * of CQ ring entries manually.
2b188cc1
JA
6475 */
6476 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
6477 if (p->flags & IORING_SETUP_CQSIZE) {
6478 /*
6479 * If IORING_SETUP_CQSIZE is set, we do the same roundup
6480 * to a power-of-two, if it isn't already. We do NOT impose
6481 * any cq vs sq ring sizing.
6482 */
8110c1a6 6483 if (p->cq_entries < p->sq_entries)
33a107f0 6484 return -EINVAL;
8110c1a6
JA
6485 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
6486 if (!(p->flags & IORING_SETUP_CLAMP))
6487 return -EINVAL;
6488 p->cq_entries = IORING_MAX_CQ_ENTRIES;
6489 }
33a107f0
JA
6490 p->cq_entries = roundup_pow_of_two(p->cq_entries);
6491 } else {
6492 p->cq_entries = 2 * p->sq_entries;
6493 }
2b188cc1
JA
6494
6495 user = get_uid(current_user());
6496 account_mem = !capable(CAP_IPC_LOCK);
6497
6498 if (account_mem) {
6499 ret = io_account_mem(user,
6500 ring_pages(p->sq_entries, p->cq_entries));
6501 if (ret) {
6502 free_uid(user);
6503 return ret;
6504 }
6505 }
6506
6507 ctx = io_ring_ctx_alloc(p);
6508 if (!ctx) {
6509 if (account_mem)
6510 io_unaccount_mem(user, ring_pages(p->sq_entries,
6511 p->cq_entries));
6512 free_uid(user);
6513 return -ENOMEM;
6514 }
6515 ctx->compat = in_compat_syscall();
6516 ctx->account_mem = account_mem;
6517 ctx->user = user;
0b8c0ec7 6518 ctx->creds = get_current_cred();
2b188cc1
JA
6519
6520 ret = io_allocate_scq_urings(ctx, p);
6521 if (ret)
6522 goto err;
6523
6c271ce2 6524 ret = io_sq_offload_start(ctx, p);
2b188cc1
JA
6525 if (ret)
6526 goto err;
6527
2b188cc1 6528 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
6529 p->sq_off.head = offsetof(struct io_rings, sq.head);
6530 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
6531 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
6532 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
6533 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
6534 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
6535 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
6536
6537 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
6538 p->cq_off.head = offsetof(struct io_rings, cq.head);
6539 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
6540 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
6541 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
6542 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
6543 p->cq_off.cqes = offsetof(struct io_rings, cqes);
ac90f249 6544
044c1ab3
JA
6545 /*
6546 * Install ring fd as the very last thing, so we don't risk someone
6547 * having closed it before we finish setup
6548 */
6549 ret = io_uring_get_fd(ctx);
6550 if (ret < 0)
6551 goto err;
6552
da8c9690 6553 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
ba04291e 6554 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
c826bd7a 6555 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
6556 return ret;
6557err:
6558 io_ring_ctx_wait_and_kill(ctx);
6559 return ret;
6560}
6561
6562/*
6563 * Sets up an aio uring context, and returns the fd. Applications asks for a
6564 * ring size, we return the actual sq/cq ring sizes (among other things) in the
6565 * params structure passed in.
6566 */
6567static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
6568{
6569 struct io_uring_params p;
6570 long ret;
6571 int i;
6572
6573 if (copy_from_user(&p, params, sizeof(p)))
6574 return -EFAULT;
6575 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
6576 if (p.resv[i])
6577 return -EINVAL;
6578 }
6579
6c271ce2 6580 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6
JA
6581 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
6582 IORING_SETUP_CLAMP))
2b188cc1
JA
6583 return -EINVAL;
6584
6585 ret = io_uring_create(entries, &p);
6586 if (ret < 0)
6587 return ret;
6588
6589 if (copy_to_user(params, &p, sizeof(p)))
6590 return -EFAULT;
6591
6592 return ret;
6593}
6594
6595SYSCALL_DEFINE2(io_uring_setup, u32, entries,
6596 struct io_uring_params __user *, params)
6597{
6598 return io_uring_setup(entries, params);
6599}
6600
66f4af93
JA
6601static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
6602{
6603 struct io_uring_probe *p;
6604 size_t size;
6605 int i, ret;
6606
6607 size = struct_size(p, ops, nr_args);
6608 if (size == SIZE_MAX)
6609 return -EOVERFLOW;
6610 p = kzalloc(size, GFP_KERNEL);
6611 if (!p)
6612 return -ENOMEM;
6613
6614 ret = -EFAULT;
6615 if (copy_from_user(p, arg, size))
6616 goto out;
6617 ret = -EINVAL;
6618 if (memchr_inv(p, 0, size))
6619 goto out;
6620
6621 p->last_op = IORING_OP_LAST - 1;
6622 if (nr_args > IORING_OP_LAST)
6623 nr_args = IORING_OP_LAST;
6624
6625 for (i = 0; i < nr_args; i++) {
6626 p->ops[i].op = i;
6627 if (!io_op_defs[i].not_supported)
6628 p->ops[i].flags = IO_URING_OP_SUPPORTED;
6629 }
6630 p->ops_len = i;
6631
6632 ret = 0;
6633 if (copy_to_user(arg, p, size))
6634 ret = -EFAULT;
6635out:
6636 kfree(p);
6637 return ret;
6638}
6639
edafccee
JA
6640static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
6641 void __user *arg, unsigned nr_args)
b19062a5
JA
6642 __releases(ctx->uring_lock)
6643 __acquires(ctx->uring_lock)
edafccee
JA
6644{
6645 int ret;
6646
35fa71a0
JA
6647 /*
6648 * We're inside the ring mutex, if the ref is already dying, then
6649 * someone else killed the ctx or is already going through
6650 * io_uring_register().
6651 */
6652 if (percpu_ref_is_dying(&ctx->refs))
6653 return -ENXIO;
6654
05f3fb3c 6655 if (opcode != IORING_UNREGISTER_FILES &&
66f4af93
JA
6656 opcode != IORING_REGISTER_FILES_UPDATE &&
6657 opcode != IORING_REGISTER_PROBE) {
05f3fb3c 6658 percpu_ref_kill(&ctx->refs);
b19062a5 6659
05f3fb3c
JA
6660 /*
6661 * Drop uring mutex before waiting for references to exit. If
6662 * another thread is currently inside io_uring_enter() it might
6663 * need to grab the uring_lock to make progress. If we hold it
6664 * here across the drain wait, then we can deadlock. It's safe
6665 * to drop the mutex here, since no new references will come in
6666 * after we've killed the percpu ref.
6667 */
6668 mutex_unlock(&ctx->uring_lock);
c150368b 6669 ret = wait_for_completion_interruptible(&ctx->completions[0]);
05f3fb3c 6670 mutex_lock(&ctx->uring_lock);
c150368b
JA
6671 if (ret) {
6672 percpu_ref_resurrect(&ctx->refs);
6673 ret = -EINTR;
6674 goto out;
6675 }
05f3fb3c 6676 }
edafccee
JA
6677
6678 switch (opcode) {
6679 case IORING_REGISTER_BUFFERS:
6680 ret = io_sqe_buffer_register(ctx, arg, nr_args);
6681 break;
6682 case IORING_UNREGISTER_BUFFERS:
6683 ret = -EINVAL;
6684 if (arg || nr_args)
6685 break;
6686 ret = io_sqe_buffer_unregister(ctx);
6687 break;
6b06314c
JA
6688 case IORING_REGISTER_FILES:
6689 ret = io_sqe_files_register(ctx, arg, nr_args);
6690 break;
6691 case IORING_UNREGISTER_FILES:
6692 ret = -EINVAL;
6693 if (arg || nr_args)
6694 break;
6695 ret = io_sqe_files_unregister(ctx);
6696 break;
c3a31e60
JA
6697 case IORING_REGISTER_FILES_UPDATE:
6698 ret = io_sqe_files_update(ctx, arg, nr_args);
6699 break;
9b402849 6700 case IORING_REGISTER_EVENTFD:
f2842ab5 6701 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
6702 ret = -EINVAL;
6703 if (nr_args != 1)
6704 break;
6705 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
6706 if (ret)
6707 break;
6708 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
6709 ctx->eventfd_async = 1;
6710 else
6711 ctx->eventfd_async = 0;
9b402849
JA
6712 break;
6713 case IORING_UNREGISTER_EVENTFD:
6714 ret = -EINVAL;
6715 if (arg || nr_args)
6716 break;
6717 ret = io_eventfd_unregister(ctx);
6718 break;
66f4af93
JA
6719 case IORING_REGISTER_PROBE:
6720 ret = -EINVAL;
6721 if (!arg || nr_args > 256)
6722 break;
6723 ret = io_probe(ctx, arg, nr_args);
6724 break;
edafccee
JA
6725 default:
6726 ret = -EINVAL;
6727 break;
6728 }
6729
05f3fb3c
JA
6730
6731 if (opcode != IORING_UNREGISTER_FILES &&
66f4af93
JA
6732 opcode != IORING_REGISTER_FILES_UPDATE &&
6733 opcode != IORING_REGISTER_PROBE) {
05f3fb3c 6734 /* bring the ctx back to life */
05f3fb3c 6735 percpu_ref_reinit(&ctx->refs);
c150368b
JA
6736out:
6737 reinit_completion(&ctx->completions[0]);
05f3fb3c 6738 }
edafccee
JA
6739 return ret;
6740}
6741
6742SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
6743 void __user *, arg, unsigned int, nr_args)
6744{
6745 struct io_ring_ctx *ctx;
6746 long ret = -EBADF;
6747 struct fd f;
6748
6749 f = fdget(fd);
6750 if (!f.file)
6751 return -EBADF;
6752
6753 ret = -EOPNOTSUPP;
6754 if (f.file->f_op != &io_uring_fops)
6755 goto out_fput;
6756
6757 ctx = f.file->private_data;
6758
6759 mutex_lock(&ctx->uring_lock);
6760 ret = __io_uring_register(ctx, opcode, arg, nr_args);
6761 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
6762 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
6763 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
6764out_fput:
6765 fdput(f);
6766 return ret;
6767}
6768
2b188cc1
JA
6769static int __init io_uring_init(void)
6770{
d3656344 6771 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
2b188cc1
JA
6772 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
6773 return 0;
6774};
6775__initcall(io_uring_init);