fs/io_uring.c
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
 40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
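/*
 * Editor's sketch (not part of the original file): the userspace side of the
 * CQ pairing described above, liburing-style.  The field names (cq->khead,
 * cq->ktail, cq->kring_mask, cq->cqes) and handle_cqe() are assumptions made
 * for the example.  The acquire load of the tail pairs with the kernel's
 * release store of the tail, and the release store of the head orders the
 * CQE loads before the head update:
 *
 *	unsigned head = *cq->khead;
 *	unsigned tail = smp_load_acquire(cq->ktail);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	smp_store_release(cq->khead, head);
 */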
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <linux/refcount.h>
48#include <linux/uio.h>
 49#include <linux/bits.h>
50
51#include <linux/sched/signal.h>
52#include <linux/fs.h>
53#include <linux/file.h>
54#include <linux/fdtable.h>
55#include <linux/mm.h>
56#include <linux/mman.h>
57#include <linux/mmu_context.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
 60#include <linux/kthread.h>
 61#include <linux/blkdev.h>
 62#include <linux/bvec.h>
63#include <linux/net.h>
64#include <net/sock.h>
65#include <net/af_unix.h>
 66#include <net/scm.h>
67#include <linux/anon_inodes.h>
68#include <linux/sched/mm.h>
69#include <linux/uaccess.h>
70#include <linux/nospec.h>
71#include <linux/sizes.h>
72#include <linux/hugetlb.h>
 73#include <linux/highmem.h>
74#include <linux/namei.h>
75#include <linux/fsnotify.h>
 76#include <linux/fadvise.h>
 77#include <linux/eventpoll.h>
 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/io_uring.h>
81
82#include <uapi/linux/io_uring.h>
83
84#include "internal.h"
 85#include "io-wq.h"
 86
 87#define IORING_MAX_ENTRIES 32768
 88#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
89
90/*
91 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
92 */
93#define IORING_FILE_TABLE_SHIFT 9
94#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
95#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
96#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
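/*
 * Editor's note (illustrative; the lookup used elsewhere in io_uring.c
 * follows this shape): a fixed file index splits into a table index and a
 * slot index, so 64 tables of 512 slots cover IORING_MAX_FIXED_FILES files:
 *
 *	struct fixed_file_table *table =
 *		&ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
 *	struct file *file = table->files[index & IORING_FILE_TABLE_MASK];
 */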
97
98struct io_uring {
99 u32 head ____cacheline_aligned_in_smp;
100 u32 tail ____cacheline_aligned_in_smp;
101};
102
103/*
104 * This data is shared with the application through the mmap at offsets
105 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
106 *
107 * The offsets to the member fields are published through struct
108 * io_sqring_offsets when calling io_uring_setup.
109 */
110struct io_rings {
111 /*
112 * Head and tail offsets into the ring; the offsets need to be
113 * masked to get valid indices.
114 *
115 * The kernel controls head of the sq ring and the tail of the cq ring,
116 * and the application controls tail of the sq ring and the head of the
117 * cq ring.
118 */
119 struct io_uring sq, cq;
120 /*
121 * Bitmasks to apply to head and tail offsets (constant, equals
122 * ring_entries - 1)
123 */
124 u32 sq_ring_mask, cq_ring_mask;
125 /* Ring sizes (constant, power of 2) */
126 u32 sq_ring_entries, cq_ring_entries;
127 /*
128 * Number of invalid entries dropped by the kernel due to
129 * invalid index stored in array
130 *
131 * Written by the kernel, shouldn't be modified by the
132 * application (i.e. get number of "new events" by comparing to
133 * cached value).
134 *
135 * After a new SQ head value was read by the application this
136 * counter includes all submissions that were dropped reaching
137 * the new SQ head (and possibly more).
138 */
139 u32 sq_dropped;
140 /*
141 * Runtime flags
142 *
143 * Written by the kernel, shouldn't be modified by the
144 * application.
145 *
146 * The application needs a full memory barrier before checking
147 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
148 */
149 u32 sq_flags;
150 /*
151 * Number of completion events lost because the queue was full;
152 * this should be avoided by the application by making sure
153 * there are not more requests pending than there is space in
154 * the completion queue.
155 *
156 * Written by the kernel, shouldn't be modified by the
157 * application (i.e. get number of "new events" by comparing to
158 * cached value).
159 *
160 * As completion events come in out of order this counter is not
161 * ordered with any other data.
162 */
163 u32 cq_overflow;
164 /*
165 * Ring buffer of completion events.
166 *
167 * The kernel writes completion events fresh every time they are
168 * produced, so the application is allowed to modify pending
169 * entries.
170 */
171 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
172};
173
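/*
 * Editor's note (illustrative): sq.head/sq.tail and cq.head/cq.tail are
 * free-running u32 counters; a ring slot is always addressed by masking,
 * e.g.:
 *
 *	struct io_uring_cqe *cqe = &rings->cqes[tail & rings->cq_ring_mask];
 */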
174struct io_mapped_ubuf {
175 u64 ubuf;
176 size_t len;
177 struct bio_vec *bvec;
178 unsigned int nr_bvecs;
179};
180
181struct fixed_file_table {
182 struct file **files;
183};
184
185enum {
186 FFD_F_ATOMIC,
187};
188
189struct fixed_file_data {
190 struct fixed_file_table *table;
191 struct io_ring_ctx *ctx;
192
193 struct percpu_ref refs;
194 struct llist_head put_llist;
195 unsigned long state;
196 struct work_struct ref_work;
197 struct completion done;
198};
199
200struct io_ring_ctx {
201 struct {
202 struct percpu_ref refs;
203 } ____cacheline_aligned_in_smp;
204
205 struct {
206 unsigned int flags;
207 int compat: 1;
208 int account_mem: 1;
209 int cq_overflow_flushed: 1;
210 int drain_next: 1;
211 int eventfd_async: 1;
212
213 /*
214 * Ring buffer of indices into array of io_uring_sqe, which is
215 * mmapped by the application using the IORING_OFF_SQES offset.
216 *
217 * This indirection could e.g. be used to assign fixed
218 * io_uring_sqe entries to operations and only submit them to
219 * the queue when needed.
220 *
221 * The kernel modifies neither the indices array nor the entries
222 * array.
223 */
224 u32 *sq_array;
225 unsigned cached_sq_head;
226 unsigned sq_entries;
227 unsigned sq_mask;
228 unsigned sq_thread_idle;
229 unsigned cached_sq_dropped;
230 atomic_t cached_cq_overflow;
231 unsigned long sq_check_overflow;
232
233 struct list_head defer_list;
234 struct list_head timeout_list;
235 struct list_head cq_overflow_list;
236
237 wait_queue_head_t inflight_wait;
238 struct io_uring_sqe *sq_sqes;
239 } ____cacheline_aligned_in_smp;
240
241 struct io_rings *rings;
242
243 /* IO offload */
244 struct io_wq *io_wq;
245 struct task_struct *sqo_thread; /* if using sq thread polling */
246 struct mm_struct *sqo_mm;
247 wait_queue_head_t sqo_wait;
248
249 /*
250 * If used, fixed file set. Writers must ensure that ->refs is dead,
251 * readers must ensure that ->refs is alive as long as the file* is
252 * used. Only updated through io_uring_register(2).
253 */
254 struct fixed_file_data *file_data;
255 unsigned nr_user_files;
256 int ring_fd;
257 struct file *ring_file;
258
259 /* if used, fixed mapped user buffers */
260 unsigned nr_user_bufs;
261 struct io_mapped_ubuf *user_bufs;
262
263 struct user_struct *user;
264
265 const struct cred *creds;
266
267 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
268 struct completion *completions;
269
270 /* if all else fails... */
271 struct io_kiocb *fallback_req;
272
273#if defined(CONFIG_UNIX)
274 struct socket *ring_sock;
275#endif
276
277 struct idr personality_idr;
278
279 struct {
280 unsigned cached_cq_tail;
281 unsigned cq_entries;
282 unsigned cq_mask;
283 atomic_t cq_timeouts;
284 unsigned long cq_check_overflow;
285 struct wait_queue_head cq_wait;
286 struct fasync_struct *cq_fasync;
287 struct eventfd_ctx *cq_ev_fd;
288 } ____cacheline_aligned_in_smp;
289
290 struct {
291 struct mutex uring_lock;
292 wait_queue_head_t wait;
293 } ____cacheline_aligned_in_smp;
294
295 struct {
296 spinlock_t completion_lock;
297 struct llist_head poll_llist;
298
299 /*
300 * ->poll_list is protected by the ctx->uring_lock for
301 * io_uring instances that don't use IORING_SETUP_SQPOLL.
302 * For SQPOLL, only the single threaded io_sq_thread() will
303 * manipulate the list, hence no extra locking is needed there.
304 */
305 struct list_head poll_list;
306 struct hlist_head *cancel_hash;
307 unsigned cancel_hash_bits;
308 bool poll_multi_file;
309
310 spinlock_t inflight_lock;
311 struct list_head inflight_list;
312 } ____cacheline_aligned_in_smp;
313};
314
315/*
316 * First field must be the file pointer in all the
317 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
318 */
319struct io_poll_iocb {
320 struct file *file;
321 union {
322 struct wait_queue_head *head;
323 u64 addr;
324 };
325 __poll_t events;
326 bool done;
327 bool canceled;
328 struct wait_queue_entry wait;
329};
330
331struct io_close {
332 struct file *file;
333 struct file *put_file;
334 int fd;
335};
336
337struct io_timeout_data {
338 struct io_kiocb *req;
339 struct hrtimer timer;
340 struct timespec64 ts;
341 enum hrtimer_mode mode;
342 u32 seq_offset;
343};
344
345struct io_accept {
346 struct file *file;
347 struct sockaddr __user *addr;
348 int __user *addr_len;
349 int flags;
350};
351
352struct io_sync {
353 struct file *file;
354 loff_t len;
355 loff_t off;
356 int flags;
357 int mode;
358};
359
360struct io_cancel {
361 struct file *file;
362 u64 addr;
363};
364
365struct io_timeout {
366 struct file *file;
367 u64 addr;
368 int flags;
369 unsigned count;
370};
371
372struct io_rw {
373 /* NOTE: kiocb has the file as the first member, so don't do it here */
374 struct kiocb kiocb;
375 u64 addr;
376 u64 len;
377};
378
379struct io_connect {
380 struct file *file;
381 struct sockaddr __user *addr;
382 int addr_len;
383};
384
385struct io_sr_msg {
386 struct file *file;
387 union {
388 struct user_msghdr __user *msg;
389 void __user *buf;
390 };
391 int msg_flags;
392 size_t len;
393};
394
395struct io_open {
396 struct file *file;
397 int dfd;
398 union {
399 unsigned mask;
400 };
401 struct filename *filename;
402 struct statx __user *buffer;
403 struct open_how how;
404};
405
406struct io_files_update {
407 struct file *file;
408 u64 arg;
409 u32 nr_args;
410 u32 offset;
411};
412
413struct io_fadvise {
414 struct file *file;
415 u64 offset;
416 u32 len;
417 u32 advice;
418};
419
420struct io_madvise {
421 struct file *file;
422 u64 addr;
423 u32 len;
424 u32 advice;
425};
426
427struct io_epoll {
428 struct file *file;
429 int epfd;
430 int op;
431 int fd;
432 struct epoll_event event;
433};
434
435struct io_async_connect {
436 struct sockaddr_storage address;
437};
438
439struct io_async_msghdr {
440 struct iovec fast_iov[UIO_FASTIOV];
441 struct iovec *iov;
442 struct sockaddr __user *uaddr;
443 struct msghdr msg;
444};
445
446struct io_async_rw {
447 struct iovec fast_iov[UIO_FASTIOV];
448 struct iovec *iov;
449 ssize_t nr_segs;
450 ssize_t size;
451};
452
453struct io_async_open {
454 struct filename *filename;
455};
456
457struct io_async_ctx {
458 union {
459 struct io_async_rw rw;
460 struct io_async_msghdr msg;
461 struct io_async_connect connect;
462 struct io_timeout_data timeout;
463 struct io_async_open open;
464 };
465};
466
467enum {
468 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
469 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
470 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
471 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
472 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
473
474 REQ_F_LINK_NEXT_BIT,
475 REQ_F_FAIL_LINK_BIT,
476 REQ_F_INFLIGHT_BIT,
477 REQ_F_CUR_POS_BIT,
478 REQ_F_NOWAIT_BIT,
479 REQ_F_IOPOLL_COMPLETED_BIT,
480 REQ_F_LINK_TIMEOUT_BIT,
481 REQ_F_TIMEOUT_BIT,
482 REQ_F_ISREG_BIT,
483 REQ_F_MUST_PUNT_BIT,
484 REQ_F_TIMEOUT_NOSEQ_BIT,
485 REQ_F_COMP_LOCKED_BIT,
486};
487
488enum {
489 /* ctx owns file */
490 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
491 /* drain existing IO first */
492 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
493 /* linked sqes */
494 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
495 /* doesn't sever on completion < 0 */
496 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
497 /* IOSQE_ASYNC */
498 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
499
500 /* already grabbed next link */
501 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
502 /* fail rest of links */
503 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
504 /* on inflight list */
505 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
506 /* read/write uses file position */
507 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
508 /* must not punt to workers */
509 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
510 /* polled IO has completed */
511 REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
512 /* has linked timeout */
513 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
514 /* timeout request */
515 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
516 /* regular file */
517 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
518 /* must be punted even for NONBLOCK */
519 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
520 /* no timeout sequence */
521 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
522 /* completion under lock */
523 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
524};
525
526/*
527 * NOTE! Each of the iocb union members has the file pointer
528 * as the first entry in their struct definition. So you can
529 * access the file pointer through any of the sub-structs,
530 * or directly as just 'ki_filp' in this struct.
531 */
532struct io_kiocb {
533 union {
534 struct file *file;
535 struct io_rw rw;
536 struct io_poll_iocb poll;
537 struct io_accept accept;
538 struct io_sync sync;
539 struct io_cancel cancel;
540 struct io_timeout timeout;
541 struct io_connect connect;
542 struct io_sr_msg sr_msg;
543 struct io_open open;
544 struct io_close close;
545 struct io_files_update files_update;
546 struct io_fadvise fadvise;
547 struct io_madvise madvise;
548 struct io_epoll epoll;
549 };
550
551 struct io_async_ctx *io;
552 /*
553 * llist_node is only used for poll deferred completions
554 */
555 struct llist_node llist_node;
556 bool has_user;
557 bool in_async;
558 bool needs_fixed_file;
559 u8 opcode;
560
561 struct io_ring_ctx *ctx;
562 union {
563 struct list_head list;
564 struct hlist_node hash_node;
565 };
566 struct list_head link_list;
567 unsigned int flags;
568 refcount_t refs;
569 u64 user_data;
570 u32 result;
571 u32 sequence;
572
573 struct list_head inflight_entry;
574
575 struct io_wq_work work;
576};
577
578#define IO_PLUG_THRESHOLD 2
579#define IO_IOPOLL_BATCH 8
580
581struct io_submit_state {
582 struct blk_plug plug;
583
584 /*
585 * io_kiocb alloc cache
586 */
587 void *reqs[IO_IOPOLL_BATCH];
588 unsigned int free_reqs;
589 unsigned int cur_req;
590
591 /*
592 * File reference cache
593 */
594 struct file *file;
595 unsigned int fd;
596 unsigned int has_refs;
597 unsigned int used_refs;
598 unsigned int ios_left;
599};
600
601struct io_op_def {
602 /* needs req->io allocated for deferral/async */
603 unsigned async_ctx : 1;
604 /* needs current->mm setup, does mm access */
605 unsigned needs_mm : 1;
606 /* needs req->file assigned */
607 unsigned needs_file : 1;
608 /* needs req->file assigned IFF fd is >= 0 */
609 unsigned fd_non_neg : 1;
610 /* hash wq insertion if file is a regular file */
611 unsigned hash_reg_file : 1;
612 /* unbound wq insertion if file is a non-regular file */
613 unsigned unbound_nonreg_file : 1;
614 /* opcode is not supported by this kernel */
615 unsigned not_supported : 1;
616 /* needs file table */
617 unsigned file_table : 1;
618};
619
620static const struct io_op_def io_op_defs[] = {
621 [IORING_OP_NOP] = {},
622 [IORING_OP_READV] = {
623 .async_ctx = 1,
624 .needs_mm = 1,
625 .needs_file = 1,
626 .unbound_nonreg_file = 1,
627 },
628 [IORING_OP_WRITEV] = {
629 .async_ctx = 1,
630 .needs_mm = 1,
631 .needs_file = 1,
632 .hash_reg_file = 1,
633 .unbound_nonreg_file = 1,
634 },
635 [IORING_OP_FSYNC] = {
636 .needs_file = 1,
637 },
638 [IORING_OP_READ_FIXED] = {
639 .needs_file = 1,
640 .unbound_nonreg_file = 1,
641 },
642 [IORING_OP_WRITE_FIXED] = {
643 .needs_file = 1,
644 .hash_reg_file = 1,
645 .unbound_nonreg_file = 1,
646 },
647 [IORING_OP_POLL_ADD] = {
648 .needs_file = 1,
649 .unbound_nonreg_file = 1,
650 },
651 [IORING_OP_POLL_REMOVE] = {},
652 [IORING_OP_SYNC_FILE_RANGE] = {
653 .needs_file = 1,
654 },
655 [IORING_OP_SENDMSG] = {
656 .async_ctx = 1,
657 .needs_mm = 1,
658 .needs_file = 1,
659 .unbound_nonreg_file = 1,
660 },
661 [IORING_OP_RECVMSG] = {
662 .async_ctx = 1,
663 .needs_mm = 1,
664 .needs_file = 1,
665 .unbound_nonreg_file = 1,
666 },
667 [IORING_OP_TIMEOUT] = {
668 .async_ctx = 1,
669 .needs_mm = 1,
670 },
671 [IORING_OP_TIMEOUT_REMOVE] = {},
672 [IORING_OP_ACCEPT] = {
673 .needs_mm = 1,
674 .needs_file = 1,
675 .unbound_nonreg_file = 1,
676 .file_table = 1,
677 },
678 [IORING_OP_ASYNC_CANCEL] = {},
679 [IORING_OP_LINK_TIMEOUT] = {
680 .async_ctx = 1,
681 .needs_mm = 1,
682 },
683 [IORING_OP_CONNECT] = {
684 .async_ctx = 1,
685 .needs_mm = 1,
686 .needs_file = 1,
687 .unbound_nonreg_file = 1,
688 },
689 [IORING_OP_FALLOCATE] = {
690 .needs_file = 1,
691 },
692 [IORING_OP_OPENAT] = {
693 .needs_file = 1,
694 .fd_non_neg = 1,
695 .file_table = 1,
696 },
697 [IORING_OP_CLOSE] = {
698 .needs_file = 1,
699 .file_table = 1,
700 },
701 [IORING_OP_FILES_UPDATE] = {
702 .needs_mm = 1,
703 .file_table = 1,
704 },
705 [IORING_OP_STATX] = {
706 .needs_mm = 1,
707 .needs_file = 1,
708 .fd_non_neg = 1,
709 },
710 [IORING_OP_READ] = {
711 .needs_mm = 1,
712 .needs_file = 1,
713 .unbound_nonreg_file = 1,
714 },
715 [IORING_OP_WRITE] = {
716 .needs_mm = 1,
717 .needs_file = 1,
718 .unbound_nonreg_file = 1,
719 },
720 [IORING_OP_FADVISE] = {
721 .needs_file = 1,
722 },
723 [IORING_OP_MADVISE] = {
724 .needs_mm = 1,
725 },
726 [IORING_OP_SEND] = {
727 .needs_mm = 1,
728 .needs_file = 1,
729 .unbound_nonreg_file = 1,
730 },
731 [IORING_OP_RECV] = {
732 .needs_mm = 1,
733 .needs_file = 1,
734 .unbound_nonreg_file = 1,
735 },
736 [IORING_OP_OPENAT2] = {
737 .needs_file = 1,
738 .fd_non_neg = 1,
739 .file_table = 1,
740 },
741 [IORING_OP_EPOLL_CTL] = {
742 .unbound_nonreg_file = 1,
743 .file_table = 1,
744 },
745};
746
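/*
 * Editor's note (illustrative use of io_op_defs[], mirroring the checks the
 * submission path performs further down in this file):
 *
 *	const struct io_op_def *def = &io_op_defs[req->opcode];
 *
 *	if (def->needs_mm)
 *		... make sure a usable mm is set up ...
 *	if (def->needs_file)
 *		... req->file must be assigned before issue ...
 *	if (def->async_ctx)
 *		... req->io must be allocated for deferral/async ...
 */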
747static void io_wq_submit_work(struct io_wq_work **workptr);
748static void io_cqring_fill_event(struct io_kiocb *req, long res);
749static void io_put_req(struct io_kiocb *req);
750static void __io_double_put_req(struct io_kiocb *req);
751static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
752static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c
JA
753static int __io_sqe_files_update(struct io_ring_ctx *ctx,
754 struct io_uring_files_update *ip,
755 unsigned nr_args);
f86cd20c 756static int io_grab_files(struct io_kiocb *req);
de0617e4 757
2b188cc1
JA
758static struct kmem_cache *req_cachep;
759
760static const struct file_operations io_uring_fops;
761
762struct sock *io_uring_get_socket(struct file *file)
763{
764#if defined(CONFIG_UNIX)
765 if (file->f_op == &io_uring_fops) {
766 struct io_ring_ctx *ctx = file->private_data;
767
768 return ctx->ring_sock->sk;
769 }
770#endif
771 return NULL;
772}
773EXPORT_SYMBOL(io_uring_get_socket);
774
775static void io_ring_ctx_ref_free(struct percpu_ref *ref)
776{
777 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
778
206aefde 779 complete(&ctx->completions[0]);
2b188cc1
JA
780}
781
782static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
783{
784 struct io_ring_ctx *ctx;
78076bb6 785 int hash_bits;
2b188cc1
JA
786
787 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
788 if (!ctx)
789 return NULL;
790
0ddf92e8
JA
791 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
792 if (!ctx->fallback_req)
793 goto err;
794
206aefde
JA
795 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
796 if (!ctx->completions)
797 goto err;
798
799 /*
800 * Use 5 bits less than the max cq entries, that should give us around
801 * 32 entries per hash list if totally full and uniformly spread.
802 */
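	/*
	 * Editor's note, worked example: with p->cq_entries == 4096,
	 * ilog2(4096) == 12, so hash_bits == 7 and the table has
	 * 1U << 7 == 128 buckets; 4096 / 128 == 32 entries per bucket
	 * when completely full and uniformly spread.
	 */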
803 hash_bits = ilog2(p->cq_entries);
804 hash_bits -= 5;
805 if (hash_bits <= 0)
806 hash_bits = 1;
807 ctx->cancel_hash_bits = hash_bits;
808 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
809 GFP_KERNEL);
810 if (!ctx->cancel_hash)
811 goto err;
812 __hash_init(ctx->cancel_hash, 1U << hash_bits);
813
21482896 814 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
815 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
816 goto err;
2b188cc1
JA
817
818 ctx->flags = p->flags;
819 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 820 INIT_LIST_HEAD(&ctx->cq_overflow_list);
206aefde
JA
821 init_completion(&ctx->completions[0]);
822 init_completion(&ctx->completions[1]);
071698e1 823 idr_init(&ctx->personality_idr);
2b188cc1
JA
824 mutex_init(&ctx->uring_lock);
825 init_waitqueue_head(&ctx->wait);
826 spin_lock_init(&ctx->completion_lock);
e94f141b 827 init_llist_head(&ctx->poll_llist);
def596e9 828 INIT_LIST_HEAD(&ctx->poll_list);
de0617e4 829 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 830 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
831 init_waitqueue_head(&ctx->inflight_wait);
832 spin_lock_init(&ctx->inflight_lock);
833 INIT_LIST_HEAD(&ctx->inflight_list);
2b188cc1 834 return ctx;
206aefde 835err:
0ddf92e8
JA
836 if (ctx->fallback_req)
837 kmem_cache_free(req_cachep, ctx->fallback_req);
206aefde 838 kfree(ctx->completions);
78076bb6 839 kfree(ctx->cancel_hash);
206aefde
JA
840 kfree(ctx);
841 return NULL;
2b188cc1
JA
842}
843
9d858b21 844static inline bool __req_need_defer(struct io_kiocb *req)
7adf4eaf 845{
a197f664
JL
846 struct io_ring_ctx *ctx = req->ctx;
847
498ccd9e
JA
848 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
849 + atomic_read(&ctx->cached_cq_overflow);
7adf4eaf
JA
850}
851
9d858b21 852static inline bool req_need_defer(struct io_kiocb *req)
de0617e4 853{
87987898 854 if (unlikely(req->flags & REQ_F_IO_DRAIN))
9d858b21 855 return __req_need_defer(req);
de0617e4 856
9d858b21 857 return false;
de0617e4
JA
858}
859
7adf4eaf 860static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
de0617e4
JA
861{
862 struct io_kiocb *req;
863
7adf4eaf 864 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
9d858b21 865 if (req && !req_need_defer(req)) {
de0617e4
JA
866 list_del_init(&req->list);
867 return req;
868 }
869
870 return NULL;
871}
872
5262f567
JA
873static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
874{
7adf4eaf
JA
875 struct io_kiocb *req;
876
877 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
93bd25bb
JA
878 if (req) {
879 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
880 return NULL;
fb4b3d3f 881 if (!__req_need_defer(req)) {
93bd25bb
JA
882 list_del_init(&req->list);
883 return req;
884 }
7adf4eaf
JA
885 }
886
887 return NULL;
5262f567
JA
888}
889
de0617e4 890static void __io_commit_cqring(struct io_ring_ctx *ctx)
2b188cc1 891{
75b28aff 892 struct io_rings *rings = ctx->rings;
2b188cc1 893
07910158
PB
894 /* order cqe stores with ring update */
895 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
2b188cc1 896
07910158
PB
897 if (wq_has_sleeper(&ctx->cq_wait)) {
898 wake_up_interruptible(&ctx->cq_wait);
899 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
2b188cc1
JA
900 }
901}
902
cccf0ee8
JA
903static inline void io_req_work_grab_env(struct io_kiocb *req,
904 const struct io_op_def *def)
905{
906 if (!req->work.mm && def->needs_mm) {
907 mmgrab(current->mm);
908 req->work.mm = current->mm;
2b188cc1 909 }
cccf0ee8
JA
910 if (!req->work.creds)
911 req->work.creds = get_current_cred();
2b188cc1
JA
912}
913
cccf0ee8 914static inline void io_req_work_drop_env(struct io_kiocb *req)
18d9be1a 915{
cccf0ee8
JA
916 if (req->work.mm) {
917 mmdrop(req->work.mm);
918 req->work.mm = NULL;
919 }
920 if (req->work.creds) {
921 put_cred(req->work.creds);
922 req->work.creds = NULL;
923 }
561fb04a
JA
924}
925
94ae5e77
JA
926static inline bool io_prep_async_work(struct io_kiocb *req,
927 struct io_kiocb **link)
18d9be1a 928{
d3656344 929 const struct io_op_def *def = &io_op_defs[req->opcode];
561fb04a 930 bool do_hashed = false;
54a91f3b 931
d3656344
JA
932 if (req->flags & REQ_F_ISREG) {
933 if (def->hash_reg_file)
3529d8c2 934 do_hashed = true;
d3656344
JA
935 } else {
936 if (def->unbound_nonreg_file)
3529d8c2 937 req->work.flags |= IO_WQ_WORK_UNBOUND;
54a91f3b 938 }
cccf0ee8
JA
939
940 io_req_work_grab_env(req, def);
54a91f3b 941
94ae5e77 942 *link = io_prep_linked_timeout(req);
561fb04a
JA
943 return do_hashed;
944}
945
a197f664 946static inline void io_queue_async_work(struct io_kiocb *req)
561fb04a 947{
a197f664 948 struct io_ring_ctx *ctx = req->ctx;
94ae5e77
JA
949 struct io_kiocb *link;
950 bool do_hashed;
951
952 do_hashed = io_prep_async_work(req, &link);
561fb04a
JA
953
954 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
955 req->flags);
956 if (!do_hashed) {
957 io_wq_enqueue(ctx->io_wq, &req->work);
958 } else {
959 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
960 file_inode(req->file));
961 }
94ae5e77
JA
962
963 if (link)
964 io_queue_linked_timeout(link);
18d9be1a
JA
965}
966
5262f567
JA
967static void io_kill_timeout(struct io_kiocb *req)
968{
969 int ret;
970
2d28390a 971 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262f567
JA
972 if (ret != -1) {
973 atomic_inc(&req->ctx->cq_timeouts);
842f9612 974 list_del_init(&req->list);
78e19bbe 975 io_cqring_fill_event(req, 0);
ec9c02ad 976 io_put_req(req);
5262f567
JA
977 }
978}
979
980static void io_kill_timeouts(struct io_ring_ctx *ctx)
981{
982 struct io_kiocb *req, *tmp;
983
984 spin_lock_irq(&ctx->completion_lock);
985 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
986 io_kill_timeout(req);
987 spin_unlock_irq(&ctx->completion_lock);
988}
989
de0617e4
JA
990static void io_commit_cqring(struct io_ring_ctx *ctx)
991{
992 struct io_kiocb *req;
993
5262f567
JA
994 while ((req = io_get_timeout_req(ctx)) != NULL)
995 io_kill_timeout(req);
996
de0617e4
JA
997 __io_commit_cqring(ctx);
998
87987898 999 while ((req = io_get_deferred_req(ctx)) != NULL)
a197f664 1000 io_queue_async_work(req);
de0617e4
JA
1001}
1002
2b188cc1
JA
1003static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1004{
75b28aff 1005 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1006 unsigned tail;
1007
1008 tail = ctx->cached_cq_tail;
115e12e5
SB
1009 /*
1010 * writes to the cq entry need to come after reading head; the
1011 * control dependency is enough as we're using WRITE_ONCE to
1012 * fill the cq entry
1013 */
75b28aff 1014 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
2b188cc1
JA
1015 return NULL;
1016
1017 ctx->cached_cq_tail++;
75b28aff 1018 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1019}
1020
f2842ab5
JA
1021static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1022{
f0b493e6
JA
1023 if (!ctx->cq_ev_fd)
1024 return false;
f2842ab5
JA
1025 if (!ctx->eventfd_async)
1026 return true;
1027 return io_wq_current_is_worker() || in_interrupt();
1028}
1029
f0b493e6 1030static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
1d7bb1d5
JA
1031{
1032 if (waitqueue_active(&ctx->wait))
1033 wake_up(&ctx->wait);
1034 if (waitqueue_active(&ctx->sqo_wait))
1035 wake_up(&ctx->sqo_wait);
f0b493e6 1036 if (trigger_ev)
1d7bb1d5
JA
1037 eventfd_signal(ctx->cq_ev_fd, 1);
1038}
1039
f0b493e6
JA
1040static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1041{
1042 __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
1043}
1044
c4a2ed72
JA
1045/* Returns true if there are no backlogged entries after the flush */
1046static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5
JA
1047{
1048 struct io_rings *rings = ctx->rings;
1049 struct io_uring_cqe *cqe;
1050 struct io_kiocb *req;
1051 unsigned long flags;
1052 LIST_HEAD(list);
1053
1054 if (!force) {
1055 if (list_empty_careful(&ctx->cq_overflow_list))
c4a2ed72 1056 return true;
1d7bb1d5
JA
1057 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1058 rings->cq_ring_entries))
c4a2ed72 1059 return false;
1d7bb1d5
JA
1060 }
1061
1062 spin_lock_irqsave(&ctx->completion_lock, flags);
1063
1064 /* if force is set, the ring is going away. always drop after that */
1065 if (force)
69b3e546 1066 ctx->cq_overflow_flushed = 1;
1d7bb1d5 1067
c4a2ed72 1068 cqe = NULL;
1d7bb1d5
JA
1069 while (!list_empty(&ctx->cq_overflow_list)) {
1070 cqe = io_get_cqring(ctx);
1071 if (!cqe && !force)
1072 break;
1073
1074 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1075 list);
1076 list_move(&req->list, &list);
1077 if (cqe) {
1078 WRITE_ONCE(cqe->user_data, req->user_data);
1079 WRITE_ONCE(cqe->res, req->result);
1080 WRITE_ONCE(cqe->flags, 0);
1081 } else {
1082 WRITE_ONCE(ctx->rings->cq_overflow,
1083 atomic_inc_return(&ctx->cached_cq_overflow));
1084 }
1085 }
1086
1087 io_commit_cqring(ctx);
ad3eb2c8
JA
1088 if (cqe) {
1089 clear_bit(0, &ctx->sq_check_overflow);
1090 clear_bit(0, &ctx->cq_check_overflow);
1091 }
1d7bb1d5
JA
1092 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1093 io_cqring_ev_posted(ctx);
1094
1095 while (!list_empty(&list)) {
1096 req = list_first_entry(&list, struct io_kiocb, list);
1097 list_del(&req->list);
ec9c02ad 1098 io_put_req(req);
1d7bb1d5 1099 }
c4a2ed72
JA
1100
1101 return cqe != NULL;
1d7bb1d5
JA
1102}
1103
78e19bbe 1104static void io_cqring_fill_event(struct io_kiocb *req, long res)
2b188cc1 1105{
78e19bbe 1106 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1107 struct io_uring_cqe *cqe;
1108
78e19bbe 1109 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1110
2b188cc1
JA
1111 /*
1112 * If we can't get a cq entry, userspace overflowed the
1113 * submission (by quite a lot). Increment the overflow count in
1114 * the ring.
1115 */
1116 cqe = io_get_cqring(ctx);
1d7bb1d5 1117 if (likely(cqe)) {
78e19bbe 1118 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1119 WRITE_ONCE(cqe->res, res);
c71ffb67 1120 WRITE_ONCE(cqe->flags, 0);
1d7bb1d5 1121 } else if (ctx->cq_overflow_flushed) {
498ccd9e
JA
1122 WRITE_ONCE(ctx->rings->cq_overflow,
1123 atomic_inc_return(&ctx->cached_cq_overflow));
1d7bb1d5 1124 } else {
ad3eb2c8
JA
1125 if (list_empty(&ctx->cq_overflow_list)) {
1126 set_bit(0, &ctx->sq_check_overflow);
1127 set_bit(0, &ctx->cq_check_overflow);
1128 }
1d7bb1d5
JA
1129 refcount_inc(&req->refs);
1130 req->result = res;
1131 list_add_tail(&req->list, &ctx->cq_overflow_list);
2b188cc1
JA
1132 }
1133}
1134
78e19bbe 1135static void io_cqring_add_event(struct io_kiocb *req, long res)
2b188cc1 1136{
78e19bbe 1137 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1138 unsigned long flags;
1139
1140 spin_lock_irqsave(&ctx->completion_lock, flags);
78e19bbe 1141 io_cqring_fill_event(req, res);
2b188cc1
JA
1142 io_commit_cqring(ctx);
1143 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1144
8c838788 1145 io_cqring_ev_posted(ctx);
2b188cc1
JA
1146}
1147
1148static inline bool io_is_fallback_req(struct io_kiocb *req)
1149{
1150 return req == (struct io_kiocb *)
1151 ((unsigned long) req->ctx->fallback_req & ~1UL);
1152}
1153
1154static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1155{
1156 struct io_kiocb *req;
1157
1158 req = ctx->fallback_req;
1159 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1160 return req;
1161
1162 return NULL;
1163}
1164
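/*
 * Editor's note: the single pre-allocated fallback request is handed out to
 * at most one user at a time.  Bit 0 of the (pointer-aligned) fallback_req
 * address doubles as a busy flag: test_and_set_bit_lock() above claims it,
 * clear_bit_unlock() in __io_req_do_free() releases it, and
 * io_is_fallback_req() masks the bit off before comparing pointers.
 */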
1165static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1166 struct io_submit_state *state)
2b188cc1 1167{
fd6fab2c 1168 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2b188cc1
JA
1169 struct io_kiocb *req;
1170
2579f913 1171 if (!state) {
fd6fab2c 1172 req = kmem_cache_alloc(req_cachep, gfp);
2579f913 1173 if (unlikely(!req))
0ddf92e8 1174 goto fallback;
2579f913
JA
1175 } else if (!state->free_reqs) {
1176 size_t sz;
1177 int ret;
1178
1179 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
fd6fab2c
JA
1180 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1181
1182 /*
1183 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1184 * retry single alloc to be on the safe side.
1185 */
1186 if (unlikely(ret <= 0)) {
1187 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1188 if (!state->reqs[0])
0ddf92e8 1189 goto fallback;
fd6fab2c
JA
1190 ret = 1;
1191 }
2579f913
JA
1192 state->free_reqs = ret - 1;
1193 state->cur_req = 1;
1194 req = state->reqs[0];
1195 } else {
1196 req = state->reqs[state->cur_req];
1197 state->free_reqs--;
1198 state->cur_req++;
2b188cc1
JA
1199 }
1200
0ddf92e8 1201got_it:
1a6b74fc 1202 req->io = NULL;
60c112b0 1203 req->file = NULL;
2579f913
JA
1204 req->ctx = ctx;
1205 req->flags = 0;
e65ef56d
JA
1206 /* one is dropped after submission, the other at completion */
1207 refcount_set(&req->refs, 2);
9e645e11 1208 req->result = 0;
561fb04a 1209 INIT_IO_WORK(&req->work, io_wq_submit_work);
2579f913 1210 return req;
0ddf92e8
JA
1211fallback:
1212 req = io_get_fallback_req(ctx);
1213 if (req)
1214 goto got_it;
6805b32e 1215 percpu_ref_put(&ctx->refs);
2b188cc1
JA
1216 return NULL;
1217}
1218
2b85edfc 1219static void __io_req_do_free(struct io_kiocb *req)
def596e9 1220{
2b85edfc
PB
1221 if (likely(!io_is_fallback_req(req)))
1222 kmem_cache_free(req_cachep, req);
1223 else
1224 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1225}
1226
c6ca97b3 1227static void __io_req_aux_free(struct io_kiocb *req)
2b188cc1 1228{
fcb323cc
JA
1229 struct io_ring_ctx *ctx = req->ctx;
1230
96fd84d8 1231 kfree(req->io);
05f3fb3c
JA
1232 if (req->file) {
1233 if (req->flags & REQ_F_FIXED_FILE)
1234 percpu_ref_put(&ctx->file_data->refs);
1235 else
1236 fput(req->file);
def596e9 1237 }
cccf0ee8
JA
1238
1239 io_req_work_drop_env(req);
def596e9
JA
1240}
1241
9e645e11 1242static void __io_free_req(struct io_kiocb *req)
2b188cc1 1243{
c6ca97b3 1244 __io_req_aux_free(req);
fcb323cc 1245
fcb323cc 1246 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3 1247 struct io_ring_ctx *ctx = req->ctx;
fcb323cc
JA
1248 unsigned long flags;
1249
1250 spin_lock_irqsave(&ctx->inflight_lock, flags);
1251 list_del(&req->inflight_entry);
1252 if (waitqueue_active(&ctx->inflight_wait))
1253 wake_up(&ctx->inflight_wait);
1254 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1255 }
2b85edfc
PB
1256
1257 percpu_ref_put(&req->ctx->refs);
1258 __io_req_do_free(req);
e65ef56d
JA
1259}
1260
c6ca97b3
JA
1261struct req_batch {
1262 void *reqs[IO_IOPOLL_BATCH];
1263 int to_free;
1264 int need_iter;
1265};
1266
1267static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1268{
10fef4be
JA
1269 int fixed_refs = rb->to_free;
1270
c6ca97b3
JA
1271 if (!rb->to_free)
1272 return;
1273 if (rb->need_iter) {
1274 int i, inflight = 0;
1275 unsigned long flags;
1276
10fef4be 1277 fixed_refs = 0;
c6ca97b3
JA
1278 for (i = 0; i < rb->to_free; i++) {
1279 struct io_kiocb *req = rb->reqs[i];
1280
10fef4be 1281 if (req->flags & REQ_F_FIXED_FILE) {
c6ca97b3 1282 req->file = NULL;
10fef4be
JA
1283 fixed_refs++;
1284 }
c6ca97b3
JA
1285 if (req->flags & REQ_F_INFLIGHT)
1286 inflight++;
c6ca97b3
JA
1287 __io_req_aux_free(req);
1288 }
1289 if (!inflight)
1290 goto do_free;
1291
1292 spin_lock_irqsave(&ctx->inflight_lock, flags);
1293 for (i = 0; i < rb->to_free; i++) {
1294 struct io_kiocb *req = rb->reqs[i];
1295
10fef4be 1296 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3
JA
1297 list_del(&req->inflight_entry);
1298 if (!--inflight)
1299 break;
1300 }
1301 }
1302 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1303
1304 if (waitqueue_active(&ctx->inflight_wait))
1305 wake_up(&ctx->inflight_wait);
1306 }
1307do_free:
1308 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
10fef4be
JA
1309 if (fixed_refs)
1310 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
c6ca97b3 1311 percpu_ref_put_many(&ctx->refs, rb->to_free);
c6ca97b3 1312 rb->to_free = rb->need_iter = 0;
e65ef56d
JA
1313}
1314
a197f664 1315static bool io_link_cancel_timeout(struct io_kiocb *req)
2665abfd 1316{
a197f664 1317 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1318 int ret;
1319
2d28390a 1320 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2665abfd 1321 if (ret != -1) {
78e19bbe 1322 io_cqring_fill_event(req, -ECANCELED);
2665abfd
JA
1323 io_commit_cqring(ctx);
1324 req->flags &= ~REQ_F_LINK;
ec9c02ad 1325 io_put_req(req);
2665abfd
JA
1326 return true;
1327 }
1328
1329 return false;
e65ef56d
JA
1330}
1331
ba816ad6 1332static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
9e645e11 1333{
2665abfd 1334 struct io_ring_ctx *ctx = req->ctx;
2665abfd 1335 bool wake_ev = false;
9e645e11 1336
4d7dd462
JA
1337 /* Already got next link */
1338 if (req->flags & REQ_F_LINK_NEXT)
1339 return;
1340
9e645e11
JA
1341 /*
1342 * The list should never be empty when we are called here. But could
1343 * potentially happen if the chain is messed up, check to be on the
1344 * safe side.
1345 */
4493233e
PB
1346 while (!list_empty(&req->link_list)) {
1347 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1348 struct io_kiocb, link_list);
94ae5e77 1349
4493233e
PB
1350 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1351 (nxt->flags & REQ_F_TIMEOUT))) {
1352 list_del_init(&nxt->link_list);
94ae5e77 1353 wake_ev |= io_link_cancel_timeout(nxt);
94ae5e77
JA
1354 req->flags &= ~REQ_F_LINK_TIMEOUT;
1355 continue;
1356 }
9e645e11 1357
4493233e
PB
1358 list_del_init(&req->link_list);
1359 if (!list_empty(&nxt->link_list))
1360 nxt->flags |= REQ_F_LINK;
b18fdf71 1361 *nxtptr = nxt;
94ae5e77 1362 break;
9e645e11 1363 }
2665abfd 1364
4d7dd462 1365 req->flags |= REQ_F_LINK_NEXT;
2665abfd
JA
1366 if (wake_ev)
1367 io_cqring_ev_posted(ctx);
9e645e11
JA
1368}
1369
1370/*
1371 * Called if REQ_F_LINK is set, and we fail the head request
1372 */
1373static void io_fail_links(struct io_kiocb *req)
1374{
2665abfd 1375 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1376 unsigned long flags;
1377
1378 spin_lock_irqsave(&ctx->completion_lock, flags);
9e645e11
JA
1379
1380 while (!list_empty(&req->link_list)) {
4493233e
PB
1381 struct io_kiocb *link = list_first_entry(&req->link_list,
1382 struct io_kiocb, link_list);
9e645e11 1383
4493233e 1384 list_del_init(&link->link_list);
c826bd7a 1385 trace_io_uring_fail_link(req, link);
2665abfd
JA
1386
1387 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
d625c6ee 1388 link->opcode == IORING_OP_LINK_TIMEOUT) {
a197f664 1389 io_link_cancel_timeout(link);
2665abfd 1390 } else {
78e19bbe 1391 io_cqring_fill_event(link, -ECANCELED);
978db57e 1392 __io_double_put_req(link);
2665abfd 1393 }
5d960724 1394 req->flags &= ~REQ_F_LINK_TIMEOUT;
9e645e11 1395 }
2665abfd
JA
1396
1397 io_commit_cqring(ctx);
1398 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1399 io_cqring_ev_posted(ctx);
9e645e11
JA
1400}
1401
4d7dd462 1402static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
9e645e11 1403{
4d7dd462 1404 if (likely(!(req->flags & REQ_F_LINK)))
2665abfd 1405 return;
2665abfd 1406
9e645e11
JA
1407 /*
1408 * If LINK is set, we have dependent requests in this chain. If we
1409 * didn't fail this request, queue the first one up, moving any other
1410 * dependencies to the next request. In case of failure, fail the rest
1411 * of the chain.
1412 */
2665abfd
JA
1413 if (req->flags & REQ_F_FAIL_LINK) {
1414 io_fail_links(req);
7c9e7f0f
JA
1415 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1416 REQ_F_LINK_TIMEOUT) {
2665abfd
JA
1417 struct io_ring_ctx *ctx = req->ctx;
1418 unsigned long flags;
1419
1420 /*
1421 * If this is a timeout link, we could be racing with the
1422 * timeout timer. Grab the completion lock for this case to
7c9e7f0f 1423 * protect against that.
2665abfd
JA
1424 */
1425 spin_lock_irqsave(&ctx->completion_lock, flags);
1426 io_req_link_next(req, nxt);
1427 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1428 } else {
1429 io_req_link_next(req, nxt);
9e645e11 1430 }
4d7dd462 1431}
9e645e11 1432
c69f8dbe
JL
1433static void io_free_req(struct io_kiocb *req)
1434{
944e58bf
PB
1435 struct io_kiocb *nxt = NULL;
1436
1437 io_req_find_next(req, &nxt);
70cf9f32 1438 __io_free_req(req);
944e58bf
PB
1439
1440 if (nxt)
1441 io_queue_async_work(nxt);
c69f8dbe
JL
1442}
1443
ba816ad6
JA
1444/*
1445 * Drop reference to request, return next in chain (if there is one) if this
1446 * was the last reference to this request.
1447 */
f9bd67f6 1448__attribute__((nonnull))
ec9c02ad 1449static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
e65ef56d 1450{
f9bd67f6 1451 io_req_find_next(req, nxtptr);
4d7dd462 1452
e65ef56d 1453 if (refcount_dec_and_test(&req->refs))
4d7dd462 1454 __io_free_req(req);
2b188cc1
JA
1455}
1456
e65ef56d
JA
1457static void io_put_req(struct io_kiocb *req)
1458{
1459 if (refcount_dec_and_test(&req->refs))
1460 io_free_req(req);
2b188cc1
JA
1461}
1462
978db57e
JA
1463/*
1464 * Must only be used if we don't need to care about links, usually from
1465 * within the completion handling itself.
1466 */
1467static void __io_double_put_req(struct io_kiocb *req)
78e19bbe
JA
1468{
1469 /* drop both submit and complete references */
1470 if (refcount_sub_and_test(2, &req->refs))
1471 __io_free_req(req);
1472}
1473
978db57e
JA
1474static void io_double_put_req(struct io_kiocb *req)
1475{
1476 /* drop both submit and complete references */
1477 if (refcount_sub_and_test(2, &req->refs))
1478 io_free_req(req);
1479}
1480
1d7bb1d5 1481static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
a3a0e43f 1482{
84f97dc2
JA
1483 struct io_rings *rings = ctx->rings;
1484
ad3eb2c8
JA
1485 if (test_bit(0, &ctx->cq_check_overflow)) {
1486 /*
1487 * noflush == true is from the waitqueue handler, just ensure
1488 * we wake up the task, and the next invocation will flush the
1489 * entries. We cannot safely do it from here.
1490 */
1491 if (noflush && !list_empty(&ctx->cq_overflow_list))
1492 return -1U;
1d7bb1d5 1493
ad3eb2c8
JA
1494 io_cqring_overflow_flush(ctx, false);
1495 }
1d7bb1d5 1496
a3a0e43f
JA
1497 /* See comment at the top of this file */
1498 smp_rmb();
ad3eb2c8 1499 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
a3a0e43f
JA
1500}
1501
fb5ccc98
PB
1502static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1503{
1504 struct io_rings *rings = ctx->rings;
1505
1506 /* make sure SQ entry isn't read before tail */
1507 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1508}
1509
8237e045 1510static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
e94f141b 1511{
c6ca97b3
JA
1512 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1513 return false;
e94f141b 1514
c6ca97b3
JA
1515 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1516 rb->need_iter++;
1517
1518 rb->reqs[rb->to_free++] = req;
1519 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1520 io_free_req_many(req->ctx, rb);
1521 return true;
e94f141b
JA
1522}
1523
def596e9
JA
1524/*
1525 * Find and free completed poll iocbs
1526 */
1527static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1528 struct list_head *done)
1529{
8237e045 1530 struct req_batch rb;
def596e9 1531 struct io_kiocb *req;
def596e9 1532
c6ca97b3 1533 rb.to_free = rb.need_iter = 0;
def596e9
JA
1534 while (!list_empty(done)) {
1535 req = list_first_entry(done, struct io_kiocb, list);
1536 list_del(&req->list);
1537
78e19bbe 1538 io_cqring_fill_event(req, req->result);
def596e9
JA
1539 (*nr_events)++;
1540
8237e045
JA
1541 if (refcount_dec_and_test(&req->refs) &&
1542 !io_req_multi_free(&rb, req))
1543 io_free_req(req);
def596e9 1544 }
def596e9 1545
09bb8394 1546 io_commit_cqring(ctx);
8237e045 1547 io_free_req_many(ctx, &rb);
def596e9
JA
1548}
1549
1550static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1551 long min)
1552{
1553 struct io_kiocb *req, *tmp;
1554 LIST_HEAD(done);
1555 bool spin;
1556 int ret;
1557
1558 /*
1559 * Only spin for completions if we don't have multiple devices hanging
1560 * off our complete list, and we're under the requested amount.
1561 */
1562 spin = !ctx->poll_multi_file && *nr_events < min;
1563
1564 ret = 0;
1565 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
9adbd45d 1566 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
1567
1568 /*
1569 * Move completed entries to our local list. If we find a
1570 * request that requires polling, break out and complete
1571 * the done list first, if we have entries there.
1572 */
1573 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1574 list_move_tail(&req->list, &done);
1575 continue;
1576 }
1577 if (!list_empty(&done))
1578 break;
1579
1580 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1581 if (ret < 0)
1582 break;
1583
1584 if (ret && spin)
1585 spin = false;
1586 ret = 0;
1587 }
1588
1589 if (!list_empty(&done))
1590 io_iopoll_complete(ctx, nr_events, &done);
1591
1592 return ret;
1593}
1594
1595/*
d195a66e 1596 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
1597 * non-spinning poll check - we'll still enter the driver poll loop, but only
1598 * as a non-spinning completion check.
1599 */
1600static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1601 long min)
1602{
08f5439f 1603 while (!list_empty(&ctx->poll_list) && !need_resched()) {
def596e9
JA
1604 int ret;
1605
1606 ret = io_do_iopoll(ctx, nr_events, min);
1607 if (ret < 0)
1608 return ret;
1609 if (!min || *nr_events >= min)
1610 return 0;
1611 }
1612
1613 return 1;
1614}
1615
1616/*
1617 * We can't just wait for polled events to come to us, we have to actively
1618 * find and complete them.
1619 */
1620static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1621{
1622 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1623 return;
1624
1625 mutex_lock(&ctx->uring_lock);
1626 while (!list_empty(&ctx->poll_list)) {
1627 unsigned int nr_events = 0;
1628
1629 io_iopoll_getevents(ctx, &nr_events, 1);
08f5439f
JA
1630
1631 /*
1632 * Ensure we allow local-to-the-cpu processing to take place,
1633 * in this case we need to ensure that we reap all events.
1634 */
1635 cond_resched();
def596e9
JA
1636 }
1637 mutex_unlock(&ctx->uring_lock);
1638}
1639
2b2ed975
JA
1640static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1641 long min)
def596e9 1642{
2b2ed975 1643 int iters = 0, ret = 0;
500f9fba 1644
def596e9
JA
1645 do {
1646 int tmin = 0;
1647
a3a0e43f
JA
1648 /*
1649 * Don't enter poll loop if we already have events pending.
1650 * If we do, we can potentially be spinning for commands that
1651 * already triggered a CQE (eg in error).
1652 */
1d7bb1d5 1653 if (io_cqring_events(ctx, false))
a3a0e43f
JA
1654 break;
1655
500f9fba
JA
1656 /*
1657 * If a submit got punted to a workqueue, we can have the
1658 * application entering polling for a command before it gets
1659 * issued. That app will hold the uring_lock for the duration
1660 * of the poll right here, so we need to take a breather every
1661 * now and then to ensure that the issue has a chance to add
1662 * the poll to the issued list. Otherwise we can spin here
1663 * forever, while the workqueue is stuck trying to acquire the
1664 * very same mutex.
1665 */
1666 if (!(++iters & 7)) {
1667 mutex_unlock(&ctx->uring_lock);
1668 mutex_lock(&ctx->uring_lock);
1669 }
1670
def596e9
JA
1671 if (*nr_events < min)
1672 tmin = min - *nr_events;
1673
1674 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1675 if (ret <= 0)
1676 break;
1677 ret = 0;
1678 } while (min && !*nr_events && !need_resched());
1679
2b2ed975
JA
1680 return ret;
1681}
1682
1683static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1684 long min)
1685{
1686 int ret;
1687
1688 /*
1689 * We disallow the app entering submit/complete with polling, but we
1690 * still need to lock the ring to prevent racing with polled issue
1691 * that got punted to a workqueue.
1692 */
1693 mutex_lock(&ctx->uring_lock);
1694 ret = __io_iopoll_check(ctx, nr_events, min);
500f9fba 1695 mutex_unlock(&ctx->uring_lock);
def596e9
JA
1696 return ret;
1697}
1698
491381ce 1699static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 1700{
491381ce
JA
1701 /*
1702 * Tell lockdep we inherited freeze protection from submission
1703 * thread.
1704 */
1705 if (req->flags & REQ_F_ISREG) {
1706 struct inode *inode = file_inode(req->file);
2b188cc1 1707
491381ce 1708 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 1709 }
491381ce 1710 file_end_write(req->file);
2b188cc1
JA
1711}
1712
4e88d6e7
JA
1713static inline void req_set_fail_links(struct io_kiocb *req)
1714{
1715 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1716 req->flags |= REQ_F_FAIL_LINK;
1717}
1718
ba816ad6 1719static void io_complete_rw_common(struct kiocb *kiocb, long res)
2b188cc1 1720{
9adbd45d 1721 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2b188cc1 1722
491381ce
JA
1723 if (kiocb->ki_flags & IOCB_WRITE)
1724 kiocb_end_write(req);
2b188cc1 1725
4e88d6e7
JA
1726 if (res != req->result)
1727 req_set_fail_links(req);
78e19bbe 1728 io_cqring_add_event(req, res);
ba816ad6
JA
1729}
1730
1731static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1732{
9adbd45d 1733 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6
JA
1734
1735 io_complete_rw_common(kiocb, res);
e65ef56d 1736 io_put_req(req);
2b188cc1
JA
1737}
1738
ba816ad6
JA
1739static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1740{
9adbd45d 1741 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ec9c02ad 1742 struct io_kiocb *nxt = NULL;
ba816ad6
JA
1743
1744 io_complete_rw_common(kiocb, res);
ec9c02ad
JL
1745 io_put_req_find_next(req, &nxt);
1746
1747 return nxt;
2b188cc1
JA
1748}
1749
def596e9
JA
1750static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1751{
9adbd45d 1752 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 1753
491381ce
JA
1754 if (kiocb->ki_flags & IOCB_WRITE)
1755 kiocb_end_write(req);
def596e9 1756
4e88d6e7
JA
1757 if (res != req->result)
1758 req_set_fail_links(req);
9e645e11 1759 req->result = res;
def596e9
JA
1760 if (res != -EAGAIN)
1761 req->flags |= REQ_F_IOPOLL_COMPLETED;
1762}
1763
1764/*
1765 * After the iocb has been issued, it's safe to be found on the poll list.
1766 * Adding the kiocb to the list AFTER submission ensures that we don't
1767 * find it from a io_iopoll_getevents() thread before the issuer is done
1768 * accessing the kiocb cookie.
1769 */
1770static void io_iopoll_req_issued(struct io_kiocb *req)
1771{
1772 struct io_ring_ctx *ctx = req->ctx;
1773
1774 /*
1775 * Track whether we have multiple files in our lists. This will impact
1776 * how we do polling eventually, not spinning if we're on potentially
1777 * different devices.
1778 */
1779 if (list_empty(&ctx->poll_list)) {
1780 ctx->poll_multi_file = false;
1781 } else if (!ctx->poll_multi_file) {
1782 struct io_kiocb *list_req;
1783
1784 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1785 list);
9adbd45d 1786 if (list_req->file != req->file)
def596e9
JA
1787 ctx->poll_multi_file = true;
1788 }
1789
1790 /*
1791 * For fast devices, IO may have already completed. If it has, add
1792 * it to the front so we find it first.
1793 */
1794 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1795 list_add(&req->list, &ctx->poll_list);
1796 else
1797 list_add_tail(&req->list, &ctx->poll_list);
1798}
1799
3d6770fb 1800static void io_file_put(struct io_submit_state *state)
9a56a232 1801{
3d6770fb 1802 if (state->file) {
9a56a232
JA
1803 int diff = state->has_refs - state->used_refs;
1804
1805 if (diff)
1806 fput_many(state->file, diff);
1807 state->file = NULL;
1808 }
1809}
1810
1811/*
1812 * Get as many references to a file as we have IOs left in this submission,
1813 * assuming most submissions are for one file, or at least that each file
1814 * has more than one submission.
1815 */
1816static struct file *io_file_get(struct io_submit_state *state, int fd)
1817{
1818 if (!state)
1819 return fget(fd);
1820
1821 if (state->file) {
1822 if (state->fd == fd) {
1823 state->used_refs++;
1824 state->ios_left--;
1825 return state->file;
1826 }
3d6770fb 1827 io_file_put(state);
9a56a232
JA
1828 }
1829 state->file = fget_many(fd, state->ios_left);
1830 if (!state->file)
1831 return NULL;
1832
1833 state->fd = fd;
1834 state->has_refs = state->ios_left;
1835 state->used_refs = 1;
1836 state->ios_left--;
1837 return state->file;
1838}
1839
2b188cc1
JA
1840/*
1841 * If we tracked the file through the SCM inflight mechanism, we could support
1842 * any file. For now, just ensure that anything potentially problematic is done
1843 * inline.
1844 */
1845static bool io_file_supports_async(struct file *file)
1846{
1847 umode_t mode = file_inode(file)->i_mode;
1848
10d59345 1849 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1
JA
1850 return true;
1851 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1852 return true;
1853
1854 return false;
1855}
1856
3529d8c2
JA
1857static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1858 bool force_nonblock)
2b188cc1 1859{
def596e9 1860 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 1861 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
1862 unsigned ioprio;
1863 int ret;
2b188cc1 1864
491381ce
JA
1865 if (S_ISREG(file_inode(req->file)->i_mode))
1866 req->flags |= REQ_F_ISREG;
1867
2b188cc1 1868 kiocb->ki_pos = READ_ONCE(sqe->off);
ba04291e
JA
1869 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1870 req->flags |= REQ_F_CUR_POS;
1871 kiocb->ki_pos = req->file->f_pos;
1872 }
2b188cc1
JA
1873 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1874 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1875
1876 ioprio = READ_ONCE(sqe->ioprio);
1877 if (ioprio) {
1878 ret = ioprio_check_cap(ioprio);
1879 if (ret)
09bb8394 1880 return ret;
2b188cc1
JA
1881
1882 kiocb->ki_ioprio = ioprio;
1883 } else
1884 kiocb->ki_ioprio = get_current_ioprio();
1885
1886 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1887 if (unlikely(ret))
09bb8394 1888 return ret;
8449eeda
SB
1889
1890 /* don't allow async punt if RWF_NOWAIT was requested */
491381ce
JA
1891 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1892 (req->file->f_flags & O_NONBLOCK))
8449eeda
SB
1893 req->flags |= REQ_F_NOWAIT;
1894
1895 if (force_nonblock)
2b188cc1 1896 kiocb->ki_flags |= IOCB_NOWAIT;
8449eeda 1897
def596e9 1898 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
1899 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1900 !kiocb->ki_filp->f_op->iopoll)
09bb8394 1901 return -EOPNOTSUPP;
2b188cc1 1902
def596e9
JA
1903 kiocb->ki_flags |= IOCB_HIPRI;
1904 kiocb->ki_complete = io_complete_rw_iopoll;
6873e0bd 1905 req->result = 0;
def596e9 1906 } else {
09bb8394
JA
1907 if (kiocb->ki_flags & IOCB_HIPRI)
1908 return -EINVAL;
def596e9
JA
1909 kiocb->ki_complete = io_complete_rw;
1910 }
9adbd45d 1911
3529d8c2
JA
1912 req->rw.addr = READ_ONCE(sqe->addr);
1913 req->rw.len = READ_ONCE(sqe->len);
9adbd45d
JA
1914 /* we own ->private, reuse it for the buffer index */
1915 req->rw.kiocb.private = (void *) (unsigned long)
3529d8c2 1916 READ_ONCE(sqe->buf_index);
2b188cc1 1917 return 0;
2b188cc1
JA
1918}
1919
1920static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1921{
1922 switch (ret) {
1923 case -EIOCBQUEUED:
1924 break;
1925 case -ERESTARTSYS:
1926 case -ERESTARTNOINTR:
1927 case -ERESTARTNOHAND:
1928 case -ERESTART_RESTARTBLOCK:
1929 /*
1930 * We can't just restart the syscall, since previously
1931 * submitted sqes may already be in progress. Just fail this
1932 * IO with EINTR.
1933 */
1934 ret = -EINTR;
1935 /* fall through */
1936 default:
1937 kiocb->ki_complete(kiocb, ret, 0);
1938 }
1939}
1940
ba816ad6
JA
1941static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1942 bool in_async)
1943{
ba04291e
JA
1944 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1945
1946 if (req->flags & REQ_F_CUR_POS)
1947 req->file->f_pos = kiocb->ki_pos;
f9bd67f6 1948 if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
ba816ad6
JA
1949 *nxt = __io_complete_rw(kiocb, ret);
1950 else
1951 io_rw_done(kiocb, ret);
1952}
1953
9adbd45d 1954static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
7d009165 1955 struct iov_iter *iter)
edafccee 1956{
9adbd45d
JA
1957 struct io_ring_ctx *ctx = req->ctx;
1958 size_t len = req->rw.len;
edafccee
JA
1959 struct io_mapped_ubuf *imu;
1960 unsigned index, buf_index;
1961 size_t offset;
1962 u64 buf_addr;
1963
1964 /* attempt to use fixed buffers without having provided iovecs */
1965 if (unlikely(!ctx->user_bufs))
1966 return -EFAULT;
1967
9adbd45d 1968 buf_index = (unsigned long) req->rw.kiocb.private;
edafccee
JA
1969 if (unlikely(buf_index >= ctx->nr_user_bufs))
1970 return -EFAULT;
1971
1972 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1973 imu = &ctx->user_bufs[index];
9adbd45d 1974 buf_addr = req->rw.addr;
edafccee
JA
1975
1976 /* overflow */
1977 if (buf_addr + len < buf_addr)
1978 return -EFAULT;
1979 /* not inside the mapped region */
1980 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1981 return -EFAULT;
1982
1983 /*
1984 * The address may not be the start of the buffer; set the size
1985 * appropriately and advance the iterator to the right offset.
1986 */
1987 offset = buf_addr - imu->ubuf;
1988 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
1989
1990 if (offset) {
1991 /*
1992 * Don't use iov_iter_advance() here, as it's really slow when
1993 * skipping into the latter parts of a big fixed buffer - it iterates
1994 * over each segment manually. We can cheat a bit here, because
1995 * we know that:
1996 *
1997 * 1) it's a BVEC iter, we set it up
1998 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1999 * first and last bvec
2000 *
2001 * So just find our index, and adjust the iterator afterwards.
2003 * If the offset is within the first bvec (or covers the whole first
2004 * bvec), just use iov_iter_advance(). This makes it easier
2004 * since we can just skip the first segment, which may not
2005 * be PAGE_SIZE aligned.
2006 */
2007 const struct bio_vec *bvec = imu->bvec;
2008
2009 if (offset <= bvec->bv_len) {
2010 iov_iter_advance(iter, offset);
2011 } else {
2012 unsigned long seg_skip;
2013
2014 /* skip first vec */
2015 offset -= bvec->bv_len;
2016 seg_skip = 1 + (offset >> PAGE_SHIFT);
2017
2018 iter->bvec = bvec + seg_skip;
2019 iter->nr_segs -= seg_skip;
99c79f66 2020 iter->count -= bvec->bv_len + offset;
bd11b3a3 2021 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2022 }
2023 }
2024
5e559561 2025 return len;
edafccee
JA
2026}
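The segment-skip arithmetic in that comment can be checked with a short standalone calculation (a sketch assuming 4 KiB pages and a possibly short first bvec; names are made up):

#include <stdio.h>

#define TOY_PAGE_SHIFT	12			/* assume 4 KiB pages */
#define TOY_PAGE_SIZE	(1UL << TOY_PAGE_SHIFT)

/*
 * Given an offset into a registered buffer whose first segment holds
 * first_len bytes and whose remaining segments are page sized, report
 * how many segments to skip and where to start inside the target one.
 */
static void locate(unsigned long offset, unsigned long first_len)
{
	unsigned long seg_skip = 0, seg_off = offset;

	if (offset > first_len) {
		offset -= first_len;
		seg_skip = 1 + (offset >> TOY_PAGE_SHIFT);
		seg_off = offset & (TOY_PAGE_SIZE - 1);
	}
	printf("skip %lu segment(s), start %lu bytes in\n", seg_skip, seg_off);
}

int main(void)
{
	locate(100, 512);	/* inside the short first segment */
	locate(9000, 512);	/* past the first segment and two full pages: skip 3 */
	return 0;
}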
2027
cf6fd4bd
PB
2028static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2029 struct iovec **iovec, struct iov_iter *iter)
2b188cc1 2030{
9adbd45d
JA
2031 void __user *buf = u64_to_user_ptr(req->rw.addr);
2032 size_t sqe_len = req->rw.len;
edafccee
JA
2033 u8 opcode;
2034
d625c6ee 2035 opcode = req->opcode;
7d009165 2036 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2037 *iovec = NULL;
9adbd45d 2038 return io_import_fixed(req, rw, iter);
edafccee 2039 }
2b188cc1 2040
9adbd45d
JA
2041 /* buffer index only valid with fixed read/write */
2042 if (req->rw.kiocb.private)
2043 return -EINVAL;
2044
3a6820f2
JA
2045 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2046 ssize_t ret;
2047 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2048 *iovec = NULL;
2049 return ret;
2050 }
2051
f67676d1
JA
2052 if (req->io) {
2053 struct io_async_rw *iorw = &req->io->rw;
2054
2055 *iovec = iorw->iov;
2056 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2057 if (iorw->iov == iorw->fast_iov)
2058 *iovec = NULL;
2059 return iorw->size;
2060 }
2061
cf6fd4bd 2062 if (!req->has_user)
2b188cc1
JA
2063 return -EFAULT;
2064
2065#ifdef CONFIG_COMPAT
cf6fd4bd 2066 if (req->ctx->compat)
2b188cc1
JA
2067 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2068 iovec, iter);
2069#endif
2070
2071 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2072}
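From userspace, the fixed-buffer branch corresponds to registering buffers once and then naming them by index in each sqe. A minimal liburing sketch, assuming liburing is installed and /etc/hostname is readable:

#include <liburing.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	char *buf;
	int fd;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* register one 4 KiB buffer; READ_FIXED then names it by index */
	buf = malloc(4096);
	if (!buf)
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;
	if (io_uring_register_buffers(&ring, &iov, 1) < 0)
		return 1;

	fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, buf, 4096, 0, 0);	/* buf_index 0 */
	io_uring_submit(&ring);

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read %d bytes via the fixed buffer\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}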
2073
31b51510 2074/*
32960613
JA
2075 * For files that don't have ->read_iter() and ->write_iter(), handle them
2076 * by looping over ->read() or ->write() manually.
31b51510 2077 */
32960613
JA
2078static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2079 struct iov_iter *iter)
2080{
2081 ssize_t ret = 0;
2082
2083 /*
2084 * Don't support polled IO through this interface, and we can't
2085 * support non-blocking either. For the latter, this just causes
2086 * the kiocb to be handled from an async context.
2087 */
2088 if (kiocb->ki_flags & IOCB_HIPRI)
2089 return -EOPNOTSUPP;
2090 if (kiocb->ki_flags & IOCB_NOWAIT)
2091 return -EAGAIN;
2092
2093 while (iov_iter_count(iter)) {
311ae9e1 2094 struct iovec iovec;
32960613
JA
2095 ssize_t nr;
2096
311ae9e1
PB
2097 if (!iov_iter_is_bvec(iter)) {
2098 iovec = iov_iter_iovec(iter);
2099 } else {
2100 /* fixed buffers import bvec */
2101 iovec.iov_base = kmap(iter->bvec->bv_page)
2102 + iter->iov_offset;
2103 iovec.iov_len = min(iter->count,
2104 iter->bvec->bv_len - iter->iov_offset);
2105 }
2106
32960613
JA
2107 if (rw == READ) {
2108 nr = file->f_op->read(file, iovec.iov_base,
2109 iovec.iov_len, &kiocb->ki_pos);
2110 } else {
2111 nr = file->f_op->write(file, iovec.iov_base,
2112 iovec.iov_len, &kiocb->ki_pos);
2113 }
2114
311ae9e1
PB
2115 if (iov_iter_is_bvec(iter))
2116 kunmap(iter->bvec->bv_page);
2117
32960613
JA
2118 if (nr < 0) {
2119 if (!ret)
2120 ret = nr;
2121 break;
2122 }
2123 ret += nr;
2124 if (nr != iovec.iov_len)
2125 break;
2126 iov_iter_advance(iter, nr);
2127 }
2128
2129 return ret;
2130}
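Seen from userspace, that loop is essentially an emulated readv(): issue one read() per segment and stop on a short or failed transfer. A hedged analogue (hypothetical helper, not kernel code):

#include <sys/uio.h>
#include <unistd.h>
#include <errno.h>

/*
 * Emulate readv() by looping read() over the segments, mirroring the
 * error/short-transfer handling in loop_rw_iter(): stop on a partial
 * segment, and only report an error if nothing was transferred yet.
 */
static ssize_t read_segments(int fd, const struct iovec *iov, int cnt)
{
	ssize_t total = 0;
	int i;

	for (i = 0; i < cnt; i++) {
		ssize_t nr = read(fd, iov[i].iov_base, iov[i].iov_len);

		if (nr < 0) {
			if (!total)
				total = -errno;
			break;
		}
		total += nr;
		if ((size_t)nr != iov[i].iov_len)
			break;		/* short transfer, stop like the kernel loop */
	}
	return total;
}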
2131
b7bb4f7d 2132static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
f67676d1
JA
2133 struct iovec *iovec, struct iovec *fast_iov,
2134 struct iov_iter *iter)
2135{
2136 req->io->rw.nr_segs = iter->nr_segs;
2137 req->io->rw.size = io_size;
2138 req->io->rw.iov = iovec;
2139 if (!req->io->rw.iov) {
2140 req->io->rw.iov = req->io->rw.fast_iov;
2141 memcpy(req->io->rw.iov, fast_iov,
2142 sizeof(struct iovec) * iter->nr_segs);
2143 }
2144}
2145
b7bb4f7d 2146static int io_alloc_async_ctx(struct io_kiocb *req)
f67676d1 2147{
d3656344
JA
2148 if (!io_op_defs[req->opcode].async_ctx)
2149 return 0;
f67676d1 2150 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
06b76d44 2151 return req->io == NULL;
b7bb4f7d
JA
2152}
2153
2154static void io_rw_async(struct io_wq_work **workptr)
2155{
2156 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2157 struct iovec *iov = NULL;
2158
2159 if (req->io->rw.iov != req->io->rw.fast_iov)
2160 iov = req->io->rw.iov;
2161 io_wq_submit_work(workptr);
2162 kfree(iov);
2163}
2164
2165static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2166 struct iovec *iovec, struct iovec *fast_iov,
2167 struct iov_iter *iter)
2168{
980ad263 2169 if (!io_op_defs[req->opcode].async_ctx)
74566df3 2170 return 0;
5d204bcf
JA
2171 if (!req->io) {
2172 if (io_alloc_async_ctx(req))
2173 return -ENOMEM;
b7bb4f7d 2174
5d204bcf
JA
2175 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2176 }
b7bb4f7d
JA
2177 req->work.func = io_rw_async;
2178 return 0;
f67676d1
JA
2179}
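The punt depends on copying the iovec into memory owned by the request, either the inline fast_iov array or the original heap allocation. A rough standalone sketch of the io_req_map_rw() idea (toy types; assumes the inline path is only taken for small vectors, as in the kernel):

#include <string.h>
#include <sys/uio.h>

#define TOY_FAST_IOV	8	/* like UIO_FASTIOV: inline room for small vectors */

struct toy_async_rw {
	struct iovec *iov;			/* points at fast_iov or a heap copy */
	struct iovec fast_iov[TOY_FAST_IOV];
	unsigned long nr_segs;
};

/*
 * Mirror of the io_req_map_rw() idea: a NULL iovec means the segments still
 * live in the submitter's inline array, so copy them into storage owned by
 * the request; otherwise the request simply takes over the heap allocation.
 * Assumes nr_segs <= TOY_FAST_IOV whenever iovec is NULL.
 */
static void toy_map_rw(struct toy_async_rw *a, struct iovec *iovec,
		       const struct iovec *inline_vecs, unsigned long nr_segs)
{
	a->nr_segs = nr_segs;
	a->iov = iovec;
	if (!a->iov) {
		a->iov = a->fast_iov;
		memcpy(a->iov, inline_vecs, sizeof(struct iovec) * nr_segs);
	}
}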
2180
3529d8c2
JA
2181static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2182 bool force_nonblock)
f67676d1 2183{
3529d8c2
JA
2184 struct io_async_ctx *io;
2185 struct iov_iter iter;
f67676d1
JA
2186 ssize_t ret;
2187
3529d8c2
JA
2188 ret = io_prep_rw(req, sqe, force_nonblock);
2189 if (ret)
2190 return ret;
f67676d1 2191
3529d8c2
JA
2192 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2193 return -EBADF;
f67676d1 2194
3529d8c2
JA
2195 if (!req->io)
2196 return 0;
2197
2198 io = req->io;
2199 io->rw.iov = io->rw.fast_iov;
2200 req->io = NULL;
2201 ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2202 req->io = io;
2203 if (ret < 0)
2204 return ret;
2205
2206 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2207 return 0;
f67676d1
JA
2208}
2209
267bc904 2210static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2211 bool force_nonblock)
2b188cc1
JA
2212{
2213 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2214 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2215 struct iov_iter iter;
31b51510 2216 size_t iov_count;
f67676d1 2217 ssize_t io_size, ret;
2b188cc1 2218
3529d8c2 2219 ret = io_import_iovec(READ, req, &iovec, &iter);
06b76d44
JA
2220 if (ret < 0)
2221 return ret;
2b188cc1 2222
fd6c2e4c
JA
2223 /* Ensure we clear previously set non-block flag */
2224 if (!force_nonblock)
9adbd45d 2225 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2226
797f3f53 2227 req->result = 0;
f67676d1 2228 io_size = ret;
9e645e11 2229 if (req->flags & REQ_F_LINK)
f67676d1
JA
2230 req->result = io_size;
2231
2232 /*
2233 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2234 * we know to async punt it even if it was opened O_NONBLOCK
2235 */
9adbd45d 2236 if (force_nonblock && !io_file_supports_async(req->file)) {
f67676d1
JA
2237 req->flags |= REQ_F_MUST_PUNT;
2238 goto copy_iov;
2239 }
9e645e11 2240
31b51510 2241 iov_count = iov_iter_count(&iter);
9adbd45d 2242 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2b188cc1
JA
2243 if (!ret) {
2244 ssize_t ret2;
2245
9adbd45d
JA
2246 if (req->file->f_op->read_iter)
2247 ret2 = call_read_iter(req->file, kiocb, &iter);
32960613 2248 else
9adbd45d 2249 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
32960613 2250
9d93a3f5 2251 /* Catch -EAGAIN return for forced non-blocking submission */
f67676d1 2252 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2253 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2254 } else {
2255copy_iov:
b7bb4f7d 2256 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2257 inline_vecs, &iter);
2258 if (ret)
2259 goto out_free;
2260 return -EAGAIN;
2261 }
2b188cc1 2262 }
f67676d1 2263out_free:
b7bb4f7d
JA
2264 if (!io_wq_current_is_worker())
2265 kfree(iovec);
2b188cc1
JA
2266 return ret;
2267}
2268
3529d8c2
JA
2269static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2270 bool force_nonblock)
f67676d1 2271{
3529d8c2
JA
2272 struct io_async_ctx *io;
2273 struct iov_iter iter;
f67676d1
JA
2274 ssize_t ret;
2275
3529d8c2
JA
2276 ret = io_prep_rw(req, sqe, force_nonblock);
2277 if (ret)
2278 return ret;
f67676d1 2279
3529d8c2
JA
2280 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2281 return -EBADF;
f67676d1 2282
3529d8c2
JA
2283 if (!req->io)
2284 return 0;
2285
2286 io = req->io;
2287 io->rw.iov = io->rw.fast_iov;
2288 req->io = NULL;
2289 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2290 req->io = io;
2291 if (ret < 0)
2292 return ret;
2293
2294 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2295 return 0;
f67676d1
JA
2296}
2297
267bc904 2298static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
8358e3a8 2299 bool force_nonblock)
2b188cc1
JA
2300{
2301 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2302 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2303 struct iov_iter iter;
31b51510 2304 size_t iov_count;
f67676d1 2305 ssize_t ret, io_size;
2b188cc1 2306
3529d8c2 2307 ret = io_import_iovec(WRITE, req, &iovec, &iter);
06b76d44
JA
2308 if (ret < 0)
2309 return ret;
2b188cc1 2310
fd6c2e4c
JA
2311 /* Ensure we clear previously set non-block flag */
2312 if (!force_nonblock)
9adbd45d 2313 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2314
797f3f53 2315 req->result = 0;
f67676d1 2316 io_size = ret;
9e645e11 2317 if (req->flags & REQ_F_LINK)
f67676d1 2318 req->result = io_size;
9e645e11 2319
f67676d1
JA
2320 /*
2321 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2322 * we know to async punt it even if it was opened O_NONBLOCK
2323 */
2324 if (force_nonblock && !io_file_supports_async(req->file)) {
2325 req->flags |= REQ_F_MUST_PUNT;
2326 goto copy_iov;
2327 }
31b51510 2328
10d59345
JA
2329 /* the file path doesn't support NOWAIT for non-direct IO */
2330 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2331 (req->flags & REQ_F_ISREG))
f67676d1 2332 goto copy_iov;
31b51510 2333
f67676d1 2334 iov_count = iov_iter_count(&iter);
9adbd45d 2335 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2b188cc1 2336 if (!ret) {
9bf7933f
RP
2337 ssize_t ret2;
2338
2b188cc1
JA
2339 /*
2340 * Open-code file_start_write here to grab freeze protection,
2341 * which will be released by another thread in
2342 * io_complete_rw(). Fool lockdep by telling it the lock got
2343 * released so that it doesn't complain about the held lock when
2344 * we return to userspace.
2345 */
491381ce 2346 if (req->flags & REQ_F_ISREG) {
9adbd45d 2347 __sb_start_write(file_inode(req->file)->i_sb,
2b188cc1 2348 SB_FREEZE_WRITE, true);
9adbd45d 2349 __sb_writers_release(file_inode(req->file)->i_sb,
2b188cc1
JA
2350 SB_FREEZE_WRITE);
2351 }
2352 kiocb->ki_flags |= IOCB_WRITE;
9bf7933f 2353
9adbd45d
JA
2354 if (req->file->f_op->write_iter)
2355 ret2 = call_write_iter(req->file, kiocb, &iter);
32960613 2356 else
9adbd45d 2357 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
f67676d1 2358 if (!force_nonblock || ret2 != -EAGAIN) {
cf6fd4bd 2359 kiocb_done(kiocb, ret2, nxt, req->in_async);
f67676d1
JA
2360 } else {
2361copy_iov:
b7bb4f7d 2362 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2363 inline_vecs, &iter);
2364 if (ret)
2365 goto out_free;
2366 return -EAGAIN;
2367 }
2b188cc1 2368 }
31b51510 2369out_free:
b7bb4f7d
JA
2370 if (!io_wq_current_is_worker())
2371 kfree(iovec);
2b188cc1
JA
2372 return ret;
2373}
2374
2375/*
2376 * IORING_OP_NOP just posts a completion event, nothing else.
2377 */
78e19bbe 2378static int io_nop(struct io_kiocb *req)
2b188cc1
JA
2379{
2380 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2381
def596e9
JA
2382 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2383 return -EINVAL;
2384
78e19bbe 2385 io_cqring_add_event(req, 0);
e65ef56d 2386 io_put_req(req);
2b188cc1
JA
2387 return 0;
2388}
2389
3529d8c2 2390static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 2391{
6b06314c 2392 struct io_ring_ctx *ctx = req->ctx;
c992fe29 2393
09bb8394
JA
2394 if (!req->file)
2395 return -EBADF;
c992fe29 2396
6b06314c 2397 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 2398 return -EINVAL;
edafccee 2399 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
2400 return -EINVAL;
2401
8ed8d3c3
JA
2402 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2403 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2404 return -EINVAL;
2405
2406 req->sync.off = READ_ONCE(sqe->off);
2407 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
2408 return 0;
2409}
2410
8ed8d3c3
JA
2411static bool io_req_cancelled(struct io_kiocb *req)
2412{
2413 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2414 req_set_fail_links(req);
2415 io_cqring_add_event(req, -ECANCELED);
2416 io_put_req(req);
2417 return true;
2418 }
2419
2420 return false;
2421}
2422
78912934
JA
2423static void io_link_work_cb(struct io_wq_work **workptr)
2424{
2425 struct io_wq_work *work = *workptr;
2426 struct io_kiocb *link = work->data;
2427
2428 io_queue_linked_timeout(link);
2429 work->func = io_wq_submit_work;
2430}
2431
2432static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
2433{
2434 struct io_kiocb *link;
2435
2436 io_prep_async_work(nxt, &link);
2437 *workptr = &nxt->work;
2438 if (link) {
2439 nxt->work.flags |= IO_WQ_WORK_CB;
2440 nxt->work.func = io_link_work_cb;
2441 nxt->work.data = link;
2442 }
2443}
2444
8ed8d3c3
JA
2445static void io_fsync_finish(struct io_wq_work **workptr)
2446{
2447 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2448 loff_t end = req->sync.off + req->sync.len;
2449 struct io_kiocb *nxt = NULL;
2450 int ret;
2451
2452 if (io_req_cancelled(req))
2453 return;
2454
9adbd45d 2455 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
2456 end > 0 ? end : LLONG_MAX,
2457 req->sync.flags & IORING_FSYNC_DATASYNC);
2458 if (ret < 0)
2459 req_set_fail_links(req);
2460 io_cqring_add_event(req, ret);
2461 io_put_req_find_next(req, &nxt);
2462 if (nxt)
78912934 2463 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
2464}
2465
fc4df999
JA
2466static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2467 bool force_nonblock)
c992fe29 2468{
8ed8d3c3 2469 struct io_wq_work *work, *old_work;
c992fe29
CH
2470
2471 /* fsync always requires a blocking context */
8ed8d3c3
JA
2472 if (force_nonblock) {
2473 io_put_req(req);
2474 req->work.func = io_fsync_finish;
c992fe29 2475 return -EAGAIN;
8ed8d3c3 2476 }
c992fe29 2477
8ed8d3c3
JA
2478 work = old_work = &req->work;
2479 io_fsync_finish(&work);
2480 if (work && work != old_work)
2481 *nxt = container_of(work, struct io_kiocb, work);
c992fe29
CH
2482 return 0;
2483}
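Several opcodes in this file follow the same shape: under force_nonblock, stash a completion callback and return -EAGAIN so a worker runs the blocking part; otherwise run it inline and hand any follow-up work back to the caller. A stripped-down sketch of that control flow (hypothetical types, outside the kernel):

#include <errno.h>
#include <stdio.h>

struct work;
typedef void (*work_fn)(struct work **);

struct work {
	work_fn func;		/* what a worker thread would run later */
};

static void blocking_finish(struct work **workptr)
{
	(void)workptr;
	/* stands in for io_fsync_finish(): the part that may block */
	printf("running blocking completion\n");
}

/* mirrors io_fsync(): punt when non-blocking, otherwise run inline */
static int do_op(struct work *w, int force_nonblock)
{
	struct work *old = w;

	if (force_nonblock) {
		w->func = blocking_finish;	/* a worker will call this */
		return -EAGAIN;
	}

	blocking_finish(&w);
	if (w && w != old)
		printf("completion handed back a follow-up work item\n");
	return 0;
}

int main(void)
{
	struct work w = { 0 };

	do_op(&w, 1);	/* punted: w.func is now set, caller sees -EAGAIN */
	do_op(&w, 0);	/* executed inline */
	return 0;
}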
2484
d63d1b5e 2485static void io_fallocate_finish(struct io_wq_work **workptr)
8ed8d3c3
JA
2486{
2487 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2488 struct io_kiocb *nxt = NULL;
2489 int ret;
2490
d63d1b5e
JA
2491 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2492 req->sync.len);
8ed8d3c3
JA
2493 if (ret < 0)
2494 req_set_fail_links(req);
2495 io_cqring_add_event(req, ret);
2496 io_put_req_find_next(req, &nxt);
2497 if (nxt)
78912934 2498 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
2499}
2500
d63d1b5e
JA
2501static int io_fallocate_prep(struct io_kiocb *req,
2502 const struct io_uring_sqe *sqe)
2503{
2504 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2505 return -EINVAL;
2506
2507 req->sync.off = READ_ONCE(sqe->off);
2508 req->sync.len = READ_ONCE(sqe->addr);
2509 req->sync.mode = READ_ONCE(sqe->len);
2510 return 0;
2511}
2512
2513static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
2514 bool force_nonblock)
5d17b4a4 2515{
8ed8d3c3 2516 struct io_wq_work *work, *old_work;
5d17b4a4 2517
d63d1b5e 2518 /* fallocate always requires a blocking context */
8ed8d3c3
JA
2519 if (force_nonblock) {
2520 io_put_req(req);
d63d1b5e 2521 req->work.func = io_fallocate_finish;
5d17b4a4 2522 return -EAGAIN;
8ed8d3c3 2523 }
5d17b4a4 2524
8ed8d3c3 2525 work = old_work = &req->work;
d63d1b5e 2526 io_fallocate_finish(&work);
8ed8d3c3
JA
2527 if (work && work != old_work)
2528 *nxt = container_of(work, struct io_kiocb, work);
d63d1b5e 2529
5d17b4a4
JA
2530 return 0;
2531}
2532
15b71abe 2533static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 2534{
f8748881 2535 const char __user *fname;
15b71abe 2536 int ret;
b7bb4f7d 2537
15b71abe
JA
2538 if (sqe->ioprio || sqe->buf_index)
2539 return -EINVAL;
03b1230c 2540
15b71abe 2541 req->open.dfd = READ_ONCE(sqe->fd);
c12cedf2 2542 req->open.how.mode = READ_ONCE(sqe->len);
f8748881 2543 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
c12cedf2 2544 req->open.how.flags = READ_ONCE(sqe->open_flags);
3529d8c2 2545
f8748881 2546 req->open.filename = getname(fname);
15b71abe
JA
2547 if (IS_ERR(req->open.filename)) {
2548 ret = PTR_ERR(req->open.filename);
2549 req->open.filename = NULL;
2550 return ret;
2551 }
3529d8c2 2552
15b71abe 2553 return 0;
03b1230c
JA
2554}
2555
cebdb986 2556static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 2557{
cebdb986
JA
2558 struct open_how __user *how;
2559 const char __user *fname;
2560 size_t len;
0fa03c62
JA
2561 int ret;
2562
cebdb986 2563 if (sqe->ioprio || sqe->buf_index)
0fa03c62
JA
2564 return -EINVAL;
2565
cebdb986
JA
2566 req->open.dfd = READ_ONCE(sqe->fd);
2567 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2568 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2569 len = READ_ONCE(sqe->len);
0fa03c62 2570
cebdb986
JA
2571 if (len < OPEN_HOW_SIZE_VER0)
2572 return -EINVAL;
3529d8c2 2573
cebdb986
JA
2574 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2575 len);
2576 if (ret)
2577 return ret;
3529d8c2 2578
cebdb986
JA
2579 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2580 req->open.how.flags |= O_LARGEFILE;
0fa03c62 2581
cebdb986
JA
2582 req->open.filename = getname(fname);
2583 if (IS_ERR(req->open.filename)) {
2584 ret = PTR_ERR(req->open.filename);
2585 req->open.filename = NULL;
2586 return ret;
2587 }
2588
2589 return 0;
2590}
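The copy_struct_from_user() call above implements the usual extensible-struct convention: accept any size at or above the oldest version and zero-fill fields the caller did not supply. A userspace stand-in with a made-up struct layout (memcpy in place of a user-memory copy):

#include <errno.h>
#include <stdint.h>
#include <string.h>

/* made-up layout; the real struct open_how is defined in the uapi headers */
struct toy_how {
	uint64_t flags;
	uint64_t mode;
	uint64_t resolve;	/* imagine this field arrived in a later version */
};

#define TOY_HOW_SIZE_V0	16	/* size of the hypothetical two-field original */

/*
 * Stand-in for the copy_struct_from_user() convention: accept any size at
 * or above the oldest version, copy what the caller provided, and zero-fill
 * fields an older caller could not have known about. (The real helper also
 * rejects non-zero bytes past the kernel's struct size.)
 */
static int toy_copy_versioned(struct toy_how *dst, const void *src, size_t usize)
{
	if (usize < TOY_HOW_SIZE_V0)
		return -EINVAL;
	memset(dst, 0, sizeof(*dst));
	memcpy(dst, src, usize < sizeof(*dst) ? usize : sizeof(*dst));
	return 0;
}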
2591
2592static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
2593 bool force_nonblock)
15b71abe
JA
2594{
2595 struct open_flags op;
15b71abe
JA
2596 struct file *file;
2597 int ret;
2598
f86cd20c 2599 if (force_nonblock)
15b71abe 2600 return -EAGAIN;
15b71abe 2601
cebdb986 2602 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
2603 if (ret)
2604 goto err;
2605
cebdb986 2606 ret = get_unused_fd_flags(req->open.how.flags);
15b71abe
JA
2607 if (ret < 0)
2608 goto err;
2609
2610 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2611 if (IS_ERR(file)) {
2612 put_unused_fd(ret);
2613 ret = PTR_ERR(file);
2614 } else {
2615 fsnotify_open(file);
2616 fd_install(ret, file);
2617 }
2618err:
2619 putname(req->open.filename);
2620 if (ret < 0)
2621 req_set_fail_links(req);
2622 io_cqring_add_event(req, ret);
2623 io_put_req_find_next(req, nxt);
2624 return 0;
2625}
2626
cebdb986
JA
2627static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
2628 bool force_nonblock)
2629{
2630 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
2631 return io_openat2(req, nxt, force_nonblock);
2632}
2633
3e4827b0
JA
2634static int io_epoll_ctl_prep(struct io_kiocb *req,
2635 const struct io_uring_sqe *sqe)
2636{
2637#if defined(CONFIG_EPOLL)
2638 if (sqe->ioprio || sqe->buf_index)
2639 return -EINVAL;
2640
2641 req->epoll.epfd = READ_ONCE(sqe->fd);
2642 req->epoll.op = READ_ONCE(sqe->len);
2643 req->epoll.fd = READ_ONCE(sqe->off);
2644
2645 if (ep_op_has_event(req->epoll.op)) {
2646 struct epoll_event __user *ev;
2647
2648 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
2649 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
2650 return -EFAULT;
2651 }
2652
2653 return 0;
2654#else
2655 return -EOPNOTSUPP;
2656#endif
2657}
2658
2659static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
2660 bool force_nonblock)
2661{
2662#if defined(CONFIG_EPOLL)
2663 struct io_epoll *ie = &req->epoll;
2664 int ret;
2665
2666 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
2667 if (force_nonblock && ret == -EAGAIN)
2668 return -EAGAIN;
2669
2670 if (ret < 0)
2671 req_set_fail_links(req);
2672 io_cqring_add_event(req, ret);
2673 io_put_req_find_next(req, nxt);
2674 return 0;
2675#else
2676 return -EOPNOTSUPP;
2677#endif
2678}
2679
c1ca757b
JA
2680static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2681{
2682#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2683 if (sqe->ioprio || sqe->buf_index || sqe->off)
2684 return -EINVAL;
2685
2686 req->madvise.addr = READ_ONCE(sqe->addr);
2687 req->madvise.len = READ_ONCE(sqe->len);
2688 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2689 return 0;
2690#else
2691 return -EOPNOTSUPP;
2692#endif
2693}
2694
2695static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
2696 bool force_nonblock)
2697{
2698#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2699 struct io_madvise *ma = &req->madvise;
2700 int ret;
2701
2702 if (force_nonblock)
2703 return -EAGAIN;
2704
2705 ret = do_madvise(ma->addr, ma->len, ma->advice);
2706 if (ret < 0)
2707 req_set_fail_links(req);
2708 io_cqring_add_event(req, ret);
2709 io_put_req_find_next(req, nxt);
2710 return 0;
2711#else
2712 return -EOPNOTSUPP;
2713#endif
2714}
2715
4840e418
JA
2716static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2717{
2718 if (sqe->ioprio || sqe->buf_index || sqe->addr)
2719 return -EINVAL;
2720
2721 req->fadvise.offset = READ_ONCE(sqe->off);
2722 req->fadvise.len = READ_ONCE(sqe->len);
2723 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
2724 return 0;
2725}
2726
2727static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
2728 bool force_nonblock)
2729{
2730 struct io_fadvise *fa = &req->fadvise;
2731 int ret;
2732
2733 /* DONTNEED may block, others _should_ not */
2734 if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
2735 return -EAGAIN;
2736
2737 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
2738 if (ret < 0)
2739 req_set_fail_links(req);
2740 io_cqring_add_event(req, ret);
2741 io_put_req_find_next(req, nxt);
2742 return 0;
2743}
2744
eddc7ef5
JA
2745static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2746{
f8748881 2747 const char __user *fname;
eddc7ef5
JA
2748 unsigned lookup_flags;
2749 int ret;
2750
2751 if (sqe->ioprio || sqe->buf_index)
2752 return -EINVAL;
2753
2754 req->open.dfd = READ_ONCE(sqe->fd);
2755 req->open.mask = READ_ONCE(sqe->len);
f8748881 2756 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
eddc7ef5 2757 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
c12cedf2 2758 req->open.how.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 2759
c12cedf2 2760 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
eddc7ef5
JA
2761 return -EINVAL;
2762
f8748881 2763 req->open.filename = getname_flags(fname, lookup_flags, NULL);
eddc7ef5
JA
2764 if (IS_ERR(req->open.filename)) {
2765 ret = PTR_ERR(req->open.filename);
2766 req->open.filename = NULL;
2767 return ret;
2768 }
2769
2770 return 0;
2771}
2772
2773static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
2774 bool force_nonblock)
2775{
2776 struct io_open *ctx = &req->open;
2777 unsigned lookup_flags;
2778 struct path path;
2779 struct kstat stat;
2780 int ret;
2781
2782 if (force_nonblock)
2783 return -EAGAIN;
2784
c12cedf2 2785 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
eddc7ef5
JA
2786 return -EINVAL;
2787
2788retry:
2789 /* filename_lookup() drops it, keep a reference */
2790 ctx->filename->refcnt++;
2791
2792 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
2793 NULL);
2794 if (ret)
2795 goto err;
2796
c12cedf2 2797 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
eddc7ef5
JA
2798 path_put(&path);
2799 if (retry_estale(ret, lookup_flags)) {
2800 lookup_flags |= LOOKUP_REVAL;
2801 goto retry;
2802 }
2803 if (!ret)
2804 ret = cp_statx(&stat, ctx->buffer);
2805err:
2806 putname(ctx->filename);
2807 if (ret < 0)
2808 req_set_fail_links(req);
2809 io_cqring_add_event(req, ret);
2810 io_put_req_find_next(req, nxt);
2811 return 0;
2812}
2813
b5dba59e
JA
2814static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2815{
2816 /*
2817 * If we queue this for async, it must not be cancellable. That would
2818 * leave the 'file' in an indeterminate state.
2819 */
2820 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
2821
2822 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
2823 sqe->rw_flags || sqe->buf_index)
2824 return -EINVAL;
2825 if (sqe->flags & IOSQE_FIXED_FILE)
2826 return -EINVAL;
2827
2828 req->close.fd = READ_ONCE(sqe->fd);
2829 if (req->file->f_op == &io_uring_fops ||
b14cca0c 2830 req->close.fd == req->ctx->ring_fd)
b5dba59e
JA
2831 return -EBADF;
2832
2833 return 0;
2834}
2835
2836static void io_close_finish(struct io_wq_work **workptr)
2837{
2838 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2839 struct io_kiocb *nxt = NULL;
2840
2841 /* Invoked with files, we need to do the close */
2842 if (req->work.files) {
2843 int ret;
2844
2845 ret = filp_close(req->close.put_file, req->work.files);
2846 if (ret < 0) {
2847 req_set_fail_links(req);
2848 }
2849 io_cqring_add_event(req, ret);
2850 }
2851
2852 fput(req->close.put_file);
2853
2854 /* we bypassed the re-issue, drop the submission reference */
2855 io_put_req(req);
2856 io_put_req_find_next(req, &nxt);
2857 if (nxt)
2858 io_wq_assign_next(workptr, nxt);
2859}
2860
2861static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
2862 bool force_nonblock)
2863{
2864 int ret;
2865
2866 req->close.put_file = NULL;
2867 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
2868 if (ret < 0)
2869 return ret;
2870
2871 /* if the file has a flush method, be safe and punt to async */
f86cd20c 2872 if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
b5dba59e 2873 goto eagain;
b5dba59e
JA
2874
2875 /*
2876 * No ->flush(), safely close from here and just punt the
2877 * fput() to async context.
2878 */
2879 ret = filp_close(req->close.put_file, current->files);
2880
2881 if (ret < 0)
2882 req_set_fail_links(req);
2883 io_cqring_add_event(req, ret);
2884
2885 if (io_wq_current_is_worker()) {
2886 struct io_wq_work *old_work, *work;
2887
2888 old_work = work = &req->work;
2889 io_close_finish(&work);
2890 if (work && work != old_work)
2891 *nxt = container_of(work, struct io_kiocb, work);
2892 return 0;
2893 }
2894
2895eagain:
2896 req->work.func = io_close_finish;
2897 return -EAGAIN;
2898}
2899
3529d8c2 2900static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
2901{
2902 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4
JA
2903
2904 if (!req->file)
2905 return -EBADF;
5d17b4a4
JA
2906
2907 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2908 return -EINVAL;
2909 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2910 return -EINVAL;
2911
8ed8d3c3
JA
2912 req->sync.off = READ_ONCE(sqe->off);
2913 req->sync.len = READ_ONCE(sqe->len);
2914 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
2915 return 0;
2916}
2917
2918static void io_sync_file_range_finish(struct io_wq_work **workptr)
2919{
2920 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2921 struct io_kiocb *nxt = NULL;
2922 int ret;
2923
2924 if (io_req_cancelled(req))
2925 return;
2926
9adbd45d 2927 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
2928 req->sync.flags);
2929 if (ret < 0)
2930 req_set_fail_links(req);
2931 io_cqring_add_event(req, ret);
2932 io_put_req_find_next(req, &nxt);
2933 if (nxt)
78912934 2934 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
2935}
2936
fc4df999 2937static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
5d17b4a4
JA
2938 bool force_nonblock)
2939{
8ed8d3c3 2940 struct io_wq_work *work, *old_work;
5d17b4a4
JA
2941
2942 /* sync_file_range always requires a blocking context */
8ed8d3c3
JA
2943 if (force_nonblock) {
2944 io_put_req(req);
2945 req->work.func = io_sync_file_range_finish;
5d17b4a4 2946 return -EAGAIN;
8ed8d3c3 2947 }
5d17b4a4 2948
8ed8d3c3
JA
2949 work = old_work = &req->work;
2950 io_sync_file_range_finish(&work);
2951 if (work && work != old_work)
2952 *nxt = container_of(work, struct io_kiocb, work);
5d17b4a4
JA
2953 return 0;
2954}
2955
b7bb4f7d
JA
2956#if defined(CONFIG_NET)
2957static void io_sendrecv_async(struct io_wq_work **workptr)
2958{
2959 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2960 struct iovec *iov = NULL;
2961
2962 if (req->io->rw.iov != req->io->rw.fast_iov)
2963 iov = req->io->msg.iov;
2964 io_wq_submit_work(workptr);
2965 kfree(iov);
2966}
2967#endif
2968
3529d8c2 2969static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 2970{
0fa03c62 2971#if defined(CONFIG_NET)
e47293fd 2972 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 2973 struct io_async_ctx *io = req->io;
03b1230c 2974
e47293fd
JA
2975 sr->msg_flags = READ_ONCE(sqe->msg_flags);
2976 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 2977 sr->len = READ_ONCE(sqe->len);
3529d8c2 2978
fddaface 2979 if (!io || req->opcode == IORING_OP_SEND)
3529d8c2
JA
2980 return 0;
2981
d9688565 2982 io->msg.iov = io->msg.fast_iov;
3529d8c2 2983 return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 2984 &io->msg.iov);
03b1230c 2985#else
e47293fd 2986 return -EOPNOTSUPP;
03b1230c
JA
2987#endif
2988}
2989
fc4df999
JA
2990static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2991 bool force_nonblock)
aa1fa28f 2992{
03b1230c 2993#if defined(CONFIG_NET)
0b416c3e 2994 struct io_async_msghdr *kmsg = NULL;
0fa03c62
JA
2995 struct socket *sock;
2996 int ret;
2997
2998 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2999 return -EINVAL;
3000
3001 sock = sock_from_file(req->file, &ret);
3002 if (sock) {
b7bb4f7d 3003 struct io_async_ctx io;
03b1230c 3004 struct sockaddr_storage addr;
0fa03c62
JA
3005 unsigned flags;
3006
03b1230c 3007 if (req->io) {
0b416c3e
JA
3008 kmsg = &req->io->msg;
3009 kmsg->msg.msg_name = &addr;
3010 /* if iov is set, it's allocated already */
3011 if (!kmsg->iov)
3012 kmsg->iov = kmsg->fast_iov;
3013 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3014 } else {
3529d8c2
JA
3015 struct io_sr_msg *sr = &req->sr_msg;
3016
0b416c3e
JA
3017 kmsg = &io.msg;
3018 kmsg->msg.msg_name = &addr;
3529d8c2
JA
3019
3020 io.msg.iov = io.msg.fast_iov;
3021 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3022 sr->msg_flags, &io.msg.iov);
03b1230c 3023 if (ret)
3529d8c2 3024 return ret;
03b1230c 3025 }
0fa03c62 3026
e47293fd
JA
3027 flags = req->sr_msg.msg_flags;
3028 if (flags & MSG_DONTWAIT)
3029 req->flags |= REQ_F_NOWAIT;
3030 else if (force_nonblock)
3031 flags |= MSG_DONTWAIT;
3032
0b416c3e 3033 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
03b1230c 3034 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
3035 if (req->io)
3036 return -EAGAIN;
3037 if (io_alloc_async_ctx(req))
3038 return -ENOMEM;
3039 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
3040 req->work.func = io_sendrecv_async;
0b416c3e 3041 return -EAGAIN;
03b1230c 3042 }
441cdbd5
JA
3043 if (ret == -ERESTARTSYS)
3044 ret = -EINTR;
0fa03c62
JA
3045 }
3046
b7bb4f7d 3047 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3048 kfree(kmsg->iov);
78e19bbe 3049 io_cqring_add_event(req, ret);
4e88d6e7
JA
3050 if (ret < 0)
3051 req_set_fail_links(req);
ec9c02ad 3052 io_put_req_find_next(req, nxt);
5d17b4a4 3053 return 0;
03b1230c
JA
3054#else
3055 return -EOPNOTSUPP;
aa1fa28f 3056#endif
03b1230c 3057}
aa1fa28f 3058
fddaface
JA
3059static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
3060 bool force_nonblock)
3061{
3062#if defined(CONFIG_NET)
3063 struct socket *sock;
3064 int ret;
3065
3066 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3067 return -EINVAL;
3068
3069 sock = sock_from_file(req->file, &ret);
3070 if (sock) {
3071 struct io_sr_msg *sr = &req->sr_msg;
3072 struct msghdr msg;
3073 struct iovec iov;
3074 unsigned flags;
3075
3076 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3077 &msg.msg_iter);
3078 if (ret)
3079 return ret;
3080
3081 msg.msg_name = NULL;
3082 msg.msg_control = NULL;
3083 msg.msg_controllen = 0;
3084 msg.msg_namelen = 0;
3085
3086 flags = req->sr_msg.msg_flags;
3087 if (flags & MSG_DONTWAIT)
3088 req->flags |= REQ_F_NOWAIT;
3089 else if (force_nonblock)
3090 flags |= MSG_DONTWAIT;
3091
0b7b21e4
JA
3092 msg.msg_flags = flags;
3093 ret = sock_sendmsg(sock, &msg);
fddaface
JA
3094 if (force_nonblock && ret == -EAGAIN)
3095 return -EAGAIN;
3096 if (ret == -ERESTARTSYS)
3097 ret = -EINTR;
3098 }
3099
3100 io_cqring_add_event(req, ret);
3101 if (ret < 0)
3102 req_set_fail_links(req);
3103 io_put_req_find_next(req, nxt);
3104 return 0;
3105#else
3106 return -EOPNOTSUPP;
3107#endif
3108}
3109
3529d8c2
JA
3110static int io_recvmsg_prep(struct io_kiocb *req,
3111 const struct io_uring_sqe *sqe)
aa1fa28f
JA
3112{
3113#if defined(CONFIG_NET)
e47293fd 3114 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2
JA
3115 struct io_async_ctx *io = req->io;
3116
3117 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3118 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 3119 sr->len = READ_ONCE(sqe->len);
06b76d44 3120
fddaface 3121 if (!io || req->opcode == IORING_OP_RECV)
06b76d44 3122 return 0;
03b1230c 3123
d9688565 3124 io->msg.iov = io->msg.fast_iov;
3529d8c2 3125 return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3126 &io->msg.uaddr, &io->msg.iov);
aa1fa28f 3127#else
e47293fd 3128 return -EOPNOTSUPP;
aa1fa28f
JA
3129#endif
3130}
3131
fc4df999
JA
3132static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
3133 bool force_nonblock)
aa1fa28f
JA
3134{
3135#if defined(CONFIG_NET)
0b416c3e 3136 struct io_async_msghdr *kmsg = NULL;
03b1230c
JA
3137 struct socket *sock;
3138 int ret;
3139
3140 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3141 return -EINVAL;
3142
3143 sock = sock_from_file(req->file, &ret);
3144 if (sock) {
b7bb4f7d 3145 struct io_async_ctx io;
03b1230c 3146 struct sockaddr_storage addr;
03b1230c
JA
3147 unsigned flags;
3148
03b1230c 3149 if (req->io) {
0b416c3e
JA
3150 kmsg = &req->io->msg;
3151 kmsg->msg.msg_name = &addr;
3152 /* if iov is set, it's allocated already */
3153 if (!kmsg->iov)
3154 kmsg->iov = kmsg->fast_iov;
3155 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3156 } else {
3529d8c2
JA
3157 struct io_sr_msg *sr = &req->sr_msg;
3158
0b416c3e
JA
3159 kmsg = &io.msg;
3160 kmsg->msg.msg_name = &addr;
3529d8c2
JA
3161
3162 io.msg.iov = io.msg.fast_iov;
3163 ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
3164 sr->msg_flags, &io.msg.uaddr,
3165 &io.msg.iov);
03b1230c 3166 if (ret)
3529d8c2 3167 return ret;
03b1230c
JA
3168 }
3169
e47293fd
JA
3170 flags = req->sr_msg.msg_flags;
3171 if (flags & MSG_DONTWAIT)
3172 req->flags |= REQ_F_NOWAIT;
3173 else if (force_nonblock)
3174 flags |= MSG_DONTWAIT;
3175
3176 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3177 kmsg->uaddr, flags);
03b1230c 3178 if (force_nonblock && ret == -EAGAIN) {
b7bb4f7d
JA
3179 if (req->io)
3180 return -EAGAIN;
3181 if (io_alloc_async_ctx(req))
3182 return -ENOMEM;
3183 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
3184 req->work.func = io_sendrecv_async;
0b416c3e 3185 return -EAGAIN;
03b1230c
JA
3186 }
3187 if (ret == -ERESTARTSYS)
3188 ret = -EINTR;
3189 }
3190
b7bb4f7d 3191 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3192 kfree(kmsg->iov);
03b1230c 3193 io_cqring_add_event(req, ret);
4e88d6e7
JA
3194 if (ret < 0)
3195 req_set_fail_links(req);
03b1230c
JA
3196 io_put_req_find_next(req, nxt);
3197 return 0;
0fa03c62
JA
3198#else
3199 return -EOPNOTSUPP;
3200#endif
3201}
5d17b4a4 3202
fddaface
JA
3203static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
3204 bool force_nonblock)
3205{
3206#if defined(CONFIG_NET)
3207 struct socket *sock;
3208 int ret;
3209
3210 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3211 return -EINVAL;
3212
3213 sock = sock_from_file(req->file, &ret);
3214 if (sock) {
3215 struct io_sr_msg *sr = &req->sr_msg;
3216 struct msghdr msg;
3217 struct iovec iov;
3218 unsigned flags;
3219
3220 ret = import_single_range(READ, sr->buf, sr->len, &iov,
3221 &msg.msg_iter);
3222 if (ret)
3223 return ret;
3224
3225 msg.msg_name = NULL;
3226 msg.msg_control = NULL;
3227 msg.msg_controllen = 0;
3228 msg.msg_namelen = 0;
3229 msg.msg_iocb = NULL;
3230 msg.msg_flags = 0;
3231
3232 flags = req->sr_msg.msg_flags;
3233 if (flags & MSG_DONTWAIT)
3234 req->flags |= REQ_F_NOWAIT;
3235 else if (force_nonblock)
3236 flags |= MSG_DONTWAIT;
3237
0b7b21e4 3238 ret = sock_recvmsg(sock, &msg, flags);
fddaface
JA
3239 if (force_nonblock && ret == -EAGAIN)
3240 return -EAGAIN;
3241 if (ret == -ERESTARTSYS)
3242 ret = -EINTR;
3243 }
3244
3245 io_cqring_add_event(req, ret);
3246 if (ret < 0)
3247 req_set_fail_links(req);
3248 io_put_req_find_next(req, nxt);
3249 return 0;
3250#else
3251 return -EOPNOTSUPP;
3252#endif
3253}
3254
3255
3529d8c2 3256static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35
JA
3257{
3258#if defined(CONFIG_NET)
8ed8d3c3
JA
3259 struct io_accept *accept = &req->accept;
3260
17f2fe35
JA
3261 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3262 return -EINVAL;
8042d6ce 3263 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
3264 return -EINVAL;
3265
d55e5f5b
JA
3266 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3267 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 3268 accept->flags = READ_ONCE(sqe->accept_flags);
8ed8d3c3
JA
3269 return 0;
3270#else
3271 return -EOPNOTSUPP;
3272#endif
3273}
17f2fe35 3274
8ed8d3c3
JA
3275#if defined(CONFIG_NET)
3276static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3277 bool force_nonblock)
3278{
3279 struct io_accept *accept = &req->accept;
3280 unsigned file_flags;
3281 int ret;
3282
3283 file_flags = force_nonblock ? O_NONBLOCK : 0;
3284 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3285 accept->addr_len, accept->flags);
3286 if (ret == -EAGAIN && force_nonblock)
17f2fe35 3287 return -EAGAIN;
8e3cca12
JA
3288 if (ret == -ERESTARTSYS)
3289 ret = -EINTR;
4e88d6e7
JA
3290 if (ret < 0)
3291 req_set_fail_links(req);
78e19bbe 3292 io_cqring_add_event(req, ret);
ec9c02ad 3293 io_put_req_find_next(req, nxt);
17f2fe35 3294 return 0;
8ed8d3c3
JA
3295}
3296
3297static void io_accept_finish(struct io_wq_work **workptr)
3298{
3299 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3300 struct io_kiocb *nxt = NULL;
3301
3302 if (io_req_cancelled(req))
3303 return;
3304 __io_accept(req, &nxt, false);
3305 if (nxt)
78912934 3306 io_wq_assign_next(workptr, nxt);
8ed8d3c3
JA
3307}
3308#endif
3309
3310static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
3311 bool force_nonblock)
3312{
3313#if defined(CONFIG_NET)
3314 int ret;
3315
8ed8d3c3
JA
3316 ret = __io_accept(req, nxt, force_nonblock);
3317 if (ret == -EAGAIN && force_nonblock) {
3318 req->work.func = io_accept_finish;
8ed8d3c3
JA
3319 io_put_req(req);
3320 return -EAGAIN;
3321 }
3322 return 0;
0fa03c62
JA
3323#else
3324 return -EOPNOTSUPP;
3325#endif
3326}
5d17b4a4 3327
3529d8c2 3328static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021
JA
3329{
3330#if defined(CONFIG_NET)
3529d8c2
JA
3331 struct io_connect *conn = &req->connect;
3332 struct io_async_ctx *io = req->io;
f499a021 3333
3fbb51c1
JA
3334 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3335 return -EINVAL;
3336 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3337 return -EINVAL;
3338
3529d8c2
JA
3339 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3340 conn->addr_len = READ_ONCE(sqe->addr2);
3341
3342 if (!io)
3343 return 0;
3344
3345 return move_addr_to_kernel(conn->addr, conn->addr_len,
3fbb51c1 3346 &io->connect.address);
f499a021 3347#else
3fbb51c1 3348 return -EOPNOTSUPP;
f499a021
JA
3349#endif
3350}
3351
fc4df999
JA
3352static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
3353 bool force_nonblock)
f8e85cf2
JA
3354{
3355#if defined(CONFIG_NET)
f499a021 3356 struct io_async_ctx __io, *io;
f8e85cf2 3357 unsigned file_flags;
3fbb51c1 3358 int ret;
f8e85cf2 3359
f499a021
JA
3360 if (req->io) {
3361 io = req->io;
3362 } else {
3529d8c2
JA
3363 ret = move_addr_to_kernel(req->connect.addr,
3364 req->connect.addr_len,
3365 &__io.connect.address);
f499a021
JA
3366 if (ret)
3367 goto out;
3368 io = &__io;
3369 }
3370
3fbb51c1
JA
3371 file_flags = force_nonblock ? O_NONBLOCK : 0;
3372
3373 ret = __sys_connect_file(req->file, &io->connect.address,
3374 req->connect.addr_len, file_flags);
87f80d62 3375 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
b7bb4f7d
JA
3376 if (req->io)
3377 return -EAGAIN;
3378 if (io_alloc_async_ctx(req)) {
f499a021
JA
3379 ret = -ENOMEM;
3380 goto out;
3381 }
b7bb4f7d 3382 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
f8e85cf2 3383 return -EAGAIN;
f499a021 3384 }
f8e85cf2
JA
3385 if (ret == -ERESTARTSYS)
3386 ret = -EINTR;
f499a021 3387out:
4e88d6e7
JA
3388 if (ret < 0)
3389 req_set_fail_links(req);
f8e85cf2
JA
3390 io_cqring_add_event(req, ret);
3391 io_put_req_find_next(req, nxt);
3392 return 0;
3393#else
3394 return -EOPNOTSUPP;
3395#endif
3396}
3397
221c5eb2
JA
3398static void io_poll_remove_one(struct io_kiocb *req)
3399{
3400 struct io_poll_iocb *poll = &req->poll;
3401
3402 spin_lock(&poll->head->lock);
3403 WRITE_ONCE(poll->canceled, true);
392edb45
JA
3404 if (!list_empty(&poll->wait.entry)) {
3405 list_del_init(&poll->wait.entry);
a197f664 3406 io_queue_async_work(req);
221c5eb2
JA
3407 }
3408 spin_unlock(&poll->head->lock);
78076bb6 3409 hash_del(&req->hash_node);
221c5eb2
JA
3410}
3411
3412static void io_poll_remove_all(struct io_ring_ctx *ctx)
3413{
78076bb6 3414 struct hlist_node *tmp;
221c5eb2 3415 struct io_kiocb *req;
78076bb6 3416 int i;
221c5eb2
JA
3417
3418 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
3419 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3420 struct hlist_head *list;
3421
3422 list = &ctx->cancel_hash[i];
3423 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3424 io_poll_remove_one(req);
221c5eb2
JA
3425 }
3426 spin_unlock_irq(&ctx->completion_lock);
3427}
3428
47f46768
JA
3429static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3430{
78076bb6 3431 struct hlist_head *list;
47f46768
JA
3432 struct io_kiocb *req;
3433
78076bb6
JA
3434 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3435 hlist_for_each_entry(req, list, hash_node) {
3436 if (sqe_addr == req->user_data) {
eac406c6
JA
3437 io_poll_remove_one(req);
3438 return 0;
3439 }
47f46768
JA
3440 }
3441
3442 return -ENOENT;
3443}
3444
3529d8c2
JA
3445static int io_poll_remove_prep(struct io_kiocb *req,
3446 const struct io_uring_sqe *sqe)
0969e783 3447{
0969e783
JA
3448 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3449 return -EINVAL;
3450 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3451 sqe->poll_events)
3452 return -EINVAL;
3453
3454 req->poll.addr = READ_ONCE(sqe->addr);
0969e783
JA
3455 return 0;
3456}
3457
221c5eb2
JA
3458/*
3459 * Find a running poll command that matches one specified in sqe->addr,
3460 * and remove it if found.
3461 */
fc4df999 3462static int io_poll_remove(struct io_kiocb *req)
221c5eb2
JA
3463{
3464 struct io_ring_ctx *ctx = req->ctx;
0969e783 3465 u64 addr;
47f46768 3466 int ret;
221c5eb2 3467
0969e783 3468 addr = req->poll.addr;
221c5eb2 3469 spin_lock_irq(&ctx->completion_lock);
0969e783 3470 ret = io_poll_cancel(ctx, addr);
221c5eb2
JA
3471 spin_unlock_irq(&ctx->completion_lock);
3472
78e19bbe 3473 io_cqring_add_event(req, ret);
4e88d6e7
JA
3474 if (ret < 0)
3475 req_set_fail_links(req);
e65ef56d 3476 io_put_req(req);
221c5eb2
JA
3477 return 0;
3478}
3479
b0dd8a41 3480static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
221c5eb2 3481{
a197f664
JL
3482 struct io_ring_ctx *ctx = req->ctx;
3483
8c838788 3484 req->poll.done = true;
b0dd8a41
JA
3485 if (error)
3486 io_cqring_fill_event(req, error);
3487 else
3488 io_cqring_fill_event(req, mangle_poll(mask));
8c838788 3489 io_commit_cqring(ctx);
221c5eb2
JA
3490}
3491
561fb04a 3492static void io_poll_complete_work(struct io_wq_work **workptr)
221c5eb2 3493{
561fb04a 3494 struct io_wq_work *work = *workptr;
221c5eb2
JA
3495 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3496 struct io_poll_iocb *poll = &req->poll;
3497 struct poll_table_struct pt = { ._key = poll->events };
3498 struct io_ring_ctx *ctx = req->ctx;
89723d0b 3499 struct io_kiocb *nxt = NULL;
221c5eb2 3500 __poll_t mask = 0;
b0dd8a41 3501 int ret = 0;
221c5eb2 3502
b0dd8a41 3503 if (work->flags & IO_WQ_WORK_CANCEL) {
561fb04a 3504 WRITE_ONCE(poll->canceled, true);
b0dd8a41
JA
3505 ret = -ECANCELED;
3506 } else if (READ_ONCE(poll->canceled)) {
3507 ret = -ECANCELED;
3508 }
561fb04a 3509
b0dd8a41 3510 if (ret != -ECANCELED)
221c5eb2
JA
3511 mask = vfs_poll(poll->file, &pt) & poll->events;
3512
3513 /*
3514 * Note that ->ki_cancel callers also delete iocb from active_reqs after
3515 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
3516 * synchronize with them. In the cancellation case the list_del_init
3517 * itself is not actually needed, but harmless so we keep it in to
3518 * avoid further branches in the fast path.
3519 */
3520 spin_lock_irq(&ctx->completion_lock);
b0dd8a41 3521 if (!mask && ret != -ECANCELED) {
392edb45 3522 add_wait_queue(poll->head, &poll->wait);
221c5eb2
JA
3523 spin_unlock_irq(&ctx->completion_lock);
3524 return;
3525 }
78076bb6 3526 hash_del(&req->hash_node);
b0dd8a41 3527 io_poll_complete(req, mask, ret);
221c5eb2
JA
3528 spin_unlock_irq(&ctx->completion_lock);
3529
8c838788 3530 io_cqring_ev_posted(ctx);
89723d0b 3531
4e88d6e7
JA
3532 if (ret < 0)
3533 req_set_fail_links(req);
ec9c02ad 3534 io_put_req_find_next(req, &nxt);
89723d0b 3535 if (nxt)
78912934 3536 io_wq_assign_next(workptr, nxt);
221c5eb2
JA
3537}
3538
e94f141b
JA
3539static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
3540{
e94f141b 3541 struct io_kiocb *req, *tmp;
8237e045 3542 struct req_batch rb;
e94f141b 3543
c6ca97b3 3544 rb.to_free = rb.need_iter = 0;
e94f141b
JA
3545 spin_lock_irq(&ctx->completion_lock);
3546 llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
3547 hash_del(&req->hash_node);
3548 io_poll_complete(req, req->result, 0);
3549
8237e045
JA
3550 if (refcount_dec_and_test(&req->refs) &&
3551 !io_req_multi_free(&rb, req)) {
3552 req->flags |= REQ_F_COMP_LOCKED;
3553 io_free_req(req);
e94f141b
JA
3554 }
3555 }
3556 spin_unlock_irq(&ctx->completion_lock);
3557
3558 io_cqring_ev_posted(ctx);
8237e045 3559 io_free_req_many(ctx, &rb);
e94f141b
JA
3560}
3561
3562static void io_poll_flush(struct io_wq_work **workptr)
3563{
3564 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3565 struct llist_node *nodes;
3566
3567 nodes = llist_del_all(&req->ctx->poll_llist);
3568 if (nodes)
3569 __io_poll_flush(req->ctx, nodes);
3570}
3571
f0b493e6
JA
3572static void io_poll_trigger_evfd(struct io_wq_work **workptr)
3573{
3574 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3575
3576 eventfd_signal(req->ctx->cq_ev_fd, 1);
3577 io_put_req(req);
3578}
3579
221c5eb2
JA
3580static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3581 void *key)
3582{
e944475e 3583 struct io_poll_iocb *poll = wait->private;
221c5eb2
JA
3584 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
3585 struct io_ring_ctx *ctx = req->ctx;
3586 __poll_t mask = key_to_poll(key);
221c5eb2
JA
3587
3588 /* for instances that support it check for an event match first: */
8c838788
JA
3589 if (mask && !(mask & poll->events))
3590 return 0;
221c5eb2 3591
392edb45 3592 list_del_init(&poll->wait.entry);
221c5eb2 3593
7c9e7f0f
JA
3594 /*
3595 * Run completion inline if we can. We're using trylock here because
3596 * we are violating the completion_lock -> poll wq lock ordering.
3597 * If we have a link timeout we're going to need the completion_lock
3598 * for finalizing the request, so mark us as having grabbed it already.
3599 */
e94f141b
JA
3600 if (mask) {
3601 unsigned long flags;
221c5eb2 3602
e94f141b
JA
3603 if (llist_empty(&ctx->poll_llist) &&
3604 spin_trylock_irqsave(&ctx->completion_lock, flags)) {
f0b493e6
JA
3605 bool trigger_ev;
3606
e94f141b
JA
3607 hash_del(&req->hash_node);
3608 io_poll_complete(req, mask, 0);
e94f141b 3609
f0b493e6
JA
3610 trigger_ev = io_should_trigger_evfd(ctx);
3611 if (trigger_ev && eventfd_signal_count()) {
3612 trigger_ev = false;
3613 req->work.func = io_poll_trigger_evfd;
3614 } else {
3615 req->flags |= REQ_F_COMP_LOCKED;
3616 io_put_req(req);
3617 req = NULL;
3618 }
3619 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3620 __io_cqring_ev_posted(ctx, trigger_ev);
e94f141b
JA
3621 } else {
3622 req->result = mask;
3623 req->llist_node.next = NULL;
3624 /* if the list wasn't empty, we're done */
3625 if (!llist_add(&req->llist_node, &ctx->poll_llist))
3626 req = NULL;
3627 else
3628 req->work.func = io_poll_flush;
3629 }
221c5eb2 3630 }
e94f141b
JA
3631 if (req)
3632 io_queue_async_work(req);
221c5eb2 3633
221c5eb2
JA
3634 return 1;
3635}
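The llist handoff has a useful property: only the producer that finds the list empty schedules a flush; every later producer just links its node and relies on that single flush. A compact C11 sketch of that rule (toy types, not the kernel llist API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
};

static _Atomic(struct node *) list_head;

/* push a node; return true if the list was empty beforehand (like llist_add) */
static bool push(struct node *n)
{
	struct node *old = atomic_load(&list_head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&list_head, &old, n));
	return old == NULL;
}

/* producer side: mirrors the batching tail of io_poll_wake(), minus locking */
static void complete_one(struct node *n)
{
	if (push(n))
		printf("list was empty: schedule one flush work item\n");
	else
		printf("flush already pending: just link in\n");
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };

	complete_one(&a);	/* this one schedules the flush */
	complete_one(&b);	/* this one piggybacks on it */
	return 0;
}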
3636
3637struct io_poll_table {
3638 struct poll_table_struct pt;
3639 struct io_kiocb *req;
3640 int error;
3641};
3642
3643static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
3644 struct poll_table_struct *p)
3645{
3646 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3647
3648 if (unlikely(pt->req->poll.head)) {
3649 pt->error = -EINVAL;
3650 return;
3651 }
3652
3653 pt->error = 0;
3654 pt->req->poll.head = head;
392edb45 3655 add_wait_queue(head, &pt->req->poll.wait);
221c5eb2
JA
3656}
3657
eac406c6
JA
3658static void io_poll_req_insert(struct io_kiocb *req)
3659{
3660 struct io_ring_ctx *ctx = req->ctx;
78076bb6
JA
3661 struct hlist_head *list;
3662
3663 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3664 hlist_add_head(&req->hash_node, list);
eac406c6
JA
3665}
3666
3529d8c2 3667static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
3668{
3669 struct io_poll_iocb *poll = &req->poll;
221c5eb2 3670 u16 events;
221c5eb2
JA
3671
3672 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3673 return -EINVAL;
3674 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3675 return -EINVAL;
09bb8394
JA
3676 if (!poll->file)
3677 return -EBADF;
221c5eb2 3678
221c5eb2
JA
3679 events = READ_ONCE(sqe->poll_events);
3680 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
0969e783
JA
3681 return 0;
3682}
3683
3684static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
3685{
3686 struct io_poll_iocb *poll = &req->poll;
3687 struct io_ring_ctx *ctx = req->ctx;
3688 struct io_poll_table ipt;
3689 bool cancel = false;
3690 __poll_t mask;
0969e783
JA
3691
3692 INIT_IO_WORK(&req->work, io_poll_complete_work);
78076bb6 3693 INIT_HLIST_NODE(&req->hash_node);
221c5eb2 3694
221c5eb2 3695 poll->head = NULL;
8c838788 3696 poll->done = false;
221c5eb2
JA
3697 poll->canceled = false;
3698
3699 ipt.pt._qproc = io_poll_queue_proc;
3700 ipt.pt._key = poll->events;
3701 ipt.req = req;
3702 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
3703
3704 /* initialize the list so that we can do list_empty checks */
392edb45
JA
3705 INIT_LIST_HEAD(&poll->wait.entry);
3706 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
3707 poll->wait.private = poll;
221c5eb2 3708
36703247
JA
3709 INIT_LIST_HEAD(&req->list);
3710
221c5eb2 3711 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
221c5eb2
JA
3712
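	/*
	 * vfs_poll() may have returned a ready mask immediately and/or queued
	 * our wait entry via io_poll_queue_proc(). Under completion_lock and
	 * the poll head lock, decide whether to complete inline, insert into
	 * the cancel hash to wait for a wakeup, or mark the request canceled.
	 */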
3713 spin_lock_irq(&ctx->completion_lock);
8c838788
JA
3714 if (likely(poll->head)) {
3715 spin_lock(&poll->head->lock);
392edb45 3716 if (unlikely(list_empty(&poll->wait.entry))) {
8c838788
JA
3717 if (ipt.error)
3718 cancel = true;
3719 ipt.error = 0;
3720 mask = 0;
3721 }
3722 if (mask || ipt.error)
392edb45 3723 list_del_init(&poll->wait.entry);
8c838788
JA
3724 else if (cancel)
3725 WRITE_ONCE(poll->canceled, true);
3726 else if (!poll->done) /* actually waiting for an event */
eac406c6 3727 io_poll_req_insert(req);
8c838788
JA
3728 spin_unlock(&poll->head->lock);
3729 }
3730 if (mask) { /* no async, we've stolen it */
221c5eb2 3731 ipt.error = 0;
b0dd8a41 3732 io_poll_complete(req, mask, 0);
221c5eb2 3733 }
221c5eb2
JA
3734 spin_unlock_irq(&ctx->completion_lock);
3735
8c838788
JA
3736 if (mask) {
3737 io_cqring_ev_posted(ctx);
ec9c02ad 3738 io_put_req_find_next(req, nxt);
221c5eb2 3739 }
8c838788 3740 return ipt.error;
221c5eb2
JA
3741}
3742
5262f567
JA
3743static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
3744{
ad8a48ac
JA
3745 struct io_timeout_data *data = container_of(timer,
3746 struct io_timeout_data, timer);
3747 struct io_kiocb *req = data->req;
3748 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
3749 unsigned long flags;
3750
5262f567
JA
3751 atomic_inc(&ctx->cq_timeouts);
3752
3753 spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681a 3754 /*
11365043
JA
3755 * We could be racing with timeout deletion. If the list is empty,
3756 * then timeout lookup already found it and will be handling it.
ef03681a 3757 */
842f9612 3758 if (!list_empty(&req->list)) {
11365043 3759 struct io_kiocb *prev;
5262f567 3760
11365043
JA
3761 /*
3762 * Adjust the sequence of the reqs before the current one, because this
d195a66e 3763 * req will consume a slot in the cq_ring and the cq_tail
11365043
JA
3764 * pointer will be advanced; otherwise other timeout reqs may
3765 * return early without waiting for enough wait_nr.
3766 */
3767 prev = req;
3768 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
3769 prev->sequence++;
11365043 3770 list_del_init(&req->list);
11365043 3771 }
5262f567 3772
78e19bbe 3773 io_cqring_fill_event(req, -ETIME);
5262f567
JA
3774 io_commit_cqring(ctx);
3775 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3776
3777 io_cqring_ev_posted(ctx);
4e88d6e7 3778 req_set_fail_links(req);
5262f567
JA
3779 io_put_req(req);
3780 return HRTIMER_NORESTART;
3781}
3782
47f46768
JA
3783static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
3784{
3785 struct io_kiocb *req;
3786 int ret = -ENOENT;
3787
3788 list_for_each_entry(req, &ctx->timeout_list, list) {
3789 if (user_data == req->user_data) {
3790 list_del_init(&req->list);
3791 ret = 0;
3792 break;
3793 }
3794 }
3795
3796 if (ret == -ENOENT)
3797 return ret;
3798
2d28390a 3799 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
47f46768
JA
3800 if (ret == -1)
3801 return -EALREADY;
3802
4e88d6e7 3803 req_set_fail_links(req);
47f46768
JA
3804 io_cqring_fill_event(req, -ECANCELED);
3805 io_put_req(req);
3806 return 0;
3807}
3808
3529d8c2
JA
3809static int io_timeout_remove_prep(struct io_kiocb *req,
3810 const struct io_uring_sqe *sqe)
b29472ee 3811{
b29472ee
JA
3812 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3813 return -EINVAL;
3814 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
3815 return -EINVAL;
3816
3817 req->timeout.addr = READ_ONCE(sqe->addr);
3818 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
3819 if (req->timeout.flags)
3820 return -EINVAL;
3821
b29472ee
JA
3822 return 0;
3823}
3824
11365043
JA
3825/*
3826 * Remove or update an existing timeout command
3827 */
fc4df999 3828static int io_timeout_remove(struct io_kiocb *req)
11365043
JA
3829{
3830 struct io_ring_ctx *ctx = req->ctx;
47f46768 3831 int ret;
11365043 3832
11365043 3833 spin_lock_irq(&ctx->completion_lock);
b29472ee 3834 ret = io_timeout_cancel(ctx, req->timeout.addr);
11365043 3835
47f46768 3836 io_cqring_fill_event(req, ret);
11365043
JA
3837 io_commit_cqring(ctx);
3838 spin_unlock_irq(&ctx->completion_lock);
5262f567 3839 io_cqring_ev_posted(ctx);
4e88d6e7
JA
3840 if (ret < 0)
3841 req_set_fail_links(req);
ec9c02ad 3842 io_put_req(req);
11365043 3843 return 0;
5262f567
JA
3844}
3845
3529d8c2 3846static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 3847 bool is_timeout_link)
5262f567 3848{
ad8a48ac 3849 struct io_timeout_data *data;
a41525ab 3850 unsigned flags;
5262f567 3851
ad8a48ac 3852 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 3853 return -EINVAL;
ad8a48ac 3854 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 3855 return -EINVAL;
2d28390a
JA
3856 if (sqe->off && is_timeout_link)
3857 return -EINVAL;
a41525ab
JA
3858 flags = READ_ONCE(sqe->timeout_flags);
3859 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 3860 return -EINVAL;
bdf20073 3861
26a61679
JA
3862 req->timeout.count = READ_ONCE(sqe->off);
3863
3529d8c2 3864 if (!req->io && io_alloc_async_ctx(req))
26a61679
JA
3865 return -ENOMEM;
3866
3867 data = &req->io->timeout;
ad8a48ac 3868 data->req = req;
ad8a48ac
JA
3869 req->flags |= REQ_F_TIMEOUT;
3870
3871 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
3872 return -EFAULT;
3873
11365043 3874 if (flags & IORING_TIMEOUT_ABS)
ad8a48ac 3875 data->mode = HRTIMER_MODE_ABS;
11365043 3876 else
ad8a48ac 3877 data->mode = HRTIMER_MODE_REL;
11365043 3878
ad8a48ac
JA
3879 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
3880 return 0;
3881}
3882
fc4df999 3883static int io_timeout(struct io_kiocb *req)
ad8a48ac
JA
3884{
3885 unsigned count;
3886 struct io_ring_ctx *ctx = req->ctx;
3887 struct io_timeout_data *data;
3888 struct list_head *entry;
3889 unsigned span = 0;
ad8a48ac 3890
2d28390a 3891 data = &req->io->timeout;
93bd25bb 3892
5262f567
JA
3893 /*
3894 * sqe->off holds how many events need to occur for this
93bd25bb
JA
3895 * timeout event to be satisfied. If it isn't set, then this is
3896 * a pure timeout request and the sequence isn't used.
5262f567 3897 */
26a61679 3898 count = req->timeout.count;
93bd25bb
JA
3899 if (!count) {
3900 req->flags |= REQ_F_TIMEOUT_NOSEQ;
3901 spin_lock_irq(&ctx->completion_lock);
3902 entry = ctx->timeout_list.prev;
3903 goto add;
3904 }
5262f567
JA
3905
3906 req->sequence = ctx->cached_sq_head + count - 1;
2d28390a 3907 data->seq_offset = count;
5262f567
JA
3908
3909 /*
3910 * Insertion sort, ensuring the first entry in the list is always
3911 * the one we need first.
3912 */
5262f567
JA
3913 spin_lock_irq(&ctx->completion_lock);
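	/*
	 * Walk the list from the back and compare 64-bit target sequences.
	 * cached_sq_head is a u32 and may have wrapped since an earlier
	 * timeout was queued; if the current head is numerically smaller
	 * than the head recorded for that entry, compensate by adding
	 * UINT_MAX so the comparison still orders the entries correctly.
	 */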
3914 list_for_each_prev(entry, &ctx->timeout_list) {
3915 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1a 3916 unsigned nxt_sq_head;
3917 long long tmp, tmp_nxt;
2d28390a 3918 u32 nxt_offset = nxt->io->timeout.seq_offset;
5262f567 3919
93bd25bb
JA
3920 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
3921 continue;
3922
5da0fb1a 3923 /*
3924 * Since cached_sq_head + count - 1 can overflow, use type long
3925 * long to store it.
3926 */
3927 tmp = (long long)ctx->cached_sq_head + count - 1;
cc42e0ac
PB
3928 nxt_sq_head = nxt->sequence - nxt_offset + 1;
3929 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
5da0fb1a 3930
3931 /*
3932 * cached_sq_head may overflow, but it can never wrap around twice
3933 * while any queued timeout req is still valid.
3934 */
3935 if (ctx->cached_sq_head < nxt_sq_head)
8b07a65a 3936 tmp += UINT_MAX;
5da0fb1a 3937
a1f58ba4 3938 if (tmp > tmp_nxt)
5262f567 3939 break;
a1f58ba4 3940
3941 /*
3942 * The sequence of the inserted req, and of each req after it, must
3943 * be adjusted because every timeout req consumes a slot.
3944 */
3945 span++;
3946 nxt->sequence++;
5262f567 3947 }
a1f58ba4 3948 req->sequence -= span;
93bd25bb 3949add:
5262f567 3950 list_add(&req->list, entry);
ad8a48ac
JA
3951 data->timer.function = io_timeout_fn;
3952 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 3953 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
3954 return 0;
3955}
5262f567 3956
62755e35
JA
3957static bool io_cancel_cb(struct io_wq_work *work, void *data)
3958{
3959 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3960
3961 return req->user_data == (unsigned long) data;
3962}
3963
e977d6d3 3964static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 3965{
62755e35 3966 enum io_wq_cancel cancel_ret;
62755e35
JA
3967 int ret = 0;
3968
62755e35
JA
3969 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
3970 switch (cancel_ret) {
3971 case IO_WQ_CANCEL_OK:
3972 ret = 0;
3973 break;
3974 case IO_WQ_CANCEL_RUNNING:
3975 ret = -EALREADY;
3976 break;
3977 case IO_WQ_CANCEL_NOTFOUND:
3978 ret = -ENOENT;
3979 break;
3980 }
3981
e977d6d3
JA
3982 return ret;
3983}
3984
47f46768
JA
3985static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
3986 struct io_kiocb *req, __u64 sqe_addr,
b0dd8a41 3987 struct io_kiocb **nxt, int success_ret)
47f46768
JA
3988{
3989 unsigned long flags;
3990 int ret;
3991
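	/*
	 * Try the possible homes of the request in order: work queued in
	 * io-wq first, then pending timeouts, then armed poll requests.
	 * The first result other than -ENOENT is reported to the caller.
	 */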
3992 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3993 if (ret != -ENOENT) {
3994 spin_lock_irqsave(&ctx->completion_lock, flags);
3995 goto done;
3996 }
3997
3998 spin_lock_irqsave(&ctx->completion_lock, flags);
3999 ret = io_timeout_cancel(ctx, sqe_addr);
4000 if (ret != -ENOENT)
4001 goto done;
4002 ret = io_poll_cancel(ctx, sqe_addr);
4003done:
b0dd8a41
JA
4004 if (!ret)
4005 ret = success_ret;
47f46768
JA
4006 io_cqring_fill_event(req, ret);
4007 io_commit_cqring(ctx);
4008 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4009 io_cqring_ev_posted(ctx);
4010
4e88d6e7
JA
4011 if (ret < 0)
4012 req_set_fail_links(req);
47f46768
JA
4013 io_put_req_find_next(req, nxt);
4014}
4015
3529d8c2
JA
4016static int io_async_cancel_prep(struct io_kiocb *req,
4017 const struct io_uring_sqe *sqe)
e977d6d3 4018{
fbf23849 4019 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3
JA
4020 return -EINVAL;
4021 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4022 sqe->cancel_flags)
4023 return -EINVAL;
4024
fbf23849
JA
4025 req->cancel.addr = READ_ONCE(sqe->addr);
4026 return 0;
4027}
4028
4029static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
4030{
4031 struct io_ring_ctx *ctx = req->ctx;
fbf23849
JA
4032
4033 io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
5262f567
JA
4034 return 0;
4035}
4036
05f3fb3c
JA
4037static int io_files_update_prep(struct io_kiocb *req,
4038 const struct io_uring_sqe *sqe)
4039{
4040 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4041 return -EINVAL;
4042
4043 req->files_update.offset = READ_ONCE(sqe->off);
4044 req->files_update.nr_args = READ_ONCE(sqe->len);
4045 if (!req->files_update.nr_args)
4046 return -EINVAL;
4047 req->files_update.arg = READ_ONCE(sqe->addr);
4048 return 0;
4049}
4050
4051static int io_files_update(struct io_kiocb *req, bool force_nonblock)
fbf23849
JA
4052{
4053 struct io_ring_ctx *ctx = req->ctx;
05f3fb3c
JA
4054 struct io_uring_files_update up;
4055 int ret;
fbf23849 4056
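	/*
	 * Updating the fixed file set takes uring_lock and may block, so it
	 * cannot run from the non-blocking submission path; returning -EAGAIN
	 * punts the request to the async worker instead.
	 */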
f86cd20c 4057 if (force_nonblock)
05f3fb3c 4058 return -EAGAIN;
05f3fb3c
JA
4059
4060 up.offset = req->files_update.offset;
4061 up.fds = req->files_update.arg;
4062
4063 mutex_lock(&ctx->uring_lock);
4064 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4065 mutex_unlock(&ctx->uring_lock);
4066
4067 if (ret < 0)
4068 req_set_fail_links(req);
4069 io_cqring_add_event(req, ret);
4070 io_put_req(req);
5262f567
JA
4071 return 0;
4072}
4073
3529d8c2
JA
4074static int io_req_defer_prep(struct io_kiocb *req,
4075 const struct io_uring_sqe *sqe)
f67676d1 4076{
e781573e 4077 ssize_t ret = 0;
f67676d1 4078
f86cd20c
JA
4079 if (io_op_defs[req->opcode].file_table) {
4080 ret = io_grab_files(req);
4081 if (unlikely(ret))
4082 return ret;
4083 }
4084
cccf0ee8
JA
4085 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4086
d625c6ee 4087 switch (req->opcode) {
e781573e
JA
4088 case IORING_OP_NOP:
4089 break;
f67676d1
JA
4090 case IORING_OP_READV:
4091 case IORING_OP_READ_FIXED:
3a6820f2 4092 case IORING_OP_READ:
3529d8c2 4093 ret = io_read_prep(req, sqe, true);
f67676d1
JA
4094 break;
4095 case IORING_OP_WRITEV:
4096 case IORING_OP_WRITE_FIXED:
3a6820f2 4097 case IORING_OP_WRITE:
3529d8c2 4098 ret = io_write_prep(req, sqe, true);
f67676d1 4099 break;
0969e783 4100 case IORING_OP_POLL_ADD:
3529d8c2 4101 ret = io_poll_add_prep(req, sqe);
0969e783
JA
4102 break;
4103 case IORING_OP_POLL_REMOVE:
3529d8c2 4104 ret = io_poll_remove_prep(req, sqe);
0969e783 4105 break;
8ed8d3c3 4106 case IORING_OP_FSYNC:
3529d8c2 4107 ret = io_prep_fsync(req, sqe);
8ed8d3c3
JA
4108 break;
4109 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2 4110 ret = io_prep_sfr(req, sqe);
8ed8d3c3 4111 break;
03b1230c 4112 case IORING_OP_SENDMSG:
fddaface 4113 case IORING_OP_SEND:
3529d8c2 4114 ret = io_sendmsg_prep(req, sqe);
03b1230c
JA
4115 break;
4116 case IORING_OP_RECVMSG:
fddaface 4117 case IORING_OP_RECV:
3529d8c2 4118 ret = io_recvmsg_prep(req, sqe);
03b1230c 4119 break;
f499a021 4120 case IORING_OP_CONNECT:
3529d8c2 4121 ret = io_connect_prep(req, sqe);
f499a021 4122 break;
2d28390a 4123 case IORING_OP_TIMEOUT:
3529d8c2 4124 ret = io_timeout_prep(req, sqe, false);
b7bb4f7d 4125 break;
b29472ee 4126 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2 4127 ret = io_timeout_remove_prep(req, sqe);
b29472ee 4128 break;
fbf23849 4129 case IORING_OP_ASYNC_CANCEL:
3529d8c2 4130 ret = io_async_cancel_prep(req, sqe);
fbf23849 4131 break;
2d28390a 4132 case IORING_OP_LINK_TIMEOUT:
3529d8c2 4133 ret = io_timeout_prep(req, sqe, true);
b7bb4f7d 4134 break;
8ed8d3c3 4135 case IORING_OP_ACCEPT:
3529d8c2 4136 ret = io_accept_prep(req, sqe);
8ed8d3c3 4137 break;
d63d1b5e
JA
4138 case IORING_OP_FALLOCATE:
4139 ret = io_fallocate_prep(req, sqe);
4140 break;
15b71abe
JA
4141 case IORING_OP_OPENAT:
4142 ret = io_openat_prep(req, sqe);
4143 break;
b5dba59e
JA
4144 case IORING_OP_CLOSE:
4145 ret = io_close_prep(req, sqe);
4146 break;
05f3fb3c
JA
4147 case IORING_OP_FILES_UPDATE:
4148 ret = io_files_update_prep(req, sqe);
4149 break;
eddc7ef5
JA
4150 case IORING_OP_STATX:
4151 ret = io_statx_prep(req, sqe);
4152 break;
4840e418
JA
4153 case IORING_OP_FADVISE:
4154 ret = io_fadvise_prep(req, sqe);
4155 break;
c1ca757b
JA
4156 case IORING_OP_MADVISE:
4157 ret = io_madvise_prep(req, sqe);
4158 break;
cebdb986
JA
4159 case IORING_OP_OPENAT2:
4160 ret = io_openat2_prep(req, sqe);
4161 break;
3e4827b0
JA
4162 case IORING_OP_EPOLL_CTL:
4163 ret = io_epoll_ctl_prep(req, sqe);
4164 break;
f67676d1 4165 default:
e781573e
JA
4166 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4167 req->opcode);
4168 ret = -EINVAL;
b7bb4f7d 4169 break;
f67676d1
JA
4170 }
4171
b7bb4f7d 4172 return ret;
f67676d1
JA
4173}
4174
3529d8c2 4175static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
de0617e4 4176{
a197f664 4177 struct io_ring_ctx *ctx = req->ctx;
f67676d1 4178 int ret;
de0617e4 4179
9d858b21
BL
4180 /* Still need to defer if there are pending reqs in the defer list. */
4181 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
de0617e4
JA
4182 return 0;
4183
3529d8c2 4184 if (!req->io && io_alloc_async_ctx(req))
de0617e4
JA
4185 return -EAGAIN;
4186
3529d8c2 4187 ret = io_req_defer_prep(req, sqe);
b7bb4f7d 4188 if (ret < 0)
2d28390a 4189 return ret;
2d28390a 4190
de0617e4 4191 spin_lock_irq(&ctx->completion_lock);
9d858b21 4192 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
de0617e4 4193 spin_unlock_irq(&ctx->completion_lock);
de0617e4
JA
4194 return 0;
4195 }
4196
915967f6 4197 trace_io_uring_defer(ctx, req, req->user_data);
de0617e4
JA
4198 list_add_tail(&req->list, &ctx->defer_list);
4199 spin_unlock_irq(&ctx->completion_lock);
4200 return -EIOCBQUEUED;
4201}
4202
3529d8c2
JA
4203static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4204 struct io_kiocb **nxt, bool force_nonblock)
2b188cc1 4205{
a197f664 4206 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 4207 int ret;
2b188cc1 4208
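	/*
	 * sqe is non-NULL on the inline submission path, where each opcode
	 * still runs its prep step here; it is NULL when called from the
	 * io-wq worker, which had the request prepared before it was queued.
	 */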
d625c6ee 4209 switch (req->opcode) {
2b188cc1 4210 case IORING_OP_NOP:
78e19bbe 4211 ret = io_nop(req);
2b188cc1
JA
4212 break;
4213 case IORING_OP_READV:
edafccee 4214 case IORING_OP_READ_FIXED:
3a6820f2 4215 case IORING_OP_READ:
3529d8c2
JA
4216 if (sqe) {
4217 ret = io_read_prep(req, sqe, force_nonblock);
4218 if (ret < 0)
4219 break;
4220 }
267bc904 4221 ret = io_read(req, nxt, force_nonblock);
edafccee 4222 break;
3529d8c2 4223 case IORING_OP_WRITEV:
edafccee 4224 case IORING_OP_WRITE_FIXED:
3a6820f2 4225 case IORING_OP_WRITE:
3529d8c2
JA
4226 if (sqe) {
4227 ret = io_write_prep(req, sqe, force_nonblock);
4228 if (ret < 0)
4229 break;
4230 }
267bc904 4231 ret = io_write(req, nxt, force_nonblock);
2b188cc1 4232 break;
c992fe29 4233 case IORING_OP_FSYNC:
3529d8c2
JA
4234 if (sqe) {
4235 ret = io_prep_fsync(req, sqe);
4236 if (ret < 0)
4237 break;
4238 }
fc4df999 4239 ret = io_fsync(req, nxt, force_nonblock);
c992fe29 4240 break;
221c5eb2 4241 case IORING_OP_POLL_ADD:
3529d8c2
JA
4242 if (sqe) {
4243 ret = io_poll_add_prep(req, sqe);
4244 if (ret)
4245 break;
4246 }
fc4df999 4247 ret = io_poll_add(req, nxt);
221c5eb2
JA
4248 break;
4249 case IORING_OP_POLL_REMOVE:
3529d8c2
JA
4250 if (sqe) {
4251 ret = io_poll_remove_prep(req, sqe);
4252 if (ret < 0)
4253 break;
4254 }
fc4df999 4255 ret = io_poll_remove(req);
221c5eb2 4256 break;
5d17b4a4 4257 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2
JA
4258 if (sqe) {
4259 ret = io_prep_sfr(req, sqe);
4260 if (ret < 0)
4261 break;
4262 }
fc4df999 4263 ret = io_sync_file_range(req, nxt, force_nonblock);
5d17b4a4 4264 break;
0fa03c62 4265 case IORING_OP_SENDMSG:
fddaface 4266 case IORING_OP_SEND:
3529d8c2
JA
4267 if (sqe) {
4268 ret = io_sendmsg_prep(req, sqe);
4269 if (ret < 0)
4270 break;
4271 }
fddaface
JA
4272 if (req->opcode == IORING_OP_SENDMSG)
4273 ret = io_sendmsg(req, nxt, force_nonblock);
4274 else
4275 ret = io_send(req, nxt, force_nonblock);
0fa03c62 4276 break;
aa1fa28f 4277 case IORING_OP_RECVMSG:
fddaface 4278 case IORING_OP_RECV:
3529d8c2
JA
4279 if (sqe) {
4280 ret = io_recvmsg_prep(req, sqe);
4281 if (ret)
4282 break;
4283 }
fddaface
JA
4284 if (req->opcode == IORING_OP_RECVMSG)
4285 ret = io_recvmsg(req, nxt, force_nonblock);
4286 else
4287 ret = io_recv(req, nxt, force_nonblock);
aa1fa28f 4288 break;
5262f567 4289 case IORING_OP_TIMEOUT:
3529d8c2
JA
4290 if (sqe) {
4291 ret = io_timeout_prep(req, sqe, false);
4292 if (ret)
4293 break;
4294 }
fc4df999 4295 ret = io_timeout(req);
5262f567 4296 break;
11365043 4297 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2
JA
4298 if (sqe) {
4299 ret = io_timeout_remove_prep(req, sqe);
4300 if (ret)
4301 break;
4302 }
fc4df999 4303 ret = io_timeout_remove(req);
11365043 4304 break;
17f2fe35 4305 case IORING_OP_ACCEPT:
3529d8c2
JA
4306 if (sqe) {
4307 ret = io_accept_prep(req, sqe);
4308 if (ret)
4309 break;
4310 }
fc4df999 4311 ret = io_accept(req, nxt, force_nonblock);
17f2fe35 4312 break;
f8e85cf2 4313 case IORING_OP_CONNECT:
3529d8c2
JA
4314 if (sqe) {
4315 ret = io_connect_prep(req, sqe);
4316 if (ret)
4317 break;
4318 }
fc4df999 4319 ret = io_connect(req, nxt, force_nonblock);
f8e85cf2 4320 break;
62755e35 4321 case IORING_OP_ASYNC_CANCEL:
3529d8c2
JA
4322 if (sqe) {
4323 ret = io_async_cancel_prep(req, sqe);
4324 if (ret)
4325 break;
4326 }
fc4df999 4327 ret = io_async_cancel(req, nxt);
62755e35 4328 break;
d63d1b5e
JA
4329 case IORING_OP_FALLOCATE:
4330 if (sqe) {
4331 ret = io_fallocate_prep(req, sqe);
4332 if (ret)
4333 break;
4334 }
4335 ret = io_fallocate(req, nxt, force_nonblock);
4336 break;
15b71abe
JA
4337 case IORING_OP_OPENAT:
4338 if (sqe) {
4339 ret = io_openat_prep(req, sqe);
4340 if (ret)
4341 break;
4342 }
4343 ret = io_openat(req, nxt, force_nonblock);
4344 break;
b5dba59e
JA
4345 case IORING_OP_CLOSE:
4346 if (sqe) {
4347 ret = io_close_prep(req, sqe);
4348 if (ret)
4349 break;
4350 }
4351 ret = io_close(req, nxt, force_nonblock);
4352 break;
05f3fb3c
JA
4353 case IORING_OP_FILES_UPDATE:
4354 if (sqe) {
4355 ret = io_files_update_prep(req, sqe);
4356 if (ret)
4357 break;
4358 }
4359 ret = io_files_update(req, force_nonblock);
4360 break;
eddc7ef5
JA
4361 case IORING_OP_STATX:
4362 if (sqe) {
4363 ret = io_statx_prep(req, sqe);
4364 if (ret)
4365 break;
4366 }
4367 ret = io_statx(req, nxt, force_nonblock);
4368 break;
4840e418
JA
4369 case IORING_OP_FADVISE:
4370 if (sqe) {
4371 ret = io_fadvise_prep(req, sqe);
4372 if (ret)
4373 break;
4374 }
4375 ret = io_fadvise(req, nxt, force_nonblock);
4376 break;
c1ca757b
JA
4377 case IORING_OP_MADVISE:
4378 if (sqe) {
4379 ret = io_madvise_prep(req, sqe);
4380 if (ret)
4381 break;
4382 }
4383 ret = io_madvise(req, nxt, force_nonblock);
4384 break;
cebdb986
JA
4385 case IORING_OP_OPENAT2:
4386 if (sqe) {
4387 ret = io_openat2_prep(req, sqe);
4388 if (ret)
4389 break;
4390 }
4391 ret = io_openat2(req, nxt, force_nonblock);
4392 break;
3e4827b0
JA
4393 case IORING_OP_EPOLL_CTL:
4394 if (sqe) {
4395 ret = io_epoll_ctl_prep(req, sqe);
4396 if (ret)
4397 break;
4398 }
4399 ret = io_epoll_ctl(req, nxt, force_nonblock);
4400 break;
2b188cc1
JA
4401 default:
4402 ret = -EINVAL;
4403 break;
4404 }
4405
def596e9
JA
4406 if (ret)
4407 return ret;
4408
4409 if (ctx->flags & IORING_SETUP_IOPOLL) {
11ba820b
JA
4410 const bool in_async = io_wq_current_is_worker();
4411
9e645e11 4412 if (req->result == -EAGAIN)
def596e9
JA
4413 return -EAGAIN;
4414
11ba820b
JA
4415 /* workqueue context doesn't hold uring_lock, grab it now */
4416 if (in_async)
4417 mutex_lock(&ctx->uring_lock);
4418
def596e9 4419 io_iopoll_req_issued(req);
11ba820b
JA
4420
4421 if (in_async)
4422 mutex_unlock(&ctx->uring_lock);
def596e9
JA
4423 }
4424
4425 return 0;
2b188cc1
JA
4426}
4427
561fb04a 4428static void io_wq_submit_work(struct io_wq_work **workptr)
2b188cc1 4429{
561fb04a 4430 struct io_wq_work *work = *workptr;
2b188cc1 4431 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
561fb04a
JA
4432 struct io_kiocb *nxt = NULL;
4433 int ret = 0;
2b188cc1 4434
0c9d5ccd
JA
4435 /* if NO_CANCEL is set, we must still run the work */
4436 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4437 IO_WQ_WORK_CANCEL) {
561fb04a 4438 ret = -ECANCELED;
0c9d5ccd 4439 }
31b51510 4440
561fb04a 4441 if (!ret) {
cf6fd4bd
PB
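		/*
		 * Record whether this work item has a user mm attached
		 * (IO_WQ_WORK_HAS_MM) so the opcode handlers know if user
		 * memory can be accessed, and mark it as running async.
		 */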
4442 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
4443 req->in_async = true;
561fb04a 4444 do {
3529d8c2 4445 ret = io_issue_sqe(req, NULL, &nxt, false);
561fb04a
JA
4446 /*
4447 * We can get EAGAIN for polled IO even though we're
4448 * forcing a sync submission from here, since we can't
4449 * wait for request slots on the block side.
4450 */
4451 if (ret != -EAGAIN)
4452 break;
4453 cond_resched();
4454 } while (1);
4455 }
31b51510 4456
561fb04a 4457 /* drop submission reference */
ec9c02ad 4458 io_put_req(req);
817869d2 4459
561fb04a 4460 if (ret) {
4e88d6e7 4461 req_set_fail_links(req);
78e19bbe 4462 io_cqring_add_event(req, ret);
817869d2 4463 io_put_req(req);
edafccee 4464 }
2b188cc1 4465
561fb04a 4466 /* if a dependent link is ready, pass it back */
78912934
JA
4467 if (!ret && nxt)
4468 io_wq_assign_next(workptr, nxt);
2b188cc1
JA
4469}
4470
15b71abe 4471static int io_req_needs_file(struct io_kiocb *req, int fd)
9e3aa61a 4472{
d3656344 4473 if (!io_op_defs[req->opcode].needs_file)
9e3aa61a 4474 return 0;
d3656344
JA
4475 if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
4476 return 0;
4477 return 1;
09bb8394
JA
4478}
4479
65e19f54
JA
4480static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4481 int index)
4482{
4483 struct fixed_file_table *table;
4484
05f3fb3c
JA
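	/*
	 * The registered file set is a two-level table: the upper bits of
	 * the index select a fixed_file_table, and the low bits (masked with
	 * IORING_FILE_TABLE_MASK) select the slot within it.
	 */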
4485 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
4486 return table->files[index & IORING_FILE_TABLE_MASK];
65e19f54
JA
4487}
4488
3529d8c2
JA
4489static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4490 const struct io_uring_sqe *sqe)
09bb8394 4491{
a197f664 4492 struct io_ring_ctx *ctx = req->ctx;
09bb8394 4493 unsigned flags;
d3656344 4494 int fd;
09bb8394 4495
3529d8c2
JA
4496 flags = READ_ONCE(sqe->flags);
4497 fd = READ_ONCE(sqe->fd);
09bb8394 4498
d3656344
JA
4499 if (!io_req_needs_file(req, fd))
4500 return 0;
09bb8394
JA
4501
4502 if (flags & IOSQE_FIXED_FILE) {
05f3fb3c 4503 if (unlikely(!ctx->file_data ||
09bb8394
JA
4504 (unsigned) fd >= ctx->nr_user_files))
4505 return -EBADF;
b7620121 4506 fd = array_index_nospec(fd, ctx->nr_user_files);
65e19f54
JA
4507 req->file = io_file_from_index(ctx, fd);
4508 if (!req->file)
08a45173 4509 return -EBADF;
09bb8394 4510 req->flags |= REQ_F_FIXED_FILE;
05f3fb3c 4511 percpu_ref_get(&ctx->file_data->refs);
09bb8394 4512 } else {
cf6fd4bd 4513 if (req->needs_fixed_file)
09bb8394 4514 return -EBADF;
c826bd7a 4515 trace_io_uring_file_get(ctx, fd);
09bb8394
JA
4516 req->file = io_file_get(state, fd);
4517 if (unlikely(!req->file))
4518 return -EBADF;
4519 }
4520
4521 return 0;
4522}
4523
a197f664 4524static int io_grab_files(struct io_kiocb *req)
fcb323cc
JA
4525{
4526 int ret = -EBADF;
a197f664 4527 struct io_ring_ctx *ctx = req->ctx;
fcb323cc 4528
f86cd20c
JA
4529 if (req->work.files)
4530 return 0;
b14cca0c 4531 if (!ctx->ring_file)
b5dba59e
JA
4532 return -EBADF;
4533
fcb323cc
JA
4534 rcu_read_lock();
4535 spin_lock_irq(&ctx->inflight_lock);
4536 /*
4537 * We use the f_ops->flush() handler to ensure that we can flush
4538 * out work accessing these files if the fd is closed. Check if
4539 * the fd has changed since we started down this path, and disallow
4540 * this operation if it has.
4541 */
b14cca0c 4542 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
fcb323cc
JA
4543 list_add(&req->inflight_entry, &ctx->inflight_list);
4544 req->flags |= REQ_F_INFLIGHT;
4545 req->work.files = current->files;
4546 ret = 0;
4547 }
4548 spin_unlock_irq(&ctx->inflight_lock);
4549 rcu_read_unlock();
4550
4551 return ret;
4552}
4553
2665abfd 4554static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 4555{
ad8a48ac
JA
4556 struct io_timeout_data *data = container_of(timer,
4557 struct io_timeout_data, timer);
4558 struct io_kiocb *req = data->req;
2665abfd
JA
4559 struct io_ring_ctx *ctx = req->ctx;
4560 struct io_kiocb *prev = NULL;
4561 unsigned long flags;
2665abfd
JA
4562
4563 spin_lock_irqsave(&ctx->completion_lock, flags);
4564
4565 /*
4566 * We don't expect the list to be empty; that will only happen if we
4567 * race with the completion of the linked work.
4568 */
4493233e
PB
4569 if (!list_empty(&req->link_list)) {
4570 prev = list_entry(req->link_list.prev, struct io_kiocb,
4571 link_list);
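		/*
		 * Only take a reference on the linked request if it is still
		 * alive; if its refcount already dropped to zero it is being
		 * freed and must not be touched or cancelled here.
		 */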
5d960724 4572 if (refcount_inc_not_zero(&prev->refs)) {
4493233e 4573 list_del_init(&req->link_list);
5d960724
JA
4574 prev->flags &= ~REQ_F_LINK_TIMEOUT;
4575 } else
76a46e06 4576 prev = NULL;
2665abfd
JA
4577 }
4578
4579 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4580
4581 if (prev) {
4e88d6e7 4582 req_set_fail_links(prev);
b0dd8a41
JA
4583 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
4584 -ETIME);
76a46e06 4585 io_put_req(prev);
47f46768
JA
4586 } else {
4587 io_cqring_add_event(req, -ETIME);
4588 io_put_req(req);
2665abfd 4589 }
2665abfd
JA
4590 return HRTIMER_NORESTART;
4591}
4592
ad8a48ac 4593static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 4594{
76a46e06 4595 struct io_ring_ctx *ctx = req->ctx;
2665abfd 4596
76a46e06
JA
4597 /*
4598 * If the list is now empty, then our linked request finished before
4599 * we got a chance to set up the timer.
4600 */
4601 spin_lock_irq(&ctx->completion_lock);
4493233e 4602 if (!list_empty(&req->link_list)) {
2d28390a 4603 struct io_timeout_data *data = &req->io->timeout;
94ae5e77 4604
ad8a48ac
JA
4605 data->timer.function = io_link_timeout_fn;
4606 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
4607 data->mode);
2665abfd 4608 }
76a46e06 4609 spin_unlock_irq(&ctx->completion_lock);
2665abfd 4610
2665abfd 4611 /* drop submission reference */
76a46e06
JA
4612 io_put_req(req);
4613}
2665abfd 4614
ad8a48ac 4615static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd
JA
4616{
4617 struct io_kiocb *nxt;
4618
4619 if (!(req->flags & REQ_F_LINK))
4620 return NULL;
4621
4493233e
PB
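	/*
	 * A link timeout, when present, sits directly after the request it
	 * covers on the link list; only then is REQ_F_LINK_TIMEOUT set so
	 * completion knows a timer has to be handled as well.
	 */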
4622 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
4623 link_list);
d625c6ee 4624 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 4625 return NULL;
2665abfd 4626
76a46e06 4627 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 4628 return nxt;
2665abfd
JA
4629}
4630
3529d8c2 4631static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 4632{
4a0a7a18 4633 struct io_kiocb *linked_timeout;
f9bd67f6 4634 struct io_kiocb *nxt = NULL;
e0c5c576 4635 int ret;
2b188cc1 4636
4a0a7a18
JA
4637again:
4638 linked_timeout = io_prep_linked_timeout(req);
4639
3529d8c2 4640 ret = io_issue_sqe(req, sqe, &nxt, true);
491381ce
JA
4641
4642 /*
4643 * We punt it to async if the file wasn't marked NOWAIT, or if the file
4644 * doesn't support non-blocking read/write attempts.
4645 */
4646 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
4647 (req->flags & REQ_F_MUST_PUNT))) {
86a761f8 4648punt:
f86cd20c 4649 if (io_op_defs[req->opcode].file_table) {
bbad27b2
PB
4650 ret = io_grab_files(req);
4651 if (ret)
4652 goto err;
2b188cc1 4653 }
bbad27b2
PB
4654
4655 /*
4656 * Queued up for async execution; the worker will release the
4657 * submission reference when the iocb is actually submitted.
4658 */
4659 io_queue_async_work(req);
4a0a7a18 4660 goto done_req;
2b188cc1 4661 }
e65ef56d 4662
fcb323cc 4663err:
76a46e06 4664 /* drop submission reference */
ec9c02ad 4665 io_put_req(req);
e65ef56d 4666
f9bd67f6 4667 if (linked_timeout) {
76a46e06 4668 if (!ret)
f9bd67f6 4669 io_queue_linked_timeout(linked_timeout);
76a46e06 4670 else
f9bd67f6 4671 io_put_req(linked_timeout);
76a46e06
JA
4672 }
4673
e65ef56d 4674 /* and drop final reference, if we failed */
9e645e11 4675 if (ret) {
78e19bbe 4676 io_cqring_add_event(req, ret);
4e88d6e7 4677 req_set_fail_links(req);
e65ef56d 4678 io_put_req(req);
9e645e11 4679 }
4a0a7a18
JA
4680done_req:
4681 if (nxt) {
4682 req = nxt;
4683 nxt = NULL;
86a761f8
PB
4684
4685 if (req->flags & REQ_F_FORCE_ASYNC)
4686 goto punt;
4a0a7a18
JA
4687 goto again;
4688 }
2b188cc1
JA
4689}
4690
3529d8c2 4691static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4fe2c963
JL
4692{
4693 int ret;
4694
3529d8c2 4695 ret = io_req_defer(req, sqe);
4fe2c963
JL
4696 if (ret) {
4697 if (ret != -EIOCBQUEUED) {
1118591a 4698fail_req:
78e19bbe 4699 io_cqring_add_event(req, ret);
4e88d6e7 4700 req_set_fail_links(req);
78e19bbe 4701 io_double_put_req(req);
4fe2c963 4702 }
2550878f 4703 } else if (req->flags & REQ_F_FORCE_ASYNC) {
1118591a
PB
4704 ret = io_req_defer_prep(req, sqe);
4705 if (unlikely(ret < 0))
4706 goto fail_req;
ce35a47a
JA
4707 /*
4708 * Never try inline submit if IOSQE_ASYNC is set; go straight
4709 * to async execution.
4710 */
4711 req->work.flags |= IO_WQ_WORK_CONCURRENT;
4712 io_queue_async_work(req);
4713 } else {
3529d8c2 4714 __io_queue_sqe(req, sqe);
ce35a47a 4715 }
4fe2c963
JL
4716}
4717
1b4a51b6 4718static inline void io_queue_link_head(struct io_kiocb *req)
4fe2c963 4719{
94ae5e77 4720 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1b4a51b6
PB
4721 io_cqring_add_event(req, -ECANCELED);
4722 io_double_put_req(req);
4723 } else
3529d8c2 4724 io_queue_sqe(req, NULL);
4fe2c963
JL
4725}
4726
4e88d6e7 4727#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
ce35a47a 4728 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
9e645e11 4729
3529d8c2
JA
4730static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4731 struct io_submit_state *state, struct io_kiocb **link)
9e645e11 4732{
75c6a039 4733 const struct cred *old_creds = NULL;
a197f664 4734 struct io_ring_ctx *ctx = req->ctx;
32fe525b 4735 unsigned int sqe_flags;
75c6a039 4736 int ret, id;
9e645e11 4737
32fe525b 4738 sqe_flags = READ_ONCE(sqe->flags);
9e645e11
JA
4739
4740 /* enforce forwards compatibility on users */
32fe525b 4741 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
9e645e11 4742 ret = -EINVAL;
196be95c 4743 goto err_req;
9e645e11
JA
4744 }
4745
75c6a039
JA
4746 id = READ_ONCE(sqe->personality);
4747 if (id) {
4748 const struct cred *personality_creds;
4749
4750 personality_creds = idr_find(&ctx->personality_idr, id);
4751 if (unlikely(!personality_creds)) {
4752 ret = -EINVAL;
4753 goto err_req;
4754 }
4755 old_creds = override_creds(personality_creds);
4756 }
4757
6b47ee6e
PB
4758 /* same numerical values as the corresponding REQ_F_* flags, so safe to copy */
4759 req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
4760 IOSQE_ASYNC);
9e645e11 4761
3529d8c2 4762 ret = io_req_set_file(state, req, sqe);
9e645e11
JA
4763 if (unlikely(ret)) {
4764err_req:
78e19bbe
JA
4765 io_cqring_add_event(req, ret);
4766 io_double_put_req(req);
75c6a039
JA
4767 if (old_creds)
4768 revert_creds(old_creds);
2e6e1fde 4769 return false;
9e645e11
JA
4770 }
4771
9e645e11
JA
4772 /*
4773 * If we already have a head request, queue this one for async
4774 * submittal once the head completes. If we don't have a head but
4775 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
4776 * submitted sync once the chain is complete. If none of those
4777 * conditions are true (normal request), then just queue it.
4778 */
4779 if (*link) {
9d76377f 4780 struct io_kiocb *head = *link;
4e88d6e7 4781
8cdf2193
PB
4782 /*
4783 * Since a link is executed sequentially, draining both sides
4784 * of the link also fulfils IOSQE_IO_DRAIN semantics for all
4785 * requests in the link. So it drains the head and the
4786 * request following the link. The latter is done via the
4787 * drain_next flag to persist the effect across calls.
4788 */
711be031
PB
4789 if (sqe_flags & IOSQE_IO_DRAIN) {
4790 head->flags |= REQ_F_IO_DRAIN;
4791 ctx->drain_next = 1;
4792 }
b7bb4f7d 4793 if (io_alloc_async_ctx(req)) {
9e645e11
JA
4794 ret = -EAGAIN;
4795 goto err_req;
4796 }
4797
3529d8c2 4798 ret = io_req_defer_prep(req, sqe);
2d28390a 4799 if (ret) {
4e88d6e7 4800 /* fail even hard links since we don't submit */
9d76377f 4801 head->flags |= REQ_F_FAIL_LINK;
f67676d1 4802 goto err_req;
2d28390a 4803 }
9d76377f
PB
4804 trace_io_uring_link(ctx, req, head);
4805 list_add_tail(&req->link_list, &head->link_list);
32fe525b
PB
4806
4807 /* last request of a link, enqueue the link */
4808 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
4809 io_queue_link_head(head);
4810 *link = NULL;
4811 }
9e645e11 4812 } else {
711be031
PB
4813 if (unlikely(ctx->drain_next)) {
4814 req->flags |= REQ_F_IO_DRAIN;
4815 req->ctx->drain_next = 0;
4816 }
4817 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
4818 req->flags |= REQ_F_LINK;
711be031
PB
4819 INIT_LIST_HEAD(&req->link_list);
4820 ret = io_req_defer_prep(req, sqe);
4821 if (ret)
4822 req->flags |= REQ_F_FAIL_LINK;
4823 *link = req;
4824 } else {
4825 io_queue_sqe(req, sqe);
4826 }
9e645e11 4827 }
2e6e1fde 4828
75c6a039
JA
4829 if (old_creds)
4830 revert_creds(old_creds);
2e6e1fde 4831 return true;
9e645e11
JA
4832}
4833
9a56a232
JA
4834/*
4835 * Batched submission is done, ensure local IO is flushed out.
4836 */
4837static void io_submit_state_end(struct io_submit_state *state)
4838{
4839 blk_finish_plug(&state->plug);
3d6770fb 4840 io_file_put(state);
2579f913
JA
4841 if (state->free_reqs)
4842 kmem_cache_free_bulk(req_cachep, state->free_reqs,
4843 &state->reqs[state->cur_req]);
9a56a232
JA
4844}
4845
4846/*
4847 * Start submission side cache.
4848 */
4849static void io_submit_state_start(struct io_submit_state *state,
22efde59 4850 unsigned int max_ios)
9a56a232
JA
4851{
4852 blk_start_plug(&state->plug);
2579f913 4853 state->free_reqs = 0;
9a56a232
JA
4854 state->file = NULL;
4855 state->ios_left = max_ios;
4856}
4857
2b188cc1
JA
4858static void io_commit_sqring(struct io_ring_ctx *ctx)
4859{
75b28aff 4860 struct io_rings *rings = ctx->rings;
2b188cc1 4861
caf582c6
PB
4862 /*
4863 * Ensure any loads from the SQEs are done at this point,
4864 * since once we write the new head, the application could
4865 * write new data to them.
4866 */
4867 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
4868}
4869
2b188cc1 4870/*
3529d8c2 4871 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
4872 * that is mapped by userspace. This means that care needs to be taken to
4873 * ensure that reads are stable, as we cannot rely on userspace always
4874 * being a good citizen. If members of the sqe are validated and then later
4875 * used, it's important that those reads are done through READ_ONCE() to
4876 * prevent a re-load down the line.
4877 */
3529d8c2
JA
4878static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
4879 const struct io_uring_sqe **sqe_ptr)
2b188cc1 4880{
75b28aff 4881 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
4882 unsigned head;
4883
4884 /*
4885 * The cached sq head (or cq tail) serves two purposes:
4886 *
4887 * 1) allows us to batch the cost of updating the user visible
4888 * head.
4889 * 2) allows the kernel side to track the head on its own, even
4890 * though the application is the one updating it.
4891 */
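	/*
	 * sq_array holds indexes into the SQE array; mask the cached head
	 * with sq_mask to wrap it to a valid ring slot before reading the
	 * application-written index.
	 */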
ee7d46d9 4892 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
9835d6fa 4893 if (likely(head < ctx->sq_entries)) {
cf6fd4bd
PB
4894 /*
4895 * All IO needs to record the previous position; for LINK vs DRAIN,
4896 * it can be used to mark the position of the first IO in the
4897 * link list.
4898 */
4899 req->sequence = ctx->cached_sq_head;
3529d8c2
JA
4900 *sqe_ptr = &ctx->sq_sqes[head];
4901 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
4902 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
2b188cc1
JA
4903 ctx->cached_sq_head++;
4904 return true;
4905 }
4906
4907 /* drop invalid entries */
4908 ctx->cached_sq_head++;
498ccd9e 4909 ctx->cached_sq_dropped++;
ee7d46d9 4910 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1
JA
4911 return false;
4912}
4913
fb5ccc98 4914static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
ae9428ca
PB
4915 struct file *ring_file, int ring_fd,
4916 struct mm_struct **mm, bool async)
6c271ce2
JA
4917{
4918 struct io_submit_state state, *statep = NULL;
9e645e11 4919 struct io_kiocb *link = NULL;
9e645e11 4920 int i, submitted = 0;
95a1b3ff 4921 bool mm_fault = false;
6c271ce2 4922
c4a2ed72 4923 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8
JA
4924 if (test_bit(0, &ctx->sq_check_overflow)) {
4925 if (!list_empty(&ctx->cq_overflow_list) &&
4926 !io_cqring_overflow_flush(ctx, false))
4927 return -EBUSY;
4928 }
6c271ce2 4929
ee7d46d9
PB
4930 /* make sure SQ entry isn't read before tail */
4931 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 4932
2b85edfc
PB
4933 if (!percpu_ref_tryget_many(&ctx->refs, nr))
4934 return -EAGAIN;
6c271ce2
JA
4935
4936 if (nr > IO_PLUG_THRESHOLD) {
22efde59 4937 io_submit_state_start(&state, nr);
6c271ce2
JA
4938 statep = &state;
4939 }
4940
b14cca0c
PB
4941 ctx->ring_fd = ring_fd;
4942 ctx->ring_file = ring_file;
4943
6c271ce2 4944 for (i = 0; i < nr; i++) {
3529d8c2 4945 const struct io_uring_sqe *sqe;
196be95c 4946 struct io_kiocb *req;
fb5ccc98 4947
196be95c
PB
4948 req = io_get_req(ctx, statep);
4949 if (unlikely(!req)) {
4950 if (!submitted)
4951 submitted = -EAGAIN;
fb5ccc98 4952 break;
196be95c 4953 }
3529d8c2 4954 if (!io_get_sqring(ctx, req, &sqe)) {
2b85edfc 4955 __io_req_do_free(req);
196be95c
PB
4956 break;
4957 }
fb5ccc98 4958
d3656344
JA
4959 /* will complete beyond this point, count as submitted */
4960 submitted++;
4961
4962 if (unlikely(req->opcode >= IORING_OP_LAST)) {
4963 io_cqring_add_event(req, -EINVAL);
4964 io_double_put_req(req);
196be95c
PB
4965 break;
4966 }
fb5ccc98 4967
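		/*
		 * Opcodes that access user memory need a user mm. When none is
		 * attached yet (e.g. on the SQ poll thread), adopt the
		 * context's sqo_mm on first use; if mmget fails, mm_fault
		 * stays set and requests proceed without a usable mm.
		 */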
d3656344 4968 if (io_op_defs[req->opcode].needs_mm && !*mm) {
95a1b3ff
PB
4969 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
4970 if (!mm_fault) {
4971 use_mm(ctx->sqo_mm);
4972 *mm = ctx->sqo_mm;
4973 }
9e645e11 4974 }
9e645e11 4975
cf6fd4bd
PB
4976 req->has_user = *mm != NULL;
4977 req->in_async = async;
4978 req->needs_fixed_file = async;
354420f7
JA
4979 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
4980 true, async);
3529d8c2 4981 if (!io_submit_sqe(req, sqe, statep, &link))
2e6e1fde 4982 break;
6c271ce2
JA
4983 }
4984
9466f437
PB
4985 if (unlikely(submitted != nr)) {
4986 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
4987
4988 percpu_ref_put_many(&ctx->refs, nr - ref_used);
4989 }
9e645e11 4990 if (link)
1b4a51b6 4991 io_queue_link_head(link);
6c271ce2
JA
4992 if (statep)
4993 io_submit_state_end(&state);
4994
ae9428ca
PB
4995 /* Commit SQ ring head once we've consumed and submitted all SQEs */
4996 io_commit_sqring(ctx);
4997
6c271ce2
JA
4998 return submitted;
4999}
5000
5001static int io_sq_thread(void *data)
5002{
6c271ce2
JA
5003 struct io_ring_ctx *ctx = data;
5004 struct mm_struct *cur_mm = NULL;
181e448d 5005 const struct cred *old_cred;
6c271ce2
JA
5006 mm_segment_t old_fs;
5007 DEFINE_WAIT(wait);
5008 unsigned inflight;
5009 unsigned long timeout;
c1edbf5f 5010 int ret;
6c271ce2 5011
206aefde 5012 complete(&ctx->completions[1]);
a4c0b3de 5013
6c271ce2
JA
5014 old_fs = get_fs();
5015 set_fs(USER_DS);
181e448d 5016 old_cred = override_creds(ctx->creds);
6c271ce2 5017
c1edbf5f 5018 ret = timeout = inflight = 0;
2bbcd6d3 5019 while (!kthread_should_park()) {
fb5ccc98 5020 unsigned int to_submit;
6c271ce2
JA
5021
5022 if (inflight) {
5023 unsigned nr_events = 0;
5024
5025 if (ctx->flags & IORING_SETUP_IOPOLL) {
2b2ed975
JA
5026 /*
5027 * inflight is the count of the maximum possible
5028 * entries we submitted, but it can be smaller
5029 * if we dropped some of them. If we don't have
5030 * poll entries available, then we know that we
5031 * have nothing left to poll for. Reset the
5032 * inflight count to zero in that case.
5033 */
5034 mutex_lock(&ctx->uring_lock);
5035 if (!list_empty(&ctx->poll_list))
5036 __io_iopoll_check(ctx, &nr_events, 0);
5037 else
5038 inflight = 0;
5039 mutex_unlock(&ctx->uring_lock);
6c271ce2
JA
5040 } else {
5041 /*
5042 * Normal IO, just pretend everything completed.
5043 * We don't have to poll completions for that.
5044 */
5045 nr_events = inflight;
5046 }
5047
5048 inflight -= nr_events;
5049 if (!inflight)
5050 timeout = jiffies + ctx->sq_thread_idle;
5051 }
5052
fb5ccc98 5053 to_submit = io_sqring_entries(ctx);
c1edbf5f
JA
5054
5055 /*
5056 * If submit got -EBUSY, flag us as needing the application
5057 * to enter the kernel to reap and flush events.
5058 */
5059 if (!to_submit || ret == -EBUSY) {
6c271ce2
JA
5060 /*
5061 * We're polling. If we're within the defined idle
5062 * period, then let us spin without work before going
c1edbf5f
JA
5063 * to sleep. The exception is if we got EBUSY doing
5064 * more IO; in that case we should wait for the application to
5065 * reap events and wake us up.
6c271ce2 5066 */
c1edbf5f
JA
5067 if (inflight ||
5068 (!time_after(jiffies, timeout) && ret != -EBUSY)) {
9831a90c 5069 cond_resched();
6c271ce2
JA
5070 continue;
5071 }
5072
5073 /*
5074 * Drop cur_mm before scheduling; we can't hold it for
5075 * long periods (or over schedule()). Do this before
5076 * adding ourselves to the waitqueue, as the unuse/drop
5077 * may sleep.
5078 */
5079 if (cur_mm) {
5080 unuse_mm(cur_mm);
5081 mmput(cur_mm);
5082 cur_mm = NULL;
5083 }
5084
5085 prepare_to_wait(&ctx->sqo_wait, &wait,
5086 TASK_INTERRUPTIBLE);
5087
5088 /* Tell userspace we may need a wakeup call */
75b28aff 5089 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69
SB
5090 /* make sure to read SQ tail after writing flags */
5091 smp_mb();
6c271ce2 5092
fb5ccc98 5093 to_submit = io_sqring_entries(ctx);
c1edbf5f 5094 if (!to_submit || ret == -EBUSY) {
2bbcd6d3 5095 if (kthread_should_park()) {
6c271ce2
JA
5096 finish_wait(&ctx->sqo_wait, &wait);
5097 break;
5098 }
5099 if (signal_pending(current))
5100 flush_signals(current);
5101 schedule();
5102 finish_wait(&ctx->sqo_wait, &wait);
5103
75b28aff 5104 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5105 continue;
5106 }
5107 finish_wait(&ctx->sqo_wait, &wait);
5108
75b28aff 5109 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5110 }
5111
8a4955ff 5112 mutex_lock(&ctx->uring_lock);
1d7bb1d5 5113 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
8a4955ff 5114 mutex_unlock(&ctx->uring_lock);
1d7bb1d5
JA
5115 if (ret > 0)
5116 inflight += ret;
6c271ce2
JA
5117 }
5118
5119 set_fs(old_fs);
5120 if (cur_mm) {
5121 unuse_mm(cur_mm);
5122 mmput(cur_mm);
5123 }
181e448d 5124 revert_creds(old_cred);
06058632 5125
2bbcd6d3 5126 kthread_parkme();
06058632 5127
6c271ce2
JA
5128 return 0;
5129}
5130
bda52162
JA
5131struct io_wait_queue {
5132 struct wait_queue_entry wq;
5133 struct io_ring_ctx *ctx;
5134 unsigned to_wait;
5135 unsigned nr_timeouts;
5136};
5137
1d7bb1d5 5138static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
bda52162
JA
5139{
5140 struct io_ring_ctx *ctx = iowq->ctx;
5141
5142 /*
d195a66e 5143 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
5144 * started waiting. For timeouts, we always want to return to userspace,
5145 * regardless of event count.
5146 */
1d7bb1d5 5147 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
bda52162
JA
5148 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5149}
5150
5151static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5152 int wake_flags, void *key)
5153{
5154 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5155 wq);
5156
1d7bb1d5
JA
5157 /* use noflush == true, as we can't safely rely on locking context */
5158 if (!io_should_wake(iowq, true))
bda52162
JA
5159 return -1;
5160
5161 return autoremove_wake_function(curr, mode, wake_flags, key);
5162}
5163
2b188cc1
JA
5164/*
5165 * Wait until events become available, if we don't already have some. The
5166 * application must reap them itself, as they reside on the shared cq ring.
5167 */
5168static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
5169 const sigset_t __user *sig, size_t sigsz)
5170{
bda52162
JA
5171 struct io_wait_queue iowq = {
5172 .wq = {
5173 .private = current,
5174 .func = io_wake_function,
5175 .entry = LIST_HEAD_INIT(iowq.wq.entry),
5176 },
5177 .ctx = ctx,
5178 .to_wait = min_events,
5179 };
75b28aff 5180 struct io_rings *rings = ctx->rings;
e9ffa5c2 5181 int ret = 0;
2b188cc1 5182
1d7bb1d5 5183 if (io_cqring_events(ctx, false) >= min_events)
2b188cc1
JA
5184 return 0;
5185
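	/*
	 * Install the caller-supplied signal mask for the duration of the
	 * wait; restore_saved_sigmask_unless() below puts the original mask
	 * back unless the wait was interrupted by a signal.
	 */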
5186 if (sig) {
9e75ad5d
AB
5187#ifdef CONFIG_COMPAT
5188 if (in_compat_syscall())
5189 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 5190 sigsz);
9e75ad5d
AB
5191 else
5192#endif
b772434b 5193 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 5194
2b188cc1
JA
5195 if (ret)
5196 return ret;
5197 }
5198
bda52162 5199 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 5200 trace_io_uring_cqring_wait(ctx, min_events);
bda52162
JA
5201 do {
5202 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
5203 TASK_INTERRUPTIBLE);
1d7bb1d5 5204 if (io_should_wake(&iowq, false))
bda52162
JA
5205 break;
5206 schedule();
5207 if (signal_pending(current)) {
e9ffa5c2 5208 ret = -EINTR;
bda52162
JA
5209 break;
5210 }
5211 } while (1);
5212 finish_wait(&ctx->wait, &iowq.wq);
5213
e9ffa5c2 5214 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 5215
75b28aff 5216 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
5217}
5218
6b06314c
JA
5219static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5220{
5221#if defined(CONFIG_UNIX)
5222 if (ctx->ring_sock) {
5223 struct sock *sock = ctx->ring_sock->sk;
5224 struct sk_buff *skb;
5225
5226 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
5227 kfree_skb(skb);
5228 }
5229#else
5230 int i;
5231
65e19f54
JA
5232 for (i = 0; i < ctx->nr_user_files; i++) {
5233 struct file *file;
5234
5235 file = io_file_from_index(ctx, i);
5236 if (file)
5237 fput(file);
5238 }
6b06314c
JA
5239#endif
5240}
5241
05f3fb3c
JA
5242static void io_file_ref_kill(struct percpu_ref *ref)
5243{
5244 struct fixed_file_data *data;
5245
5246 data = container_of(ref, struct fixed_file_data, refs);
5247 complete(&data->done);
5248}
5249
6b06314c
JA
5250static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
5251{
05f3fb3c 5252 struct fixed_file_data *data = ctx->file_data;
65e19f54
JA
5253 unsigned nr_tables, i;
5254
05f3fb3c 5255 if (!data)
6b06314c
JA
5256 return -ENXIO;
5257
05f3fb3c 5258 /* protect against inflight atomic switch, which drops the ref */
05f3fb3c 5259 percpu_ref_get(&data->refs);
e46a7950
JA
5260 /* wait for existing switches */
5261 flush_work(&data->ref_work);
05f3fb3c
JA
5262 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
5263 wait_for_completion(&data->done);
5264 percpu_ref_put(&data->refs);
e46a7950
JA
5265 /* flush potential new switch */
5266 flush_work(&data->ref_work);
05f3fb3c
JA
5267 percpu_ref_exit(&data->refs);
5268
6b06314c 5269 __io_sqe_files_unregister(ctx);
65e19f54
JA
5270 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
5271 for (i = 0; i < nr_tables; i++)
05f3fb3c
JA
5272 kfree(data->table[i].files);
5273 kfree(data->table);
5274 kfree(data);
5275 ctx->file_data = NULL;
6b06314c
JA
5276 ctx->nr_user_files = 0;
5277 return 0;
5278}
5279
6c271ce2
JA
5280static void io_sq_thread_stop(struct io_ring_ctx *ctx)
5281{
5282 if (ctx->sqo_thread) {
206aefde 5283 wait_for_completion(&ctx->completions[1]);
2bbcd6d3
RP
5284 /*
5285 * The park is a bit of a work-around; without it we get
5286 * warning spews on shutdown with SQPOLL set and affinity
5287 * set to a single CPU.
5288 */
06058632 5289 kthread_park(ctx->sqo_thread);
6c271ce2
JA
5290 kthread_stop(ctx->sqo_thread);
5291 ctx->sqo_thread = NULL;
5292 }
5293}
5294
6b06314c
JA
5295static void io_finish_async(struct io_ring_ctx *ctx)
5296{
6c271ce2
JA
5297 io_sq_thread_stop(ctx);
5298
561fb04a
JA
5299 if (ctx->io_wq) {
5300 io_wq_destroy(ctx->io_wq);
5301 ctx->io_wq = NULL;
6b06314c
JA
5302 }
5303}
5304
5305#if defined(CONFIG_UNIX)
6b06314c
JA
5306/*
5307 * Ensure the UNIX gc is aware of our file set, so we are certain that
5308 * the io_uring can be safely unregistered on process exit, even if we have
5309 * loops in the file referencing.
5310 */
5311static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
5312{
5313 struct sock *sk = ctx->ring_sock->sk;
5314 struct scm_fp_list *fpl;
5315 struct sk_buff *skb;
08a45173 5316 int i, nr_files;
6b06314c
JA
5317
5318 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
5319 unsigned long inflight = ctx->user->unix_inflight + nr;
5320
5321 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
5322 return -EMFILE;
5323 }
5324
5325 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
5326 if (!fpl)
5327 return -ENOMEM;
5328
5329 skb = alloc_skb(0, GFP_KERNEL);
5330 if (!skb) {
5331 kfree(fpl);
5332 return -ENOMEM;
5333 }
5334
5335 skb->sk = sk;
6b06314c 5336
08a45173 5337 nr_files = 0;
6b06314c
JA
5338 fpl->user = get_uid(ctx->user);
5339 for (i = 0; i < nr; i++) {
65e19f54
JA
5340 struct file *file = io_file_from_index(ctx, i + offset);
5341
5342 if (!file)
08a45173 5343 continue;
65e19f54 5344 fpl->fp[nr_files] = get_file(file);
08a45173
JA
5345 unix_inflight(fpl->user, fpl->fp[nr_files]);
5346 nr_files++;
6b06314c
JA
5347 }
5348
08a45173
JA
5349 if (nr_files) {
5350 fpl->max = SCM_MAX_FD;
5351 fpl->count = nr_files;
5352 UNIXCB(skb).fp = fpl;
05f3fb3c 5353 skb->destructor = unix_destruct_scm;
08a45173
JA
5354 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
5355 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 5356
08a45173
JA
5357 for (i = 0; i < nr_files; i++)
5358 fput(fpl->fp[i]);
5359 } else {
5360 kfree_skb(skb);
5361 kfree(fpl);
5362 }
6b06314c
JA
5363
5364 return 0;
5365}
5366
5367/*
5368 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
5369 * causes regular reference counting to break down. We rely on the UNIX
5370 * garbage collection to take care of this problem for us.
5371 */
5372static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5373{
5374 unsigned left, total;
5375 int ret = 0;
5376
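	/*
	 * A single SCM_RIGHTS skb can carry at most SCM_MAX_FD files, so
	 * walk the registered set and hand it to the UNIX gc in chunks.
	 */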
5377 total = 0;
5378 left = ctx->nr_user_files;
5379 while (left) {
5380 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
5381
5382 ret = __io_sqe_files_scm(ctx, this_files, total);
5383 if (ret)
5384 break;
5385 left -= this_files;
5386 total += this_files;
5387 }
5388
5389 if (!ret)
5390 return 0;
5391
5392 while (total < ctx->nr_user_files) {
65e19f54
JA
5393 struct file *file = io_file_from_index(ctx, total);
5394
5395 if (file)
5396 fput(file);
6b06314c
JA
5397 total++;
5398 }
5399
5400 return ret;
5401}
5402#else
5403static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5404{
5405 return 0;
5406}
5407#endif
5408
65e19f54
JA
5409static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5410 unsigned nr_files)
5411{
5412 int i;
5413
5414 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5415 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5416 unsigned this_files;
5417
5418 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5419 table->files = kcalloc(this_files, sizeof(struct file *),
5420 GFP_KERNEL);
5421 if (!table->files)
5422 break;
5423 nr_files -= this_files;
5424 }
5425
5426 if (i == nr_tables)
5427 return 0;
5428
5429 for (i = 0; i < nr_tables; i++) {
05f3fb3c 5430 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
5431 kfree(table->files);
5432 }
5433 return 1;
5434}
5435
05f3fb3c
JA
5436static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
5437{
5438#if defined(CONFIG_UNIX)
5439 struct sock *sock = ctx->ring_sock->sk;
5440 struct sk_buff_head list, *head = &sock->sk_receive_queue;
5441 struct sk_buff *skb;
5442 int i;
5443
5444 __skb_queue_head_init(&list);
5445
5446 /*
5447 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5448 * remove this entry and rearrange the file array.
5449 */
5450 skb = skb_dequeue(head);
5451 while (skb) {
5452 struct scm_fp_list *fp;
5453
5454 fp = UNIXCB(skb).fp;
5455 for (i = 0; i < fp->count; i++) {
5456 int left;
5457
5458 if (fp->fp[i] != file)
5459 continue;
5460
5461 unix_notinflight(fp->user, fp->fp[i]);
5462 left = fp->count - 1 - i;
5463 if (left) {
5464 memmove(&fp->fp[i], &fp->fp[i + 1],
5465 left * sizeof(struct file *));
5466 }
5467 fp->count--;
5468 if (!fp->count) {
5469 kfree_skb(skb);
5470 skb = NULL;
5471 } else {
5472 __skb_queue_tail(&list, skb);
5473 }
5474 fput(file);
5475 file = NULL;
5476 break;
5477 }
5478
5479 if (!file)
5480 break;
5481
5482 __skb_queue_tail(&list, skb);
5483
5484 skb = skb_dequeue(head);
5485 }
5486
5487 if (skb_peek(&list)) {
5488 spin_lock_irq(&head->lock);
5489 while ((skb = __skb_dequeue(&list)) != NULL)
5490 __skb_queue_tail(head, skb);
5491 spin_unlock_irq(&head->lock);
5492 }
5493#else
5494 fput(file);
5495#endif
5496}
5497
5498struct io_file_put {
5499 struct llist_node llist;
5500 struct file *file;
5501 struct completion *done;
5502};
5503
5504static void io_ring_file_ref_switch(struct work_struct *work)
65e19f54 5505{
05f3fb3c
JA
5506 struct io_file_put *pfile, *tmp;
5507 struct fixed_file_data *data;
5508 struct llist_node *node;
65e19f54 5509
05f3fb3c 5510 data = container_of(work, struct fixed_file_data, ref_work);
65e19f54 5511
05f3fb3c
JA
5512 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5513 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5514 io_ring_file_put(data->ctx, pfile->file);
5515 if (pfile->done)
5516 complete(pfile->done);
5517 else
5518 kfree(pfile);
5519 }
65e19f54
JA
5520 }
5521
05f3fb3c
JA
5522 percpu_ref_get(&data->refs);
5523 percpu_ref_switch_to_percpu(&data->refs);
5524}
65e19f54 5525
05f3fb3c
JA
5526static void io_file_data_ref_zero(struct percpu_ref *ref)
5527{
5528 struct fixed_file_data *data;
5529
5530 data = container_of(ref, struct fixed_file_data, refs);
5531
5532 /* we can't safely switch from inside this context, punt to wq */
5533 queue_work(system_wq, &data->ref_work);
65e19f54
JA
5534}
5535
6b06314c
JA
5536static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5537 unsigned nr_args)
5538{
5539 __s32 __user *fds = (__s32 __user *) arg;
65e19f54 5540 unsigned nr_tables;
05f3fb3c 5541 struct file *file;
6b06314c
JA
5542 int fd, ret = 0;
5543 unsigned i;
5544
05f3fb3c 5545 if (ctx->file_data)
6b06314c
JA
5546 return -EBUSY;
5547 if (!nr_args)
5548 return -EINVAL;
5549 if (nr_args > IORING_MAX_FIXED_FILES)
5550 return -EMFILE;
5551
05f3fb3c
JA
5552 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5553 if (!ctx->file_data)
5554 return -ENOMEM;
5555 ctx->file_data->ctx = ctx;
5556 init_completion(&ctx->file_data->done);
5557
65e19f54 5558 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
05f3fb3c
JA
5559 ctx->file_data->table = kcalloc(nr_tables,
5560 sizeof(struct fixed_file_table),
65e19f54 5561 GFP_KERNEL);
05f3fb3c
JA
5562 if (!ctx->file_data->table) {
5563 kfree(ctx->file_data);
5564 ctx->file_data = NULL;
6b06314c 5565 return -ENOMEM;
05f3fb3c
JA
5566 }
5567
5568 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
5569 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
5570 kfree(ctx->file_data->table);
5571 kfree(ctx->file_data);
5572 ctx->file_data = NULL;
6b06314c 5573 return -ENOMEM;
05f3fb3c
JA
5574 }
5575 ctx->file_data->put_llist.first = NULL;
5576 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6b06314c 5577
65e19f54 5578 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
05f3fb3c
JA
5579 percpu_ref_exit(&ctx->file_data->refs);
5580 kfree(ctx->file_data->table);
5581 kfree(ctx->file_data);
5582 ctx->file_data = NULL;
65e19f54
JA
5583 return -ENOMEM;
5584 }
5585
08a45173 5586 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
65e19f54
JA
5587 struct fixed_file_table *table;
5588 unsigned index;
5589
6b06314c
JA
5590 ret = -EFAULT;
5591 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
5592 break;
08a45173
JA
5593 /* allow sparse sets */
5594 if (fd == -1) {
5595 ret = 0;
5596 continue;
5597 }
6b06314c 5598
05f3fb3c 5599 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54 5600 index = i & IORING_FILE_TABLE_MASK;
05f3fb3c 5601 file = fget(fd);
6b06314c
JA
5602
5603 ret = -EBADF;
05f3fb3c 5604 if (!file)
6b06314c 5605 break;
05f3fb3c 5606
6b06314c
JA
5607 /*
5608 * Don't allow io_uring instances to be registered. If UNIX
5609 * isn't enabled, then this causes a reference cycle and this
5610 * instance can never get freed. If UNIX is enabled we'll
5611 * handle it just fine, but there's still no point in allowing
5612 * a ring fd as it doesn't support regular read/write anyway.
5613 */
05f3fb3c
JA
5614 if (file->f_op == &io_uring_fops) {
5615 fput(file);
6b06314c
JA
5616 break;
5617 }
6b06314c 5618 ret = 0;
05f3fb3c 5619 table->files[index] = file;
6b06314c
JA
5620 }
5621
5622 if (ret) {
65e19f54 5623 for (i = 0; i < ctx->nr_user_files; i++) {
65e19f54
JA
5624 file = io_file_from_index(ctx, i);
5625 if (file)
5626 fput(file);
5627 }
5628 for (i = 0; i < nr_tables; i++)
05f3fb3c 5629 kfree(ctx->file_data->table[i].files);
6b06314c 5630
05f3fb3c
JA
5631 kfree(ctx->file_data->table);
5632 kfree(ctx->file_data);
5633 ctx->file_data = NULL;
6b06314c
JA
5634 ctx->nr_user_files = 0;
5635 return ret;
5636 }
5637
5638 ret = io_sqe_files_scm(ctx);
5639 if (ret)
5640 io_sqe_files_unregister(ctx);
5641
5642 return ret;
5643}
5644
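/*
 * Userspace sketch (not part of the kernel source): registering a fixed
 * file set with io_uring_register(2), matching io_sqe_files_register()
 * above.  Sparse entries are allowed by passing -1.  ring_fd, fd_a and
 * fd_b are assumed to be already-open descriptors; real applications
 * would usually go through liburing rather than raw syscall(2).
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/io_uring.h>
 *
 *	int fds[3] = { fd_a, fd_b, -1 };	// slot 2 left sparse
 *
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_FILES, fds, 3);
 *	if (ret < 0)
 *		perror("IORING_REGISTER_FILES");
 *
 * An sqe can then reference slot 0 or 1 by setting IOSQE_FIXED_FILE in
 * sqe->flags and putting the slot index in sqe->fd.
 */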
c3a31e60
JA
5645static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
5646 int index)
5647{
5648#if defined(CONFIG_UNIX)
5649 struct sock *sock = ctx->ring_sock->sk;
5650 struct sk_buff_head *head = &sock->sk_receive_queue;
5651 struct sk_buff *skb;
5652
5653 /*
5654 * See if we can merge this file into an existing skb SCM_RIGHTS
5655 * file set. If there's no room, fall back to allocating a new skb
5656 * and filling it in.
5657 */
5658 spin_lock_irq(&head->lock);
5659 skb = skb_peek(head);
5660 if (skb) {
5661 struct scm_fp_list *fpl = UNIXCB(skb).fp;
5662
5663 if (fpl->count < SCM_MAX_FD) {
5664 __skb_unlink(skb, head);
5665 spin_unlock_irq(&head->lock);
5666 fpl->fp[fpl->count] = get_file(file);
5667 unix_inflight(fpl->user, fpl->fp[fpl->count]);
5668 fpl->count++;
5669 spin_lock_irq(&head->lock);
5670 __skb_queue_head(head, skb);
5671 } else {
5672 skb = NULL;
5673 }
5674 }
5675 spin_unlock_irq(&head->lock);
5676
5677 if (skb) {
5678 fput(file);
5679 return 0;
5680 }
5681
5682 return __io_sqe_files_scm(ctx, 1, index);
5683#else
5684 return 0;
5685#endif
5686}
5687
05f3fb3c 5688static void io_atomic_switch(struct percpu_ref *ref)
c3a31e60 5689{
05f3fb3c
JA
5690 struct fixed_file_data *data;
5691
5692 data = container_of(ref, struct fixed_file_data, refs);
5693 clear_bit(FFD_F_ATOMIC, &data->state);
5694}
5695
5696static bool io_queue_file_removal(struct fixed_file_data *data,
5697 struct file *file)
5698{
5699 struct io_file_put *pfile, pfile_stack;
5700 DECLARE_COMPLETION_ONSTACK(done);
5701
5702 /*
5703	 * If we fail allocating the struct we need for doing async removal
5704 * of this file, just punt to sync and wait for it.
5705 */
5706 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
5707 if (!pfile) {
5708 pfile = &pfile_stack;
5709 pfile->done = &done;
5710 }
5711
5712 pfile->file = file;
5713 llist_add(&pfile->llist, &data->put_llist);
5714
5715 if (pfile == &pfile_stack) {
5716 if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5717 percpu_ref_put(&data->refs);
5718 percpu_ref_switch_to_atomic(&data->refs,
5719 io_atomic_switch);
5720 }
5721 wait_for_completion(&done);
5722 flush_work(&data->ref_work);
5723 return false;
5724 }
5725
5726 return true;
5727}
5728
5729static int __io_sqe_files_update(struct io_ring_ctx *ctx,
5730 struct io_uring_files_update *up,
5731 unsigned nr_args)
5732{
5733 struct fixed_file_data *data = ctx->file_data;
5734 bool ref_switch = false;
5735 struct file *file;
c3a31e60
JA
5736 __s32 __user *fds;
5737 int fd, i, err;
5738 __u32 done;
5739
05f3fb3c 5740 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
5741 return -EOVERFLOW;
5742 if (done > ctx->nr_user_files)
5743 return -EINVAL;
5744
5745 done = 0;
05f3fb3c 5746 fds = u64_to_user_ptr(up->fds);
c3a31e60 5747 while (nr_args) {
65e19f54
JA
5748 struct fixed_file_table *table;
5749 unsigned index;
5750
c3a31e60
JA
5751 err = 0;
5752 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
5753 err = -EFAULT;
5754 break;
5755 }
05f3fb3c
JA
5756 i = array_index_nospec(up->offset, ctx->nr_user_files);
5757 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54
JA
5758 index = i & IORING_FILE_TABLE_MASK;
5759 if (table->files[index]) {
05f3fb3c 5760 file = io_file_from_index(ctx, index);
65e19f54 5761 table->files[index] = NULL;
05f3fb3c
JA
5762 if (io_queue_file_removal(data, file))
5763 ref_switch = true;
c3a31e60
JA
5764 }
5765 if (fd != -1) {
c3a31e60
JA
5766 file = fget(fd);
5767 if (!file) {
5768 err = -EBADF;
5769 break;
5770 }
5771 /*
5772 * Don't allow io_uring instances to be registered. If
5773 * UNIX isn't enabled, then this causes a reference
5774 * cycle and this instance can never get freed. If UNIX
5775 * is enabled we'll handle it just fine, but there's
5776 * still no point in allowing a ring fd as it doesn't
5777 * support regular read/write anyway.
5778 */
5779 if (file->f_op == &io_uring_fops) {
5780 fput(file);
5781 err = -EBADF;
5782 break;
5783 }
65e19f54 5784 table->files[index] = file;
c3a31e60
JA
5785 err = io_sqe_file_register(ctx, file, i);
5786 if (err)
5787 break;
5788 }
5789 nr_args--;
5790 done++;
05f3fb3c
JA
5791 up->offset++;
5792 }
5793
5794 if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5795 percpu_ref_put(&data->refs);
5796 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
c3a31e60
JA
5797 }
5798
5799 return done ? done : err;
5800}
05f3fb3c
JA
5801static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
5802 unsigned nr_args)
5803{
5804 struct io_uring_files_update up;
5805
5806 if (!ctx->file_data)
5807 return -ENXIO;
5808 if (!nr_args)
5809 return -EINVAL;
5810 if (copy_from_user(&up, arg, sizeof(up)))
5811 return -EFAULT;
5812 if (up.resv)
5813 return -EINVAL;
5814
5815 return __io_sqe_files_update(ctx, &up, nr_args);
5816}
c3a31e60 5817
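/*
 * Userspace sketch (not part of the kernel source): replacing slot 1 of a
 * registered file set via IORING_REGISTER_FILES_UPDATE, which ends up in
 * io_sqe_files_update() above.  ring_fd and new_fd are assumed to be
 * already-open descriptors; passing -1 instead would simply clear the slot.
 *
 *	struct io_uring_files_update up;
 *	int new_fds[1] = { new_fd };
 *
 *	memset(&up, 0, sizeof(up));
 *	up.offset = 1;				// first slot to update
 *	up.fds = (unsigned long) new_fds;	// user pointer, stored as u64
 *
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_FILES_UPDATE, &up, 1);
 *	// on success, ret is the number of slots updated (1 here)
 */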
7d723065
JA
5818static void io_put_work(struct io_wq_work *work)
5819{
5820 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5821
5822 io_put_req(req);
5823}
5824
5825static void io_get_work(struct io_wq_work *work)
5826{
5827 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5828
5829 refcount_inc(&req->refs);
5830}
5831
24369c2e
PB
5832static int io_init_wq_offload(struct io_ring_ctx *ctx,
5833 struct io_uring_params *p)
5834{
5835 struct io_wq_data data;
5836 struct fd f;
5837 struct io_ring_ctx *ctx_attach;
5838 unsigned int concurrency;
5839 int ret = 0;
5840
5841 data.user = ctx->user;
5842 data.get_work = io_get_work;
5843 data.put_work = io_put_work;
5844
5845 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
5846		/* Do QD, or 4 * CPUS, whichever is smaller */
5847 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
5848
5849 ctx->io_wq = io_wq_create(concurrency, &data);
5850 if (IS_ERR(ctx->io_wq)) {
5851 ret = PTR_ERR(ctx->io_wq);
5852 ctx->io_wq = NULL;
5853 }
5854 return ret;
5855 }
5856
5857 f = fdget(p->wq_fd);
5858 if (!f.file)
5859 return -EBADF;
5860
5861 if (f.file->f_op != &io_uring_fops) {
5862 ret = -EINVAL;
5863 goto out_fput;
5864 }
5865
5866 ctx_attach = f.file->private_data;
5867 /* @io_wq is protected by holding the fd */
5868 if (!io_wq_get(ctx_attach->io_wq, &data)) {
5869 ret = -EINVAL;
5870 goto out_fput;
5871 }
5872
5873 ctx->io_wq = ctx_attach->io_wq;
5874out_fput:
5875 fdput(f);
5876 return ret;
5877}
5878
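/*
 * Userspace sketch (not part of the kernel source): sharing the async io-wq
 * backend between two rings with IORING_SETUP_ATTACH_WQ, which is what
 * io_init_wq_offload() above implements.  ring_a_fd is assumed to be an fd
 * returned by an earlier io_uring_setup(2) call.
 *
 *	struct io_uring_params p;
 *
 *	memset(&p, 0, sizeof(p));
 *	p.flags = IORING_SETUP_ATTACH_WQ;
 *	p.wq_fd = ring_a_fd;		// attach to ring A's io-wq
 *
 *	int ring_b_fd = syscall(__NR_io_uring_setup, 64, &p);
 *	if (ring_b_fd < 0)
 *		perror("io_uring_setup");
 */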
6c271ce2
JA
5879static int io_sq_offload_start(struct io_ring_ctx *ctx,
5880 struct io_uring_params *p)
2b188cc1
JA
5881{
5882 int ret;
5883
6c271ce2 5884 init_waitqueue_head(&ctx->sqo_wait);
2b188cc1
JA
5885 mmgrab(current->mm);
5886 ctx->sqo_mm = current->mm;
5887
6c271ce2 5888 if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d1
JA
5889 ret = -EPERM;
5890 if (!capable(CAP_SYS_ADMIN))
5891 goto err;
5892
917257da
JA
5893 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
5894 if (!ctx->sq_thread_idle)
5895 ctx->sq_thread_idle = HZ;
5896
6c271ce2 5897 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 5898 int cpu = p->sq_thread_cpu;
6c271ce2 5899
917257da 5900 ret = -EINVAL;
44a9bd18
JA
5901 if (cpu >= nr_cpu_ids)
5902 goto err;
7889f44d 5903 if (!cpu_online(cpu))
917257da
JA
5904 goto err;
5905
6c271ce2
JA
5906 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
5907 ctx, cpu,
5908 "io_uring-sq");
5909 } else {
5910 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
5911 "io_uring-sq");
5912 }
5913 if (IS_ERR(ctx->sqo_thread)) {
5914 ret = PTR_ERR(ctx->sqo_thread);
5915 ctx->sqo_thread = NULL;
5916 goto err;
5917 }
5918 wake_up_process(ctx->sqo_thread);
5919 } else if (p->flags & IORING_SETUP_SQ_AFF) {
5920 /* Can't have SQ_AFF without SQPOLL */
5921 ret = -EINVAL;
5922 goto err;
5923 }
5924
24369c2e
PB
5925 ret = io_init_wq_offload(ctx, p);
5926 if (ret)
2b188cc1 5927 goto err;
2b188cc1
JA
5928
5929 return 0;
5930err:
54a91f3b 5931 io_finish_async(ctx);
2b188cc1
JA
5932 mmdrop(ctx->sqo_mm);
5933 ctx->sqo_mm = NULL;
5934 return ret;
5935}
5936
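/*
 * Userspace sketch (not part of the kernel source): requesting an SQ poll
 * thread, as set up by io_sq_offload_start() above.  Note the restrictions
 * enforced there: SQPOLL requires CAP_SYS_ADMIN, and IORING_SETUP_SQ_AFF is
 * only valid together with SQPOLL.
 *
 *	struct io_uring_params p;
 *
 *	memset(&p, 0, sizeof(p));
 *	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
 *	p.sq_thread_idle = 1000;	// ms before the io_uring-sq thread idles
 *	p.sq_thread_cpu = 2;		// pin the thread to CPU 2
 *
 *	int ring_fd = syscall(__NR_io_uring_setup, 128, &p);
 */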
5937static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
5938{
5939 atomic_long_sub(nr_pages, &user->locked_vm);
5940}
5941
5942static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
5943{
5944 unsigned long page_limit, cur_pages, new_pages;
5945
5946 /* Don't allow more pages than we can safely lock */
5947 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
5948
5949 do {
5950 cur_pages = atomic_long_read(&user->locked_vm);
5951 new_pages = cur_pages + nr_pages;
5952 if (new_pages > page_limit)
5953 return -ENOMEM;
5954 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
5955 new_pages) != cur_pages);
5956
5957 return 0;
5958}
5959
5960static void io_mem_free(void *ptr)
5961{
52e04ef4
MR
5962 struct page *page;
5963
5964 if (!ptr)
5965 return;
2b188cc1 5966
52e04ef4 5967 page = virt_to_head_page(ptr);
2b188cc1
JA
5968 if (put_page_testzero(page))
5969 free_compound_page(page);
5970}
5971
5972static void *io_mem_alloc(size_t size)
5973{
5974 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
5975 __GFP_NORETRY;
5976
5977 return (void *) __get_free_pages(gfp_flags, get_order(size));
5978}
5979
75b28aff
HV
5980static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
5981 size_t *sq_offset)
5982{
5983 struct io_rings *rings;
5984 size_t off, sq_array_size;
5985
5986 off = struct_size(rings, cqes, cq_entries);
5987 if (off == SIZE_MAX)
5988 return SIZE_MAX;
5989
5990#ifdef CONFIG_SMP
5991 off = ALIGN(off, SMP_CACHE_BYTES);
5992 if (off == 0)
5993 return SIZE_MAX;
5994#endif
5995
5996 sq_array_size = array_size(sizeof(u32), sq_entries);
5997 if (sq_array_size == SIZE_MAX)
5998 return SIZE_MAX;
5999
6000 if (check_add_overflow(off, sq_array_size, &off))
6001 return SIZE_MAX;
6002
6003 if (sq_offset)
6004 *sq_offset = off;
6005
6006 return off;
6007}
6008
2b188cc1
JA
6009static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6010{
75b28aff 6011 size_t pages;
2b188cc1 6012
75b28aff
HV
6013 pages = (size_t)1 << get_order(
6014 rings_size(sq_entries, cq_entries, NULL));
6015 pages += (size_t)1 << get_order(
6016 array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1 6017
75b28aff 6018 return pages;
2b188cc1
JA
6019}
6020
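/*
 * Worked example (not part of the kernel source), assuming 4K pages,
 * 16-byte CQEs and 64-byte SQEs: for io_uring_setup(128, ...) with the
 * default CQ sizing (cq_entries = 256), rings_size() is roughly the
 * io_rings header plus 256 * 16 bytes of CQEs plus 128 * 4 bytes of SQ
 * array, a bit over one page, which rounds up to 2 pages; the SQE array is
 * 128 * 64 = 8192 bytes, exactly 2 more pages.  ring_pages() would
 * therefore charge 4 pages against RLIMIT_MEMLOCK for such a ring.
 */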
edafccee
JA
6021static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6022{
6023 int i, j;
6024
6025 if (!ctx->user_bufs)
6026 return -ENXIO;
6027
6028 for (i = 0; i < ctx->nr_user_bufs; i++) {
6029 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6030
6031 for (j = 0; j < imu->nr_bvecs; j++)
27c4d3a3 6032 put_user_page(imu->bvec[j].bv_page);
edafccee
JA
6033
6034 if (ctx->account_mem)
6035 io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef6475 6036 kvfree(imu->bvec);
edafccee
JA
6037 imu->nr_bvecs = 0;
6038 }
6039
6040 kfree(ctx->user_bufs);
6041 ctx->user_bufs = NULL;
6042 ctx->nr_user_bufs = 0;
6043 return 0;
6044}
6045
6046static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
6047 void __user *arg, unsigned index)
6048{
6049 struct iovec __user *src;
6050
6051#ifdef CONFIG_COMPAT
6052 if (ctx->compat) {
6053 struct compat_iovec __user *ciovs;
6054 struct compat_iovec ciov;
6055
6056 ciovs = (struct compat_iovec __user *) arg;
6057 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
6058 return -EFAULT;
6059
d55e5f5b 6060 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
6061 dst->iov_len = ciov.iov_len;
6062 return 0;
6063 }
6064#endif
6065 src = (struct iovec __user *) arg;
6066 if (copy_from_user(dst, &src[index], sizeof(*dst)))
6067 return -EFAULT;
6068 return 0;
6069}
6070
6071static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
6072 unsigned nr_args)
6073{
6074 struct vm_area_struct **vmas = NULL;
6075 struct page **pages = NULL;
6076 int i, j, got_pages = 0;
6077 int ret = -EINVAL;
6078
6079 if (ctx->user_bufs)
6080 return -EBUSY;
6081 if (!nr_args || nr_args > UIO_MAXIOV)
6082 return -EINVAL;
6083
6084 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
6085 GFP_KERNEL);
6086 if (!ctx->user_bufs)
6087 return -ENOMEM;
6088
6089 for (i = 0; i < nr_args; i++) {
6090 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6091 unsigned long off, start, end, ubuf;
6092 int pret, nr_pages;
6093 struct iovec iov;
6094 size_t size;
6095
6096 ret = io_copy_iov(ctx, &iov, arg, i);
6097 if (ret)
a278682d 6098 goto err;
edafccee
JA
6099
6100 /*
6101 * Don't impose further limits on the size and buffer
6102		 * constraints here; we'll return -EINVAL later when IO is
6103		 * submitted if they are wrong.
6104 */
6105 ret = -EFAULT;
6106 if (!iov.iov_base || !iov.iov_len)
6107 goto err;
6108
6109 /* arbitrary limit, but we need something */
6110 if (iov.iov_len > SZ_1G)
6111 goto err;
6112
6113 ubuf = (unsigned long) iov.iov_base;
6114 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6115 start = ubuf >> PAGE_SHIFT;
6116 nr_pages = end - start;
6117
6118 if (ctx->account_mem) {
6119 ret = io_account_mem(ctx->user, nr_pages);
6120 if (ret)
6121 goto err;
6122 }
6123
6124 ret = 0;
6125 if (!pages || nr_pages > got_pages) {
6126 kfree(vmas);
6127 kfree(pages);
d4ef6475 6128 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee 6129 GFP_KERNEL);
d4ef6475 6130 vmas = kvmalloc_array(nr_pages,
edafccee
JA
6131 sizeof(struct vm_area_struct *),
6132 GFP_KERNEL);
6133 if (!pages || !vmas) {
6134 ret = -ENOMEM;
6135 if (ctx->account_mem)
6136 io_unaccount_mem(ctx->user, nr_pages);
6137 goto err;
6138 }
6139 got_pages = nr_pages;
6140 }
6141
d4ef6475 6142 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee
JA
6143 GFP_KERNEL);
6144 ret = -ENOMEM;
6145 if (!imu->bvec) {
6146 if (ctx->account_mem)
6147 io_unaccount_mem(ctx->user, nr_pages);
6148 goto err;
6149 }
6150
6151 ret = 0;
6152 down_read(&current->mm->mmap_sem);
932f4a63
IW
6153 pret = get_user_pages(ubuf, nr_pages,
6154 FOLL_WRITE | FOLL_LONGTERM,
6155 pages, vmas);
edafccee
JA
6156 if (pret == nr_pages) {
6157 /* don't support file backed memory */
6158 for (j = 0; j < nr_pages; j++) {
6159 struct vm_area_struct *vma = vmas[j];
6160
6161 if (vma->vm_file &&
6162 !is_file_hugepages(vma->vm_file)) {
6163 ret = -EOPNOTSUPP;
6164 break;
6165 }
6166 }
6167 } else {
6168 ret = pret < 0 ? pret : -EFAULT;
6169 }
6170 up_read(&current->mm->mmap_sem);
6171 if (ret) {
6172 /*
6173			 * If we did a partial map, or found file-backed vmas,
6174			 * release any pages we did get.
6175 */
27c4d3a3
JH
6176 if (pret > 0)
6177 put_user_pages(pages, pret);
edafccee
JA
6178 if (ctx->account_mem)
6179 io_unaccount_mem(ctx->user, nr_pages);
d4ef6475 6180 kvfree(imu->bvec);
edafccee
JA
6181 goto err;
6182 }
6183
6184 off = ubuf & ~PAGE_MASK;
6185 size = iov.iov_len;
6186 for (j = 0; j < nr_pages; j++) {
6187 size_t vec_len;
6188
6189 vec_len = min_t(size_t, size, PAGE_SIZE - off);
6190 imu->bvec[j].bv_page = pages[j];
6191 imu->bvec[j].bv_len = vec_len;
6192 imu->bvec[j].bv_offset = off;
6193 off = 0;
6194 size -= vec_len;
6195 }
6196 /* store original address for later verification */
6197 imu->ubuf = ubuf;
6198 imu->len = iov.iov_len;
6199 imu->nr_bvecs = nr_pages;
6200
6201 ctx->nr_user_bufs++;
6202 }
d4ef6475
MR
6203 kvfree(pages);
6204 kvfree(vmas);
edafccee
JA
6205 return 0;
6206err:
d4ef6475
MR
6207 kvfree(pages);
6208 kvfree(vmas);
edafccee
JA
6209 io_sqe_buffer_unregister(ctx);
6210 return ret;
6211}
6212
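/*
 * Userspace sketch (not part of the kernel source): registering a fixed
 * buffer so its pages are pinned once up front by io_sqe_buffer_register()
 * above instead of on every I/O.  ring_fd is assumed to exist; the buffer
 * must be anonymous (or hugetlb) memory, per the file-backed vma check.
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, 65536, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct iovec iov = { .iov_base = buf, .iov_len = 65536 };
 *
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_BUFFERS, &iov, 1);
 *
 * Reads and writes can then use IORING_OP_READ_FIXED/WRITE_FIXED with
 * sqe->buf_index = 0.
 */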
9b402849
JA
6213static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
6214{
6215 __s32 __user *fds = arg;
6216 int fd;
6217
6218 if (ctx->cq_ev_fd)
6219 return -EBUSY;
6220
6221 if (copy_from_user(&fd, fds, sizeof(*fds)))
6222 return -EFAULT;
6223
6224 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
6225 if (IS_ERR(ctx->cq_ev_fd)) {
6226 int ret = PTR_ERR(ctx->cq_ev_fd);
6227 ctx->cq_ev_fd = NULL;
6228 return ret;
6229 }
6230
6231 return 0;
6232}
6233
6234static int io_eventfd_unregister(struct io_ring_ctx *ctx)
6235{
6236 if (ctx->cq_ev_fd) {
6237 eventfd_ctx_put(ctx->cq_ev_fd);
6238 ctx->cq_ev_fd = NULL;
6239 return 0;
6240 }
6241
6242 return -ENXIO;
6243}
6244
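/*
 * Userspace sketch (not part of the kernel source): registering an eventfd
 * that gets signalled on CQ completions, as handled by io_eventfd_register()
 * above.  IORING_REGISTER_EVENTFD_ASYNC (see __io_uring_register() below)
 * is intended to limit signalling to completions that did not finish inline.
 *
 *	#include <sys/eventfd.h>
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_EVENTFD, &efd, 1);
 *	// efd can now sit in an epoll/poll set to wait for completions
 */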
2b188cc1
JA
6245static void io_ring_ctx_free(struct io_ring_ctx *ctx)
6246{
6b06314c 6247 io_finish_async(ctx);
2b188cc1
JA
6248 if (ctx->sqo_mm)
6249 mmdrop(ctx->sqo_mm);
def596e9
JA
6250
6251 io_iopoll_reap_events(ctx);
edafccee 6252 io_sqe_buffer_unregister(ctx);
6b06314c 6253 io_sqe_files_unregister(ctx);
9b402849 6254 io_eventfd_unregister(ctx);
def596e9 6255
2b188cc1 6256#if defined(CONFIG_UNIX)
355e8d26
EB
6257 if (ctx->ring_sock) {
6258 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 6259 sock_release(ctx->ring_sock);
355e8d26 6260 }
2b188cc1
JA
6261#endif
6262
75b28aff 6263 io_mem_free(ctx->rings);
2b188cc1 6264 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
6265
6266 percpu_ref_exit(&ctx->refs);
6267 if (ctx->account_mem)
6268 io_unaccount_mem(ctx->user,
6269 ring_pages(ctx->sq_entries, ctx->cq_entries));
6270 free_uid(ctx->user);
181e448d 6271 put_cred(ctx->creds);
206aefde 6272 kfree(ctx->completions);
78076bb6 6273 kfree(ctx->cancel_hash);
0ddf92e8 6274 kmem_cache_free(req_cachep, ctx->fallback_req);
2b188cc1
JA
6275 kfree(ctx);
6276}
6277
6278static __poll_t io_uring_poll(struct file *file, poll_table *wait)
6279{
6280 struct io_ring_ctx *ctx = file->private_data;
6281 __poll_t mask = 0;
6282
6283 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
6284 /*
6285 * synchronizes with barrier from wq_has_sleeper call in
6286 * io_commit_cqring
6287 */
2b188cc1 6288 smp_rmb();
75b28aff
HV
6289 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
6290 ctx->rings->sq_ring_entries)
2b188cc1 6291 mask |= EPOLLOUT | EPOLLWRNORM;
daa5de54 6292 if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
2b188cc1
JA
6293 mask |= EPOLLIN | EPOLLRDNORM;
6294
6295 return mask;
6296}
6297
6298static int io_uring_fasync(int fd, struct file *file, int on)
6299{
6300 struct io_ring_ctx *ctx = file->private_data;
6301
6302 return fasync_helper(fd, file, on, &ctx->cq_fasync);
6303}
6304
071698e1
JA
6305static int io_remove_personalities(int id, void *p, void *data)
6306{
6307 struct io_ring_ctx *ctx = data;
6308 const struct cred *cred;
6309
6310 cred = idr_remove(&ctx->personality_idr, id);
6311 if (cred)
6312 put_cred(cred);
6313 return 0;
6314}
6315
2b188cc1
JA
6316static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
6317{
6318 mutex_lock(&ctx->uring_lock);
6319 percpu_ref_kill(&ctx->refs);
6320 mutex_unlock(&ctx->uring_lock);
6321
5262f567 6322 io_kill_timeouts(ctx);
221c5eb2 6323 io_poll_remove_all(ctx);
561fb04a
JA
6324
6325 if (ctx->io_wq)
6326 io_wq_cancel_all(ctx->io_wq);
6327
def596e9 6328 io_iopoll_reap_events(ctx);
15dff286
JA
6329 /* if we failed setting up the ctx, we might not have any rings */
6330 if (ctx->rings)
6331 io_cqring_overflow_flush(ctx, true);
071698e1 6332 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
206aefde 6333 wait_for_completion(&ctx->completions[0]);
2b188cc1
JA
6334 io_ring_ctx_free(ctx);
6335}
6336
6337static int io_uring_release(struct inode *inode, struct file *file)
6338{
6339 struct io_ring_ctx *ctx = file->private_data;
6340
6341 file->private_data = NULL;
6342 io_ring_ctx_wait_and_kill(ctx);
6343 return 0;
6344}
6345
fcb323cc
JA
6346static void io_uring_cancel_files(struct io_ring_ctx *ctx,
6347 struct files_struct *files)
6348{
6349 struct io_kiocb *req;
6350 DEFINE_WAIT(wait);
6351
6352 while (!list_empty_careful(&ctx->inflight_list)) {
768134d4 6353 struct io_kiocb *cancel_req = NULL;
fcb323cc
JA
6354
6355 spin_lock_irq(&ctx->inflight_lock);
6356 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
768134d4
JA
6357 if (req->work.files != files)
6358 continue;
6359 /* req is being completed, ignore */
6360 if (!refcount_inc_not_zero(&req->refs))
6361 continue;
6362 cancel_req = req;
6363 break;
fcb323cc 6364 }
768134d4 6365 if (cancel_req)
fcb323cc 6366 prepare_to_wait(&ctx->inflight_wait, &wait,
768134d4 6367 TASK_UNINTERRUPTIBLE);
fcb323cc
JA
6368 spin_unlock_irq(&ctx->inflight_lock);
6369
768134d4
JA
6370 /* We need to keep going until we don't find a matching req */
6371 if (!cancel_req)
fcb323cc 6372 break;
2f6d9b9d
BL
6373
6374 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
6375 io_put_req(cancel_req);
fcb323cc
JA
6376 schedule();
6377 }
768134d4 6378 finish_wait(&ctx->inflight_wait, &wait);
fcb323cc
JA
6379}
6380
6381static int io_uring_flush(struct file *file, void *data)
6382{
6383 struct io_ring_ctx *ctx = file->private_data;
6384
6385 io_uring_cancel_files(ctx, data);
fcb323cc
JA
6386 return 0;
6387}
6388
6c5c240e
RP
6389static void *io_uring_validate_mmap_request(struct file *file,
6390 loff_t pgoff, size_t sz)
2b188cc1 6391{
2b188cc1 6392 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 6393 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
6394 struct page *page;
6395 void *ptr;
6396
6397 switch (offset) {
6398 case IORING_OFF_SQ_RING:
75b28aff
HV
6399 case IORING_OFF_CQ_RING:
6400 ptr = ctx->rings;
2b188cc1
JA
6401 break;
6402 case IORING_OFF_SQES:
6403 ptr = ctx->sq_sqes;
6404 break;
2b188cc1 6405 default:
6c5c240e 6406 return ERR_PTR(-EINVAL);
2b188cc1
JA
6407 }
6408
6409 page = virt_to_head_page(ptr);
a50b854e 6410 if (sz > page_size(page))
6c5c240e
RP
6411 return ERR_PTR(-EINVAL);
6412
6413 return ptr;
6414}
6415
6416#ifdef CONFIG_MMU
6417
6418static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6419{
6420 size_t sz = vma->vm_end - vma->vm_start;
6421 unsigned long pfn;
6422 void *ptr;
6423
6424 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
6425 if (IS_ERR(ptr))
6426 return PTR_ERR(ptr);
2b188cc1
JA
6427
6428 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
6429 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
6430}
6431
6c5c240e
RP
6432#else /* !CONFIG_MMU */
6433
6434static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6435{
6436 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6437}
6438
6439static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6440{
6441 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6442}
6443
6444static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6445 unsigned long addr, unsigned long len,
6446 unsigned long pgoff, unsigned long flags)
6447{
6448 void *ptr;
6449
6450 ptr = io_uring_validate_mmap_request(file, pgoff, len);
6451 if (IS_ERR(ptr))
6452 return PTR_ERR(ptr);
6453
6454 return (unsigned long) ptr;
6455}
6456
6457#endif /* !CONFIG_MMU */
6458
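/*
 * Userspace sketch (not part of the kernel source): mapping the three ring
 * regions validated above.  Sizes are derived from the offsets returned in
 * struct io_uring_params by io_uring_setup(2); p is assumed to be that
 * filled-in structure, ring_fd the setup fd, and <sys/mman.h> plus
 * <linux/io_uring.h> to be included.
 *
 *	size_t sq_sz  = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	size_t cq_sz  = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *	size_t sqe_sz = p.sq_entries * sizeof(struct io_uring_sqe);
 *
 *	void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
 *	void *cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
 *	void *sqes    = mmap(NULL, sqe_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
 *
 * With IORING_FEAT_SINGLE_MMAP (advertised by io_uring_create() below) the
 * SQ and CQ rings share one allocation, so a single mmap() of the larger of
 * the two sizes at IORING_OFF_SQ_RING suffices.
 */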
2b188cc1
JA
6459SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6460 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6461 size_t, sigsz)
6462{
6463 struct io_ring_ctx *ctx;
6464 long ret = -EBADF;
6465 int submitted = 0;
6466 struct fd f;
6467
6c271ce2 6468 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1
JA
6469 return -EINVAL;
6470
6471 f = fdget(fd);
6472 if (!f.file)
6473 return -EBADF;
6474
6475 ret = -EOPNOTSUPP;
6476 if (f.file->f_op != &io_uring_fops)
6477 goto out_fput;
6478
6479 ret = -ENXIO;
6480 ctx = f.file->private_data;
6481 if (!percpu_ref_tryget(&ctx->refs))
6482 goto out_fput;
6483
6c271ce2
JA
6484 /*
6485 * For SQ polling, the thread will do all submissions and completions.
6486 * Just return the requested submit count, and wake the thread if
6487 * we were asked to.
6488 */
b2a9eada 6489 ret = 0;
6c271ce2 6490 if (ctx->flags & IORING_SETUP_SQPOLL) {
c1edbf5f
JA
6491 if (!list_empty_careful(&ctx->cq_overflow_list))
6492 io_cqring_overflow_flush(ctx, false);
6c271ce2
JA
6493 if (flags & IORING_ENTER_SQ_WAKEUP)
6494 wake_up(&ctx->sqo_wait);
6495 submitted = to_submit;
b2a9eada 6496 } else if (to_submit) {
ae9428ca 6497 struct mm_struct *cur_mm;
2b188cc1
JA
6498
6499 mutex_lock(&ctx->uring_lock);
ae9428ca
PB
6500 /* already have mm, so io_submit_sqes() won't try to grab it */
6501 cur_mm = ctx->sqo_mm;
6502 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
6503 &cur_mm, false);
2b188cc1 6504 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
6505
6506 if (submitted != to_submit)
6507 goto out;
2b188cc1
JA
6508 }
6509 if (flags & IORING_ENTER_GETEVENTS) {
def596e9
JA
6510 unsigned nr_events = 0;
6511
2b188cc1
JA
6512 min_complete = min(min_complete, ctx->cq_entries);
6513
def596e9 6514 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9 6515 ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e9
JA
6516 } else {
6517 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
6518 }
2b188cc1
JA
6519 }
6520
7c504e65 6521out:
6805b32e 6522 percpu_ref_put(&ctx->refs);
2b188cc1
JA
6523out_fput:
6524 fdput(f);
6525 return submitted ? submitted : ret;
6526}
6527
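/*
 * Userspace sketch (not part of the kernel source): a minimal submit-and-
 * wait call into the syscall above.  ring_fd is assumed to be a set-up ring
 * with one sqe already written and the SQ tail advanced.
 *
 *	int ret = syscall(__NR_io_uring_enter, ring_fd,
 *			  1,			// to_submit
 *			  1,			// min_complete
 *			  IORING_ENTER_GETEVENTS,
 *			  NULL, 0);		// sig, sigsz
 *
 * For IORING_SETUP_SQPOLL rings the kernel thread consumes the SQ ring on
 * its own; io_uring_enter(2) is then only needed with
 * IORING_ENTER_SQ_WAKEUP once the thread has gone idle, or with
 * IORING_ENTER_GETEVENTS to wait for completions.
 */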
87ce955b
JA
6528static int io_uring_show_cred(int id, void *p, void *data)
6529{
6530 const struct cred *cred = p;
6531 struct seq_file *m = data;
6532 struct user_namespace *uns = seq_user_ns(m);
6533 struct group_info *gi;
6534 kernel_cap_t cap;
6535 unsigned __capi;
6536 int g;
6537
6538 seq_printf(m, "%5d\n", id);
6539 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
6540 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
6541 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
6542 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
6543 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
6544 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
6545 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
6546 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
6547 seq_puts(m, "\n\tGroups:\t");
6548 gi = cred->group_info;
6549 for (g = 0; g < gi->ngroups; g++) {
6550 seq_put_decimal_ull(m, g ? " " : "",
6551 from_kgid_munged(uns, gi->gid[g]));
6552 }
6553 seq_puts(m, "\n\tCapEff:\t");
6554 cap = cred->cap_effective;
6555 CAP_FOR_EACH_U32(__capi)
6556 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
6557 seq_putc(m, '\n');
6558 return 0;
6559}
6560
6561static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
6562{
6563 int i;
6564
6565 mutex_lock(&ctx->uring_lock);
6566 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
6567 for (i = 0; i < ctx->nr_user_files; i++) {
6568 struct fixed_file_table *table;
6569 struct file *f;
6570
6571 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6572 f = table->files[i & IORING_FILE_TABLE_MASK];
6573 if (f)
6574 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
6575 else
6576 seq_printf(m, "%5u: <none>\n", i);
6577 }
6578 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
6579 for (i = 0; i < ctx->nr_user_bufs; i++) {
6580 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
6581
6582 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
6583 (unsigned int) buf->len);
6584 }
6585 if (!idr_is_empty(&ctx->personality_idr)) {
6586 seq_printf(m, "Personalities:\n");
6587 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
6588 }
6589 mutex_unlock(&ctx->uring_lock);
6590}
6591
6592static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
6593{
6594 struct io_ring_ctx *ctx = f->private_data;
6595
6596 if (percpu_ref_tryget(&ctx->refs)) {
6597 __io_uring_show_fdinfo(ctx, m);
6598 percpu_ref_put(&ctx->refs);
6599 }
6600}
6601
2b188cc1
JA
6602static const struct file_operations io_uring_fops = {
6603 .release = io_uring_release,
fcb323cc 6604 .flush = io_uring_flush,
2b188cc1 6605 .mmap = io_uring_mmap,
6c5c240e
RP
6606#ifndef CONFIG_MMU
6607 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
6608 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
6609#endif
2b188cc1
JA
6610 .poll = io_uring_poll,
6611 .fasync = io_uring_fasync,
87ce955b 6612 .show_fdinfo = io_uring_show_fdinfo,
2b188cc1
JA
6613};
6614
6615static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
6616 struct io_uring_params *p)
6617{
75b28aff
HV
6618 struct io_rings *rings;
6619 size_t size, sq_array_offset;
2b188cc1 6620
75b28aff
HV
6621 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
6622 if (size == SIZE_MAX)
6623 return -EOVERFLOW;
6624
6625 rings = io_mem_alloc(size);
6626 if (!rings)
2b188cc1
JA
6627 return -ENOMEM;
6628
75b28aff
HV
6629 ctx->rings = rings;
6630 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
6631 rings->sq_ring_mask = p->sq_entries - 1;
6632 rings->cq_ring_mask = p->cq_entries - 1;
6633 rings->sq_ring_entries = p->sq_entries;
6634 rings->cq_ring_entries = p->cq_entries;
6635 ctx->sq_mask = rings->sq_ring_mask;
6636 ctx->cq_mask = rings->cq_ring_mask;
6637 ctx->sq_entries = rings->sq_ring_entries;
6638 ctx->cq_entries = rings->cq_ring_entries;
2b188cc1
JA
6639
6640 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
6641 if (size == SIZE_MAX) {
6642 io_mem_free(ctx->rings);
6643 ctx->rings = NULL;
2b188cc1 6644 return -EOVERFLOW;
eb065d30 6645 }
2b188cc1
JA
6646
6647 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
6648 if (!ctx->sq_sqes) {
6649 io_mem_free(ctx->rings);
6650 ctx->rings = NULL;
2b188cc1 6651 return -ENOMEM;
eb065d30 6652 }
2b188cc1 6653
2b188cc1
JA
6654 return 0;
6655}
6656
6657/*
6658 * Allocate an anonymous fd; this is what constitutes the application-
6659 * visible backing of an io_uring instance. The application mmaps this
6660 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
6661 * we have to tie this fd to a socket for file garbage collection purposes.
6662 */
6663static int io_uring_get_fd(struct io_ring_ctx *ctx)
6664{
6665 struct file *file;
6666 int ret;
6667
6668#if defined(CONFIG_UNIX)
6669 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
6670 &ctx->ring_sock);
6671 if (ret)
6672 return ret;
6673#endif
6674
6675 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
6676 if (ret < 0)
6677 goto err;
6678
6679 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
6680 O_RDWR | O_CLOEXEC);
6681 if (IS_ERR(file)) {
6682 put_unused_fd(ret);
6683 ret = PTR_ERR(file);
6684 goto err;
6685 }
6686
6687#if defined(CONFIG_UNIX)
6688 ctx->ring_sock->file = file;
6689#endif
6690 fd_install(ret, file);
6691 return ret;
6692err:
6693#if defined(CONFIG_UNIX)
6694 sock_release(ctx->ring_sock);
6695 ctx->ring_sock = NULL;
6696#endif
6697 return ret;
6698}
6699
6700static int io_uring_create(unsigned entries, struct io_uring_params *p)
6701{
6702 struct user_struct *user = NULL;
6703 struct io_ring_ctx *ctx;
6704 bool account_mem;
6705 int ret;
6706
8110c1a6 6707 if (!entries)
2b188cc1 6708 return -EINVAL;
8110c1a6
JA
6709 if (entries > IORING_MAX_ENTRIES) {
6710 if (!(p->flags & IORING_SETUP_CLAMP))
6711 return -EINVAL;
6712 entries = IORING_MAX_ENTRIES;
6713 }
2b188cc1
JA
6714
6715 /*
6716 * Use twice as many entries for the CQ ring. It's possible for the
6717 * application to drive a higher depth than the size of the SQ ring,
6718 * since the sqes are only used at submission time. This allows for
33a107f0
JA
6719 * some flexibility in overcommitting a bit. If the application has
6720 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
6721 * of CQ ring entries manually.
2b188cc1
JA
6722 */
6723 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
6724 if (p->flags & IORING_SETUP_CQSIZE) {
6725 /*
6726 * If IORING_SETUP_CQSIZE is set, we do the same roundup
6727 * to a power-of-two, if it isn't already. We do NOT impose
6728 * any cq vs sq ring sizing.
6729 */
8110c1a6 6730 if (p->cq_entries < p->sq_entries)
33a107f0 6731 return -EINVAL;
8110c1a6
JA
6732 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
6733 if (!(p->flags & IORING_SETUP_CLAMP))
6734 return -EINVAL;
6735 p->cq_entries = IORING_MAX_CQ_ENTRIES;
6736 }
33a107f0
JA
6737 p->cq_entries = roundup_pow_of_two(p->cq_entries);
6738 } else {
6739 p->cq_entries = 2 * p->sq_entries;
6740 }
2b188cc1
JA
6741
6742 user = get_uid(current_user());
6743 account_mem = !capable(CAP_IPC_LOCK);
6744
6745 if (account_mem) {
6746 ret = io_account_mem(user,
6747 ring_pages(p->sq_entries, p->cq_entries));
6748 if (ret) {
6749 free_uid(user);
6750 return ret;
6751 }
6752 }
6753
6754 ctx = io_ring_ctx_alloc(p);
6755 if (!ctx) {
6756 if (account_mem)
6757 io_unaccount_mem(user, ring_pages(p->sq_entries,
6758 p->cq_entries));
6759 free_uid(user);
6760 return -ENOMEM;
6761 }
6762 ctx->compat = in_compat_syscall();
6763 ctx->account_mem = account_mem;
6764 ctx->user = user;
0b8c0ec7 6765 ctx->creds = get_current_cred();
2b188cc1
JA
6766
6767 ret = io_allocate_scq_urings(ctx, p);
6768 if (ret)
6769 goto err;
6770
6c271ce2 6771 ret = io_sq_offload_start(ctx, p);
2b188cc1
JA
6772 if (ret)
6773 goto err;
6774
2b188cc1 6775 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
6776 p->sq_off.head = offsetof(struct io_rings, sq.head);
6777 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
6778 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
6779 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
6780 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
6781 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
6782 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
6783
6784 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
6785 p->cq_off.head = offsetof(struct io_rings, cq.head);
6786 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
6787 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
6788 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
6789 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
6790 p->cq_off.cqes = offsetof(struct io_rings, cqes);
ac90f249 6791
044c1ab3
JA
6792 /*
6793 * Install ring fd as the very last thing, so we don't risk someone
6794 * having closed it before we finish setup
6795 */
6796 ret = io_uring_get_fd(ctx);
6797 if (ret < 0)
6798 goto err;
6799
da8c9690 6800 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
cccf0ee8
JA
6801 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
6802 IORING_FEAT_CUR_PERSONALITY;
c826bd7a 6803 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
6804 return ret;
6805err:
6806 io_ring_ctx_wait_and_kill(ctx);
6807 return ret;
6808}
6809
6810/*
6811 * Sets up an io_uring context and returns the fd. The application asks for a
6812 * ring size; we return the actual sq/cq ring sizes (among other things) in the
6813 * params structure passed in.
6814 */
6815static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
6816{
6817 struct io_uring_params p;
6818 long ret;
6819 int i;
6820
6821 if (copy_from_user(&p, params, sizeof(p)))
6822 return -EFAULT;
6823 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
6824 if (p.resv[i])
6825 return -EINVAL;
6826 }
6827
6c271ce2 6828 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 6829 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
24369c2e 6830 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
2b188cc1
JA
6831 return -EINVAL;
6832
6833 ret = io_uring_create(entries, &p);
6834 if (ret < 0)
6835 return ret;
6836
6837 if (copy_to_user(params, &p, sizeof(p)))
6838 return -EFAULT;
6839
6840 return ret;
6841}
6842
6843SYSCALL_DEFINE2(io_uring_setup, u32, entries,
6844 struct io_uring_params __user *, params)
6845{
6846 return io_uring_setup(entries, params);
6847}
6848
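/*
 * Userspace sketch (not part of the kernel source): the setup call above,
 * asking for a larger CQ ring and clamping oversized values instead of
 * failing, per the IORING_SETUP_CQSIZE/IORING_SETUP_CLAMP handling in
 * io_uring_create().
 *
 *	struct io_uring_params p;
 *
 *	memset(&p, 0, sizeof(p));
 *	p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
 *	p.cq_entries = 4096;		// must be >= the requested SQ size
 *
 *	int ring_fd = syscall(__NR_io_uring_setup, 256, &p);
 *	// p.sq_entries/p.cq_entries now hold the rounded-up actual sizes
 */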
66f4af93
JA
6849static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
6850{
6851 struct io_uring_probe *p;
6852 size_t size;
6853 int i, ret;
6854
6855 size = struct_size(p, ops, nr_args);
6856 if (size == SIZE_MAX)
6857 return -EOVERFLOW;
6858 p = kzalloc(size, GFP_KERNEL);
6859 if (!p)
6860 return -ENOMEM;
6861
6862 ret = -EFAULT;
6863 if (copy_from_user(p, arg, size))
6864 goto out;
6865 ret = -EINVAL;
6866 if (memchr_inv(p, 0, size))
6867 goto out;
6868
6869 p->last_op = IORING_OP_LAST - 1;
6870 if (nr_args > IORING_OP_LAST)
6871 nr_args = IORING_OP_LAST;
6872
6873 for (i = 0; i < nr_args; i++) {
6874 p->ops[i].op = i;
6875 if (!io_op_defs[i].not_supported)
6876 p->ops[i].flags = IO_URING_OP_SUPPORTED;
6877 }
6878 p->ops_len = i;
6879
6880 ret = 0;
6881 if (copy_to_user(arg, p, size))
6882 ret = -EFAULT;
6883out:
6884 kfree(p);
6885 return ret;
6886}
6887
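/*
 * Userspace sketch (not part of the kernel source): probing which opcodes
 * this kernel supports via io_probe() above.  The probe structure is
 * variable-sized, so it is allocated with room for IORING_OP_LAST ops and
 * must be zeroed before the call.
 *
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *probe = calloc(1, len);
 *
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_PROBE, probe, IORING_OP_LAST);
 *	int have_accept = !ret &&
 *		(probe->ops[IORING_OP_ACCEPT].flags & IO_URING_OP_SUPPORTED);
 */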
071698e1
JA
6888static int io_register_personality(struct io_ring_ctx *ctx)
6889{
6890 const struct cred *creds = get_current_cred();
6891 int id;
6892
6893 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
6894 USHRT_MAX, GFP_KERNEL);
6895 if (id < 0)
6896 put_cred(creds);
6897 return id;
6898}
6899
6900static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
6901{
6902 const struct cred *old_creds;
6903
6904 old_creds = idr_remove(&ctx->personality_idr, id);
6905 if (old_creds) {
6906 put_cred(old_creds);
6907 return 0;
6908 }
6909
6910 return -EINVAL;
6911}
6912
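/*
 * Userspace sketch (not part of the kernel source): registering the current
 * credentials as a personality, then tagging a later request with it.
 * ring_fd and sqe (a submission entry being prepared) are assumed to exist.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	// ... later, when filling in an sqe:
 *	sqe->personality = id;
 *
 *	// and to drop the personality again:
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */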
6913static bool io_register_op_must_quiesce(int op)
6914{
6915 switch (op) {
6916 case IORING_UNREGISTER_FILES:
6917 case IORING_REGISTER_FILES_UPDATE:
6918 case IORING_REGISTER_PROBE:
6919 case IORING_REGISTER_PERSONALITY:
6920 case IORING_UNREGISTER_PERSONALITY:
6921 return false;
6922 default:
6923 return true;
6924 }
6925}
6926
edafccee
JA
6927static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
6928 void __user *arg, unsigned nr_args)
b19062a5
JA
6929 __releases(ctx->uring_lock)
6930 __acquires(ctx->uring_lock)
edafccee
JA
6931{
6932 int ret;
6933
35fa71a0
JA
6934 /*
6935	 * We're inside the ring mutex; if the ref is already dying, then
6936 * someone else killed the ctx or is already going through
6937 * io_uring_register().
6938 */
6939 if (percpu_ref_is_dying(&ctx->refs))
6940 return -ENXIO;
6941
071698e1 6942 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 6943 percpu_ref_kill(&ctx->refs);
b19062a5 6944
05f3fb3c
JA
6945 /*
6946 * Drop uring mutex before waiting for references to exit. If
6947 * another thread is currently inside io_uring_enter() it might
6948 * need to grab the uring_lock to make progress. If we hold it
6949 * here across the drain wait, then we can deadlock. It's safe
6950 * to drop the mutex here, since no new references will come in
6951 * after we've killed the percpu ref.
6952 */
6953 mutex_unlock(&ctx->uring_lock);
c150368b 6954 ret = wait_for_completion_interruptible(&ctx->completions[0]);
05f3fb3c 6955 mutex_lock(&ctx->uring_lock);
c150368b
JA
6956 if (ret) {
6957 percpu_ref_resurrect(&ctx->refs);
6958 ret = -EINTR;
6959 goto out;
6960 }
05f3fb3c 6961 }
edafccee
JA
6962
6963 switch (opcode) {
6964 case IORING_REGISTER_BUFFERS:
6965 ret = io_sqe_buffer_register(ctx, arg, nr_args);
6966 break;
6967 case IORING_UNREGISTER_BUFFERS:
6968 ret = -EINVAL;
6969 if (arg || nr_args)
6970 break;
6971 ret = io_sqe_buffer_unregister(ctx);
6972 break;
6b06314c
JA
6973 case IORING_REGISTER_FILES:
6974 ret = io_sqe_files_register(ctx, arg, nr_args);
6975 break;
6976 case IORING_UNREGISTER_FILES:
6977 ret = -EINVAL;
6978 if (arg || nr_args)
6979 break;
6980 ret = io_sqe_files_unregister(ctx);
6981 break;
c3a31e60
JA
6982 case IORING_REGISTER_FILES_UPDATE:
6983 ret = io_sqe_files_update(ctx, arg, nr_args);
6984 break;
9b402849 6985 case IORING_REGISTER_EVENTFD:
f2842ab5 6986 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
6987 ret = -EINVAL;
6988 if (nr_args != 1)
6989 break;
6990 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
6991 if (ret)
6992 break;
6993 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
6994 ctx->eventfd_async = 1;
6995 else
6996 ctx->eventfd_async = 0;
9b402849
JA
6997 break;
6998 case IORING_UNREGISTER_EVENTFD:
6999 ret = -EINVAL;
7000 if (arg || nr_args)
7001 break;
7002 ret = io_eventfd_unregister(ctx);
7003 break;
66f4af93
JA
7004 case IORING_REGISTER_PROBE:
7005 ret = -EINVAL;
7006 if (!arg || nr_args > 256)
7007 break;
7008 ret = io_probe(ctx, arg, nr_args);
7009 break;
071698e1
JA
7010 case IORING_REGISTER_PERSONALITY:
7011 ret = -EINVAL;
7012 if (arg || nr_args)
7013 break;
7014 ret = io_register_personality(ctx);
7015 break;
7016 case IORING_UNREGISTER_PERSONALITY:
7017 ret = -EINVAL;
7018 if (arg)
7019 break;
7020 ret = io_unregister_personality(ctx, nr_args);
7021 break;
edafccee
JA
7022 default:
7023 ret = -EINVAL;
7024 break;
7025 }
7026
071698e1 7027 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7028 /* bring the ctx back to life */
05f3fb3c 7029 percpu_ref_reinit(&ctx->refs);
c150368b
JA
7030out:
7031 reinit_completion(&ctx->completions[0]);
05f3fb3c 7032 }
edafccee
JA
7033 return ret;
7034}
7035
7036SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
7037 void __user *, arg, unsigned int, nr_args)
7038{
7039 struct io_ring_ctx *ctx;
7040 long ret = -EBADF;
7041 struct fd f;
7042
7043 f = fdget(fd);
7044 if (!f.file)
7045 return -EBADF;
7046
7047 ret = -EOPNOTSUPP;
7048 if (f.file->f_op != &io_uring_fops)
7049 goto out_fput;
7050
7051 ctx = f.file->private_data;
7052
7053 mutex_lock(&ctx->uring_lock);
7054 ret = __io_uring_register(ctx, opcode, arg, nr_args);
7055 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
7056 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
7057 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
7058out_fput:
7059 fdput(f);
7060 return ret;
7061}
7062
2b188cc1
JA
7063static int __init io_uring_init(void)
7064{
d7f62e82
SM
7065#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
7066 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
7067 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
7068} while (0)
7069
7070#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
7071 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
7072 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
7073 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
7074 BUILD_BUG_SQE_ELEM(1, __u8, flags);
7075 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
7076 BUILD_BUG_SQE_ELEM(4, __s32, fd);
7077 BUILD_BUG_SQE_ELEM(8, __u64, off);
7078 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
7079 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7080 BUILD_BUG_SQE_ELEM(24, __u32, len);
7081 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
7082 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
7083 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
7084 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
7085 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
7086 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
7087 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
7088 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
7089 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
7090 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
7091 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
7092 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
7093 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7094 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
7095 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
7096 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7097
d3656344 7098 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
2b188cc1
JA
7099 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
7100 return 0;
7101};
7102__initcall(io_uring_init);