[mirror_ubuntu-hirsute-kernel.git] / fs / io_uring.c (blob 13f72d2a3feceef245a3c3199524acaf4247d216)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
 36  * for ordering purposes and to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
42 #include <linux/kernel.h>
43 #include <linux/init.h>
44 #include <linux/errno.h>
45 #include <linux/syscalls.h>
46 #include <linux/compat.h>
47 #include <net/compat.h>
48 #include <linux/refcount.h>
49 #include <linux/uio.h>
50 #include <linux/bits.h>
51
52 #include <linux/sched/signal.h>
53 #include <linux/fs.h>
54 #include <linux/file.h>
55 #include <linux/fdtable.h>
56 #include <linux/mm.h>
57 #include <linux/mman.h>
58 #include <linux/percpu.h>
59 #include <linux/slab.h>
60 #include <linux/kthread.h>
61 #include <linux/blkdev.h>
62 #include <linux/bvec.h>
63 #include <linux/net.h>
64 #include <net/sock.h>
65 #include <net/af_unix.h>
66 #include <net/scm.h>
67 #include <linux/anon_inodes.h>
68 #include <linux/sched/mm.h>
69 #include <linux/uaccess.h>
70 #include <linux/nospec.h>
71 #include <linux/sizes.h>
72 #include <linux/hugetlb.h>
73 #include <linux/highmem.h>
74 #include <linux/namei.h>
75 #include <linux/fsnotify.h>
76 #include <linux/fadvise.h>
77 #include <linux/eventpoll.h>
78 #include <linux/fs_struct.h>
79 #include <linux/splice.h>
80 #include <linux/task_work.h>
81 #include <linux/pagemap.h>
82
83 #define CREATE_TRACE_POINTS
84 #include <trace/events/io_uring.h>
85
86 #include <uapi/linux/io_uring.h>
87
88 #include "internal.h"
89 #include "io-wq.h"
90
91 #define IORING_MAX_ENTRIES 32768
92 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
93
94 /*
95 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
96 */
97 #define IORING_FILE_TABLE_SHIFT 9
98 #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
99 #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
100 #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
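/*
 * Worked example (editorial note, not in the original source): a shift of
 * 9 gives 1U << 9 == 512 'struct file *' slots per table, i.e. 512 * 8
 * bytes == 4096 bytes, exactly one page on 64-bit archs.
 * IORING_MAX_FIXED_FILES then permits 64 such tables, or 64 * 512 ==
 * 32768 fixed files in total.
 */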
101
102 struct io_uring {
103 u32 head ____cacheline_aligned_in_smp;
104 u32 tail ____cacheline_aligned_in_smp;
105 };
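/*
 * Editorial note: head and tail sit on separate cache lines because, for
 * each ring, one index is normally advanced by the kernel and the other
 * by the application (see struct io_rings below); the alignment avoids
 * false sharing between producer and consumer.
 */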
106
107 /*
108 * This data is shared with the application through the mmap at offsets
109 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
110 *
111 * The offsets to the member fields are published through struct
112 * io_sqring_offsets when calling io_uring_setup.
113 */
114 struct io_rings {
115 /*
116 * Head and tail offsets into the ring; the offsets need to be
117 * masked to get valid indices.
118 *
119 * The kernel controls head of the sq ring and the tail of the cq ring,
120 * and the application controls tail of the sq ring and the head of the
121 * cq ring.
122 */
123 struct io_uring sq, cq;
124 /*
125 * Bitmasks to apply to head and tail offsets (constant, equals
126 * ring_entries - 1)
127 */
128 u32 sq_ring_mask, cq_ring_mask;
129 /* Ring sizes (constant, power of 2) */
130 u32 sq_ring_entries, cq_ring_entries;
131 /*
132 * Number of invalid entries dropped by the kernel due to
133 * invalid index stored in array
134 *
135 * Written by the kernel, shouldn't be modified by the
136 * application (i.e. get number of "new events" by comparing to
137 * cached value).
138 *
139 * After a new SQ head value was read by the application this
140 * counter includes all submissions that were dropped reaching
141 * the new SQ head (and possibly more).
142 */
143 u32 sq_dropped;
144 /*
145 * Runtime SQ flags
146 *
147 * Written by the kernel, shouldn't be modified by the
148 * application.
149 *
150 * The application needs a full memory barrier before checking
151 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
152 */
153 u32 sq_flags;
154 /*
155 * Runtime CQ flags
156 *
157 * Written by the application, shouldn't be modified by the
158 * kernel.
159 */
160 u32 cq_flags;
161 /*
162 * Number of completion events lost because the queue was full;
163 * this should be avoided by the application by making sure
164 * there are not more requests pending than there is space in
165 * the completion queue.
166 *
167 * Written by the kernel, shouldn't be modified by the
168 * application (i.e. get number of "new events" by comparing to
169 * cached value).
170 *
171 * As completion events come in out of order this counter is not
172 * ordered with any other data.
173 */
174 u32 cq_overflow;
175 /*
176 * Ring buffer of completion events.
177 *
178 * The kernel writes completion events fresh every time they are
179 * produced, so the application is allowed to modify pending
180 * entries.
181 */
182 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
183 };
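/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a CQ consumer that follows the rules in the header comment reads and
 * releases a single completion. Userspace does the same thing against the
 * mmap'ed rings (see liburing); it is expressed here with the kernel
 * barrier helpers purely for illustration, and example_cq_read_one() is a
 * name local to this sketch.
 */
static inline bool example_cq_read_one(struct io_rings *r,
				       struct io_uring_cqe *out)
{
	u32 head = r->cq.head;
	/* pairs with the kernel's smp_store_release() of cq.tail */
	u32 tail = smp_load_acquire(&r->cq.tail);

	if (head == tail)
		return false;			/* no completions pending */
	/* indices are free-running u32s; mask them to index the array */
	*out = r->cqes[head & r->cq_ring_mask];
	/* order the entry load above before publishing the new head */
	smp_store_release(&r->cq.head, head + 1);
	return true;
}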
184
185 struct io_mapped_ubuf {
186 u64 ubuf;
187 size_t len;
188 struct bio_vec *bvec;
189 unsigned int nr_bvecs;
190 };
191
192 struct fixed_file_table {
193 struct file **files;
194 };
195
196 struct fixed_file_ref_node {
197 struct percpu_ref refs;
198 struct list_head node;
199 struct list_head file_list;
200 struct fixed_file_data *file_data;
201 struct llist_node llist;
202 };
203
204 struct fixed_file_data {
205 struct fixed_file_table *table;
206 struct io_ring_ctx *ctx;
207
208 struct percpu_ref *cur_refs;
209 struct percpu_ref refs;
210 struct completion done;
211 struct list_head ref_list;
212 spinlock_t lock;
213 };
214
215 struct io_buffer {
216 struct list_head list;
217 __u64 addr;
218 __s32 len;
219 __u16 bid;
220 };
221
222 struct io_ring_ctx {
223 struct {
224 struct percpu_ref refs;
225 } ____cacheline_aligned_in_smp;
226
227 struct {
228 unsigned int flags;
229 unsigned int compat: 1;
230 unsigned int limit_mem: 1;
231 unsigned int cq_overflow_flushed: 1;
232 unsigned int drain_next: 1;
233 unsigned int eventfd_async: 1;
234
235 /*
236 * Ring buffer of indices into array of io_uring_sqe, which is
237 * mmapped by the application using the IORING_OFF_SQES offset.
238 *
239 * This indirection could e.g. be used to assign fixed
240 * io_uring_sqe entries to operations and only submit them to
241 * the queue when needed.
242 *
243 * The kernel modifies neither the indices array nor the entries
244 * array.
245 */
246 u32 *sq_array;
247 unsigned cached_sq_head;
248 unsigned sq_entries;
249 unsigned sq_mask;
250 unsigned sq_thread_idle;
251 unsigned cached_sq_dropped;
252 atomic_t cached_cq_overflow;
253 unsigned long sq_check_overflow;
254
255 struct list_head defer_list;
256 struct list_head timeout_list;
257 struct list_head cq_overflow_list;
258
259 wait_queue_head_t inflight_wait;
260 struct io_uring_sqe *sq_sqes;
261 } ____cacheline_aligned_in_smp;
262
263 struct io_rings *rings;
264
265 /* IO offload */
266 struct io_wq *io_wq;
267 struct task_struct *sqo_thread; /* if using sq thread polling */
268 struct mm_struct *sqo_mm;
269 wait_queue_head_t sqo_wait;
270
271 /*
272 * If used, fixed file set. Writers must ensure that ->refs is dead,
273 * readers must ensure that ->refs is alive as long as the file* is
274 * used. Only updated through io_uring_register(2).
275 */
276 struct fixed_file_data *file_data;
277 unsigned nr_user_files;
278 int ring_fd;
279 struct file *ring_file;
280
281 /* if used, fixed mapped user buffers */
282 unsigned nr_user_bufs;
283 struct io_mapped_ubuf *user_bufs;
284
285 struct user_struct *user;
286
287 const struct cred *creds;
288
289 struct completion ref_comp;
290 struct completion sq_thread_comp;
291
292 /* if all else fails... */
293 struct io_kiocb *fallback_req;
294
295 #if defined(CONFIG_UNIX)
296 struct socket *ring_sock;
297 #endif
298
299 struct idr io_buffer_idr;
300
301 struct idr personality_idr;
302
303 struct {
304 unsigned cached_cq_tail;
305 unsigned cq_entries;
306 unsigned cq_mask;
307 atomic_t cq_timeouts;
308 unsigned long cq_check_overflow;
309 struct wait_queue_head cq_wait;
310 struct fasync_struct *cq_fasync;
311 struct eventfd_ctx *cq_ev_fd;
312 } ____cacheline_aligned_in_smp;
313
314 struct {
315 struct mutex uring_lock;
316 wait_queue_head_t wait;
317 } ____cacheline_aligned_in_smp;
318
319 struct {
320 spinlock_t completion_lock;
321
322 /*
323 * ->poll_list is protected by the ctx->uring_lock for
324 * io_uring instances that don't use IORING_SETUP_SQPOLL.
325 * For SQPOLL, only the single threaded io_sq_thread() will
326 * manipulate the list, hence no extra locking is needed there.
327 */
328 struct list_head poll_list;
329 struct hlist_head *cancel_hash;
330 unsigned cancel_hash_bits;
331 bool poll_multi_file;
332
333 spinlock_t inflight_lock;
334 struct list_head inflight_list;
335 } ____cacheline_aligned_in_smp;
336
337 struct delayed_work file_put_work;
338 struct llist_head file_put_llist;
339
340 struct work_struct exit_work;
341 };
342
343 /*
344 * First field must be the file pointer in all the
345 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
346 */
347 struct io_poll_iocb {
348 struct file *file;
349 union {
350 struct wait_queue_head *head;
351 u64 addr;
352 };
353 __poll_t events;
354 bool done;
355 bool canceled;
356 struct wait_queue_entry wait;
357 };
358
359 struct io_close {
360 struct file *file;
361 struct file *put_file;
362 int fd;
363 };
364
365 struct io_timeout_data {
366 struct io_kiocb *req;
367 struct hrtimer timer;
368 struct timespec64 ts;
369 enum hrtimer_mode mode;
370 };
371
372 struct io_accept {
373 struct file *file;
374 struct sockaddr __user *addr;
375 int __user *addr_len;
376 int flags;
377 unsigned long nofile;
378 };
379
380 struct io_sync {
381 struct file *file;
382 loff_t len;
383 loff_t off;
384 int flags;
385 int mode;
386 };
387
388 struct io_cancel {
389 struct file *file;
390 u64 addr;
391 };
392
393 struct io_timeout {
394 struct file *file;
395 u64 addr;
396 int flags;
397 u32 off;
398 u32 target_seq;
399 };
400
401 struct io_rw {
402 /* NOTE: kiocb has the file as the first member, so don't do it here */
403 struct kiocb kiocb;
404 u64 addr;
405 u64 len;
406 };
407
408 struct io_connect {
409 struct file *file;
410 struct sockaddr __user *addr;
411 int addr_len;
412 };
413
414 struct io_sr_msg {
415 struct file *file;
416 union {
417 struct user_msghdr __user *msg;
418 void __user *buf;
419 };
420 int msg_flags;
421 int bgid;
422 size_t len;
423 struct io_buffer *kbuf;
424 };
425
426 struct io_open {
427 struct file *file;
428 int dfd;
429 struct filename *filename;
430 struct open_how how;
431 unsigned long nofile;
432 };
433
434 struct io_files_update {
435 struct file *file;
436 u64 arg;
437 u32 nr_args;
438 u32 offset;
439 };
440
441 struct io_fadvise {
442 struct file *file;
443 u64 offset;
444 u32 len;
445 u32 advice;
446 };
447
448 struct io_madvise {
449 struct file *file;
450 u64 addr;
451 u32 len;
452 u32 advice;
453 };
454
455 struct io_epoll {
456 struct file *file;
457 int epfd;
458 int op;
459 int fd;
460 struct epoll_event event;
461 };
462
463 struct io_splice {
464 struct file *file_out;
465 struct file *file_in;
466 loff_t off_out;
467 loff_t off_in;
468 u64 len;
469 unsigned int flags;
470 };
471
472 struct io_provide_buf {
473 struct file *file;
474 __u64 addr;
475 __s32 len;
476 __u32 bgid;
477 __u16 nbufs;
478 __u16 bid;
479 };
480
481 struct io_statx {
482 struct file *file;
483 int dfd;
484 unsigned int mask;
485 unsigned int flags;
486 const char __user *filename;
487 struct statx __user *buffer;
488 };
489
490 struct io_async_connect {
491 struct sockaddr_storage address;
492 };
493
494 struct io_async_msghdr {
495 struct iovec fast_iov[UIO_FASTIOV];
496 struct iovec *iov;
497 struct sockaddr __user *uaddr;
498 struct msghdr msg;
499 struct sockaddr_storage addr;
500 };
501
502 struct io_async_rw {
503 struct iovec fast_iov[UIO_FASTIOV];
504 struct iovec *iov;
505 ssize_t nr_segs;
506 ssize_t size;
507 struct wait_page_queue wpq;
508 struct callback_head task_work;
509 };
510
511 struct io_async_ctx {
512 union {
513 struct io_async_rw rw;
514 struct io_async_msghdr msg;
515 struct io_async_connect connect;
516 struct io_timeout_data timeout;
517 };
518 };
519
520 enum {
521 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
522 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
523 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
524 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
525 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
526 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
527
528 REQ_F_LINK_HEAD_BIT,
529 REQ_F_LINK_NEXT_BIT,
530 REQ_F_FAIL_LINK_BIT,
531 REQ_F_INFLIGHT_BIT,
532 REQ_F_CUR_POS_BIT,
533 REQ_F_NOWAIT_BIT,
534 REQ_F_LINK_TIMEOUT_BIT,
535 REQ_F_TIMEOUT_BIT,
536 REQ_F_ISREG_BIT,
537 REQ_F_MUST_PUNT_BIT,
538 REQ_F_TIMEOUT_NOSEQ_BIT,
539 REQ_F_COMP_LOCKED_BIT,
540 REQ_F_NEED_CLEANUP_BIT,
541 REQ_F_OVERFLOW_BIT,
542 REQ_F_POLLED_BIT,
543 REQ_F_BUFFER_SELECTED_BIT,
544 REQ_F_NO_FILE_TABLE_BIT,
545 REQ_F_QUEUE_TIMEOUT_BIT,
546 REQ_F_WORK_INITIALIZED_BIT,
547 REQ_F_TASK_PINNED_BIT,
548
549 /* not a real bit, just to check we're not overflowing the space */
550 __REQ_F_LAST_BIT,
551 };
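/*
 * Editorial sketch (not in the original): the __REQ_F_LAST_BIT sentinel
 * above lends itself to a compile-time check that the request flags still
 * fit in the 'unsigned int flags' field of struct io_kiocb, e.g.
 *
 *	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(unsigned int));
 *
 * placed in an init function; whether this kernel version carries such a
 * check is not shown here.
 */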
552
553 enum {
554 /* ctx owns file */
555 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
556 /* drain existing IO first */
557 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
558 /* linked sqes */
559 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
560 /* doesn't sever on completion < 0 */
561 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
562 /* IOSQE_ASYNC */
563 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
564 /* IOSQE_BUFFER_SELECT */
565 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
566
567 /* head of a link */
568 REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
569 /* already grabbed next link */
570 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
571 /* fail rest of links */
572 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
573 /* on inflight list */
574 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
575 /* read/write uses file position */
576 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
577 /* must not punt to workers */
578 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
579 /* has linked timeout */
580 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
581 /* timeout request */
582 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
583 /* regular file */
584 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
585 /* must be punted even for NONBLOCK */
586 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
587 /* no timeout sequence */
588 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
589 /* completion under lock */
590 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
591 /* needs cleanup */
592 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
593 /* in overflow list */
594 REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
595 /* already went through poll handler */
596 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
597 /* buffer already selected */
598 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
599 /* doesn't need file table for this request */
600 REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
601 /* needs to queue linked timeout */
602 REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
603 /* io_wq_work is initialized */
604 REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
605 /* req->task is refcounted */
606 REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
607 };
608
609 struct async_poll {
610 struct io_poll_iocb poll;
611 struct io_wq_work work;
612 };
613
614 /*
615 * NOTE! Each of the iocb union members has the file pointer
616 * as the first entry in their struct definition. So you can
617 * access the file pointer through any of the sub-structs,
618 * or directly as just 'ki_filp' in this struct.
619 */
620 struct io_kiocb {
621 union {
622 struct file *file;
623 struct io_rw rw;
624 struct io_poll_iocb poll;
625 struct io_accept accept;
626 struct io_sync sync;
627 struct io_cancel cancel;
628 struct io_timeout timeout;
629 struct io_connect connect;
630 struct io_sr_msg sr_msg;
631 struct io_open open;
632 struct io_close close;
633 struct io_files_update files_update;
634 struct io_fadvise fadvise;
635 struct io_madvise madvise;
636 struct io_epoll epoll;
637 struct io_splice splice;
638 struct io_provide_buf pbuf;
639 struct io_statx statx;
640 };
641
642 struct io_async_ctx *io;
643 int cflags;
644 u8 opcode;
645 /* polled IO has completed */
646 u8 iopoll_completed;
647
648 u16 buf_index;
649
650 struct io_ring_ctx *ctx;
651 struct list_head list;
652 unsigned int flags;
653 refcount_t refs;
654 struct task_struct *task;
655 unsigned long fsize;
656 u64 user_data;
657 u32 result;
658 u32 sequence;
659
660 struct list_head link_list;
661
662 struct list_head inflight_entry;
663
664 struct percpu_ref *fixed_file_refs;
665
666 union {
667 /*
668 * Only commands that never go async can use the below fields,
669 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
670 * async armed poll handlers for regular commands. The latter
671 * restore the work, if needed.
672 */
673 struct {
674 struct callback_head task_work;
675 struct hlist_node hash_node;
676 struct async_poll *apoll;
677 };
678 struct io_wq_work work;
679 };
680 };
681
682 #define IO_IOPOLL_BATCH 8
683
684 struct io_submit_state {
685 struct blk_plug plug;
686
687 /*
688 * io_kiocb alloc cache
689 */
690 void *reqs[IO_IOPOLL_BATCH];
691 unsigned int free_reqs;
692
693 /*
694 * File reference cache
695 */
696 struct file *file;
697 unsigned int fd;
698 unsigned int has_refs;
699 unsigned int used_refs;
700 unsigned int ios_left;
701 };
702
703 struct io_op_def {
704 /* needs req->io allocated for deferral/async */
705 unsigned async_ctx : 1;
706 /* needs current->mm setup, does mm access */
707 unsigned needs_mm : 1;
708 /* needs req->file assigned */
709 unsigned needs_file : 1;
710 /* don't fail if file grab fails */
711 unsigned needs_file_no_error : 1;
712 /* hash wq insertion if file is a regular file */
713 unsigned hash_reg_file : 1;
714 /* unbound wq insertion if file is a non-regular file */
715 unsigned unbound_nonreg_file : 1;
716 /* opcode is not supported by this kernel */
717 unsigned not_supported : 1;
718 /* needs file table */
719 unsigned file_table : 1;
720 /* needs ->fs */
721 unsigned needs_fs : 1;
722 /* set if opcode supports polled "wait" */
723 unsigned pollin : 1;
724 unsigned pollout : 1;
725 /* op supports buffer selection */
726 unsigned buffer_select : 1;
727 };
728
729 static const struct io_op_def io_op_defs[] = {
730 [IORING_OP_NOP] = {},
731 [IORING_OP_READV] = {
732 .async_ctx = 1,
733 .needs_mm = 1,
734 .needs_file = 1,
735 .unbound_nonreg_file = 1,
736 .pollin = 1,
737 .buffer_select = 1,
738 },
739 [IORING_OP_WRITEV] = {
740 .async_ctx = 1,
741 .needs_mm = 1,
742 .needs_file = 1,
743 .hash_reg_file = 1,
744 .unbound_nonreg_file = 1,
745 .pollout = 1,
746 },
747 [IORING_OP_FSYNC] = {
748 .needs_file = 1,
749 },
750 [IORING_OP_READ_FIXED] = {
751 .needs_file = 1,
752 .unbound_nonreg_file = 1,
753 .pollin = 1,
754 },
755 [IORING_OP_WRITE_FIXED] = {
756 .needs_file = 1,
757 .hash_reg_file = 1,
758 .unbound_nonreg_file = 1,
759 .pollout = 1,
760 },
761 [IORING_OP_POLL_ADD] = {
762 .needs_file = 1,
763 .unbound_nonreg_file = 1,
764 },
765 [IORING_OP_POLL_REMOVE] = {},
766 [IORING_OP_SYNC_FILE_RANGE] = {
767 .needs_file = 1,
768 },
769 [IORING_OP_SENDMSG] = {
770 .async_ctx = 1,
771 .needs_mm = 1,
772 .needs_file = 1,
773 .unbound_nonreg_file = 1,
774 .needs_fs = 1,
775 .pollout = 1,
776 },
777 [IORING_OP_RECVMSG] = {
778 .async_ctx = 1,
779 .needs_mm = 1,
780 .needs_file = 1,
781 .unbound_nonreg_file = 1,
782 .needs_fs = 1,
783 .pollin = 1,
784 .buffer_select = 1,
785 },
786 [IORING_OP_TIMEOUT] = {
787 .async_ctx = 1,
788 .needs_mm = 1,
789 },
790 [IORING_OP_TIMEOUT_REMOVE] = {},
791 [IORING_OP_ACCEPT] = {
792 .needs_mm = 1,
793 .needs_file = 1,
794 .unbound_nonreg_file = 1,
795 .file_table = 1,
796 .pollin = 1,
797 },
798 [IORING_OP_ASYNC_CANCEL] = {},
799 [IORING_OP_LINK_TIMEOUT] = {
800 .async_ctx = 1,
801 .needs_mm = 1,
802 },
803 [IORING_OP_CONNECT] = {
804 .async_ctx = 1,
805 .needs_mm = 1,
806 .needs_file = 1,
807 .unbound_nonreg_file = 1,
808 .pollout = 1,
809 },
810 [IORING_OP_FALLOCATE] = {
811 .needs_file = 1,
812 },
813 [IORING_OP_OPENAT] = {
814 .file_table = 1,
815 .needs_fs = 1,
816 },
817 [IORING_OP_CLOSE] = {
818 .needs_file = 1,
819 .needs_file_no_error = 1,
820 .file_table = 1,
821 },
822 [IORING_OP_FILES_UPDATE] = {
823 .needs_mm = 1,
824 .file_table = 1,
825 },
826 [IORING_OP_STATX] = {
827 .needs_mm = 1,
828 .needs_fs = 1,
829 .file_table = 1,
830 },
831 [IORING_OP_READ] = {
832 .needs_mm = 1,
833 .needs_file = 1,
834 .unbound_nonreg_file = 1,
835 .pollin = 1,
836 .buffer_select = 1,
837 },
838 [IORING_OP_WRITE] = {
839 .needs_mm = 1,
840 .needs_file = 1,
841 .unbound_nonreg_file = 1,
842 .pollout = 1,
843 },
844 [IORING_OP_FADVISE] = {
845 .needs_file = 1,
846 },
847 [IORING_OP_MADVISE] = {
848 .needs_mm = 1,
849 },
850 [IORING_OP_SEND] = {
851 .needs_mm = 1,
852 .needs_file = 1,
853 .unbound_nonreg_file = 1,
854 .pollout = 1,
855 },
856 [IORING_OP_RECV] = {
857 .needs_mm = 1,
858 .needs_file = 1,
859 .unbound_nonreg_file = 1,
860 .pollin = 1,
861 .buffer_select = 1,
862 },
863 [IORING_OP_OPENAT2] = {
864 .file_table = 1,
865 .needs_fs = 1,
866 },
867 [IORING_OP_EPOLL_CTL] = {
868 .unbound_nonreg_file = 1,
869 .file_table = 1,
870 },
871 [IORING_OP_SPLICE] = {
872 .needs_file = 1,
873 .hash_reg_file = 1,
874 .unbound_nonreg_file = 1,
875 },
876 [IORING_OP_PROVIDE_BUFFERS] = {},
877 [IORING_OP_REMOVE_BUFFERS] = {},
878 [IORING_OP_TEE] = {
879 .needs_file = 1,
880 .hash_reg_file = 1,
881 .unbound_nonreg_file = 1,
882 },
883 };
884
885 enum io_mem_account {
886 ACCT_LOCKED,
887 ACCT_PINNED,
888 };
889
890 static void io_wq_submit_work(struct io_wq_work **workptr);
891 static void io_cqring_fill_event(struct io_kiocb *req, long res);
892 static void io_put_req(struct io_kiocb *req);
893 static void __io_double_put_req(struct io_kiocb *req);
894 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
895 static void io_queue_linked_timeout(struct io_kiocb *req);
896 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
897 struct io_uring_files_update *ip,
898 unsigned nr_args);
899 static int io_grab_files(struct io_kiocb *req);
900 static void io_cleanup_req(struct io_kiocb *req);
901 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
902 int fd, struct file **out_file, bool fixed);
903 static void __io_queue_sqe(struct io_kiocb *req,
904 const struct io_uring_sqe *sqe);
905
906 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
907 struct iovec **iovec, struct iov_iter *iter,
908 bool needs_lock);
909 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
910 struct iovec *iovec, struct iovec *fast_iov,
911 struct iov_iter *iter);
912
913 static struct kmem_cache *req_cachep;
914
915 static const struct file_operations io_uring_fops;
916
917 struct sock *io_uring_get_socket(struct file *file)
918 {
919 #if defined(CONFIG_UNIX)
920 if (file->f_op == &io_uring_fops) {
921 struct io_ring_ctx *ctx = file->private_data;
922
923 return ctx->ring_sock->sk;
924 }
925 #endif
926 return NULL;
927 }
928 EXPORT_SYMBOL(io_uring_get_socket);
929
930 static void io_get_req_task(struct io_kiocb *req)
931 {
932 if (req->flags & REQ_F_TASK_PINNED)
933 return;
934 get_task_struct(req->task);
935 req->flags |= REQ_F_TASK_PINNED;
936 }
937
938 /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
939 static void __io_put_req_task(struct io_kiocb *req)
940 {
941 if (req->flags & REQ_F_TASK_PINNED)
942 put_task_struct(req->task);
943 }
944
945 static void io_file_put_work(struct work_struct *work);
946
947 /*
948 * Note: must call io_req_init_async() for the first time you
949 * touch any members of io_wq_work.
950 */
951 static inline void io_req_init_async(struct io_kiocb *req)
952 {
953 if (req->flags & REQ_F_WORK_INITIALIZED)
954 return;
955
956 memset(&req->work, 0, sizeof(req->work));
957 req->flags |= REQ_F_WORK_INITIALIZED;
958 }
959
960 static inline bool io_async_submit(struct io_ring_ctx *ctx)
961 {
962 return ctx->flags & IORING_SETUP_SQPOLL;
963 }
964
965 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
966 {
967 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
968
969 complete(&ctx->ref_comp);
970 }
971
972 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
973 {
974 struct io_ring_ctx *ctx;
975 int hash_bits;
976
977 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
978 if (!ctx)
979 return NULL;
980
981 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
982 if (!ctx->fallback_req)
983 goto err;
984
985 /*
986 * Use 5 bits less than the max cq entries, that should give us around
987 * 32 entries per hash list if totally full and uniformly spread.
988 */
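	/*
	 * Worked example (editorial note): with p->cq_entries == 4096,
	 * ilog2(4096) == 12, so hash_bits == 7 and the cancel hash gets
	 * 1U << 7 == 128 buckets -- 4096 / 128 == 32 entries per list
	 * when completely full.
	 */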
989 hash_bits = ilog2(p->cq_entries);
990 hash_bits -= 5;
991 if (hash_bits <= 0)
992 hash_bits = 1;
993 ctx->cancel_hash_bits = hash_bits;
994 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
995 GFP_KERNEL);
996 if (!ctx->cancel_hash)
997 goto err;
998 __hash_init(ctx->cancel_hash, 1U << hash_bits);
999
1000 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1001 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1002 goto err;
1003
1004 ctx->flags = p->flags;
1005 init_waitqueue_head(&ctx->sqo_wait);
1006 init_waitqueue_head(&ctx->cq_wait);
1007 INIT_LIST_HEAD(&ctx->cq_overflow_list);
1008 init_completion(&ctx->ref_comp);
1009 init_completion(&ctx->sq_thread_comp);
1010 idr_init(&ctx->io_buffer_idr);
1011 idr_init(&ctx->personality_idr);
1012 mutex_init(&ctx->uring_lock);
1013 init_waitqueue_head(&ctx->wait);
1014 spin_lock_init(&ctx->completion_lock);
1015 INIT_LIST_HEAD(&ctx->poll_list);
1016 INIT_LIST_HEAD(&ctx->defer_list);
1017 INIT_LIST_HEAD(&ctx->timeout_list);
1018 init_waitqueue_head(&ctx->inflight_wait);
1019 spin_lock_init(&ctx->inflight_lock);
1020 INIT_LIST_HEAD(&ctx->inflight_list);
1021 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1022 init_llist_head(&ctx->file_put_llist);
1023 return ctx;
1024 err:
1025 if (ctx->fallback_req)
1026 kmem_cache_free(req_cachep, ctx->fallback_req);
1027 kfree(ctx->cancel_hash);
1028 kfree(ctx);
1029 return NULL;
1030 }
1031
1032 static inline bool __req_need_defer(struct io_kiocb *req)
1033 {
1034 struct io_ring_ctx *ctx = req->ctx;
1035
1036 return req->sequence != ctx->cached_cq_tail
1037 + atomic_read(&ctx->cached_cq_overflow);
1038 }
1039
1040 static inline bool req_need_defer(struct io_kiocb *req)
1041 {
1042 if (unlikely(req->flags & REQ_F_IO_DRAIN))
1043 return __req_need_defer(req);
1044
1045 return false;
1046 }
1047
1048 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1049 {
1050 struct io_rings *rings = ctx->rings;
1051
1052 /* order cqe stores with ring update */
1053 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1054
1055 if (wq_has_sleeper(&ctx->cq_wait)) {
1056 wake_up_interruptible(&ctx->cq_wait);
1057 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1058 }
1059 }
1060
1061 static inline void io_req_work_grab_env(struct io_kiocb *req,
1062 const struct io_op_def *def)
1063 {
1064 if (!req->work.mm && def->needs_mm) {
1065 mmgrab(current->mm);
1066 req->work.mm = current->mm;
1067 }
1068 if (!req->work.creds)
1069 req->work.creds = get_current_cred();
1070 if (!req->work.fs && def->needs_fs) {
1071 spin_lock(&current->fs->lock);
1072 if (!current->fs->in_exec) {
1073 req->work.fs = current->fs;
1074 req->work.fs->users++;
1075 } else {
1076 req->work.flags |= IO_WQ_WORK_CANCEL;
1077 }
1078 spin_unlock(&current->fs->lock);
1079 }
1080 }
1081
1082 static inline void io_req_work_drop_env(struct io_kiocb *req)
1083 {
1084 if (!(req->flags & REQ_F_WORK_INITIALIZED))
1085 return;
1086
1087 if (req->work.mm) {
1088 mmdrop(req->work.mm);
1089 req->work.mm = NULL;
1090 }
1091 if (req->work.creds) {
1092 put_cred(req->work.creds);
1093 req->work.creds = NULL;
1094 }
1095 if (req->work.fs) {
1096 struct fs_struct *fs = req->work.fs;
1097
1098 spin_lock(&req->work.fs->lock);
1099 if (--fs->users)
1100 fs = NULL;
1101 spin_unlock(&req->work.fs->lock);
1102 if (fs)
1103 free_fs_struct(fs);
1104 }
1105 }
1106
1107 static inline void io_prep_async_work(struct io_kiocb *req,
1108 struct io_kiocb **link)
1109 {
1110 const struct io_op_def *def = &io_op_defs[req->opcode];
1111
1112 if (req->flags & REQ_F_ISREG) {
1113 if (def->hash_reg_file)
1114 io_wq_hash_work(&req->work, file_inode(req->file));
1115 } else {
1116 if (def->unbound_nonreg_file)
1117 req->work.flags |= IO_WQ_WORK_UNBOUND;
1118 }
1119
1120 io_req_init_async(req);
1121 io_req_work_grab_env(req, def);
1122
1123 *link = io_prep_linked_timeout(req);
1124 }
1125
1126 static inline void io_queue_async_work(struct io_kiocb *req)
1127 {
1128 struct io_ring_ctx *ctx = req->ctx;
1129 struct io_kiocb *link;
1130
1131 io_prep_async_work(req, &link);
1132
1133 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1134 &req->work, req->flags);
1135 io_wq_enqueue(ctx->io_wq, &req->work);
1136
1137 if (link)
1138 io_queue_linked_timeout(link);
1139 }
1140
1141 static void io_kill_timeout(struct io_kiocb *req)
1142 {
1143 int ret;
1144
1145 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1146 if (ret != -1) {
1147 atomic_inc(&req->ctx->cq_timeouts);
1148 list_del_init(&req->list);
1149 req->flags |= REQ_F_COMP_LOCKED;
1150 io_cqring_fill_event(req, 0);
1151 io_put_req(req);
1152 }
1153 }
1154
1155 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1156 {
1157 struct io_kiocb *req, *tmp;
1158
1159 spin_lock_irq(&ctx->completion_lock);
1160 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1161 io_kill_timeout(req);
1162 spin_unlock_irq(&ctx->completion_lock);
1163 }
1164
1165 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1166 {
1167 do {
1168 struct io_kiocb *req = list_first_entry(&ctx->defer_list,
1169 struct io_kiocb, list);
1170
1171 if (req_need_defer(req))
1172 break;
1173 list_del_init(&req->list);
1174 io_queue_async_work(req);
1175 } while (!list_empty(&ctx->defer_list));
1176 }
1177
1178 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1179 {
1180 while (!list_empty(&ctx->timeout_list)) {
1181 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1182 struct io_kiocb, list);
1183
1184 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
1185 break;
1186 if (req->timeout.target_seq != ctx->cached_cq_tail
1187 - atomic_read(&ctx->cq_timeouts))
1188 break;
1189
1190 list_del_init(&req->list);
1191 io_kill_timeout(req);
1192 }
1193 }
1194
1195 static void io_commit_cqring(struct io_ring_ctx *ctx)
1196 {
1197 io_flush_timeouts(ctx);
1198 __io_commit_cqring(ctx);
1199
1200 if (unlikely(!list_empty(&ctx->defer_list)))
1201 __io_queue_deferred(ctx);
1202 }
1203
1204 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1205 {
1206 struct io_rings *rings = ctx->rings;
1207 unsigned tail;
1208
1209 tail = ctx->cached_cq_tail;
1210 /*
1211 * writes to the cq entry need to come after reading head; the
1212 * control dependency is enough as we're using WRITE_ONCE to
1213 * fill the cq entry
1214 */
1215 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1216 return NULL;
1217
1218 ctx->cached_cq_tail++;
1219 return &rings->cqes[tail & ctx->cq_mask];
1220 }
1221
1222 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1223 {
1224 if (!ctx->cq_ev_fd)
1225 return false;
1226 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1227 return false;
1228 if (!ctx->eventfd_async)
1229 return true;
1230 return io_wq_current_is_worker();
1231 }
1232
1233 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1234 {
1235 if (waitqueue_active(&ctx->wait))
1236 wake_up(&ctx->wait);
1237 if (waitqueue_active(&ctx->sqo_wait))
1238 wake_up(&ctx->sqo_wait);
1239 if (io_should_trigger_evfd(ctx))
1240 eventfd_signal(ctx->cq_ev_fd, 1);
1241 }
1242
1243 /* Returns true if there are no backlogged entries after the flush */
1244 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1245 {
1246 struct io_rings *rings = ctx->rings;
1247 struct io_uring_cqe *cqe;
1248 struct io_kiocb *req;
1249 unsigned long flags;
1250 LIST_HEAD(list);
1251
1252 if (!force) {
1253 if (list_empty_careful(&ctx->cq_overflow_list))
1254 return true;
1255 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1256 rings->cq_ring_entries))
1257 return false;
1258 }
1259
1260 spin_lock_irqsave(&ctx->completion_lock, flags);
1261
1262 /* if force is set, the ring is going away. always drop after that */
1263 if (force)
1264 ctx->cq_overflow_flushed = 1;
1265
1266 cqe = NULL;
1267 while (!list_empty(&ctx->cq_overflow_list)) {
1268 cqe = io_get_cqring(ctx);
1269 if (!cqe && !force)
1270 break;
1271
1272 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1273 list);
1274 list_move(&req->list, &list);
1275 req->flags &= ~REQ_F_OVERFLOW;
1276 if (cqe) {
1277 WRITE_ONCE(cqe->user_data, req->user_data);
1278 WRITE_ONCE(cqe->res, req->result);
1279 WRITE_ONCE(cqe->flags, req->cflags);
1280 } else {
1281 WRITE_ONCE(ctx->rings->cq_overflow,
1282 atomic_inc_return(&ctx->cached_cq_overflow));
1283 }
1284 }
1285
1286 io_commit_cqring(ctx);
1287 if (cqe) {
1288 clear_bit(0, &ctx->sq_check_overflow);
1289 clear_bit(0, &ctx->cq_check_overflow);
1290 }
1291 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1292 io_cqring_ev_posted(ctx);
1293
1294 while (!list_empty(&list)) {
1295 req = list_first_entry(&list, struct io_kiocb, list);
1296 list_del(&req->list);
1297 io_put_req(req);
1298 }
1299
1300 return cqe != NULL;
1301 }
1302
1303 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1304 {
1305 struct io_ring_ctx *ctx = req->ctx;
1306 struct io_uring_cqe *cqe;
1307
1308 trace_io_uring_complete(ctx, req->user_data, res);
1309
1310 /*
1311 * If we can't get a cq entry, userspace overflowed the
1312 * submission (by quite a lot). Increment the overflow count in
1313 * the ring.
1314 */
1315 cqe = io_get_cqring(ctx);
1316 if (likely(cqe)) {
1317 WRITE_ONCE(cqe->user_data, req->user_data);
1318 WRITE_ONCE(cqe->res, res);
1319 WRITE_ONCE(cqe->flags, cflags);
1320 } else if (ctx->cq_overflow_flushed) {
1321 WRITE_ONCE(ctx->rings->cq_overflow,
1322 atomic_inc_return(&ctx->cached_cq_overflow));
1323 } else {
1324 if (list_empty(&ctx->cq_overflow_list)) {
1325 set_bit(0, &ctx->sq_check_overflow);
1326 set_bit(0, &ctx->cq_check_overflow);
1327 }
1328 req->flags |= REQ_F_OVERFLOW;
1329 refcount_inc(&req->refs);
1330 req->result = res;
1331 req->cflags = cflags;
1332 list_add_tail(&req->list, &ctx->cq_overflow_list);
1333 }
1334 }
1335
1336 static void io_cqring_fill_event(struct io_kiocb *req, long res)
1337 {
1338 __io_cqring_fill_event(req, res, 0);
1339 }
1340
1341 static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1342 {
1343 struct io_ring_ctx *ctx = req->ctx;
1344 unsigned long flags;
1345
1346 spin_lock_irqsave(&ctx->completion_lock, flags);
1347 __io_cqring_fill_event(req, res, cflags);
1348 io_commit_cqring(ctx);
1349 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1350
1351 io_cqring_ev_posted(ctx);
1352 }
1353
1354 static void io_cqring_add_event(struct io_kiocb *req, long res)
1355 {
1356 __io_cqring_add_event(req, res, 0);
1357 }
1358
1359 static inline bool io_is_fallback_req(struct io_kiocb *req)
1360 {
1361 return req == (struct io_kiocb *)
1362 ((unsigned long) req->ctx->fallback_req & ~1UL);
1363 }
1364
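/*
 * Editorial note: bit 0 of ctx->fallback_req doubles as an "in use" lock.
 * io_get_fallback_req() claims the request with test_and_set_bit_lock(0)
 * on the stored pointer value and __io_free_req() releases it with
 * clear_bit_unlock(0), which is why io_is_fallback_req() masks that bit
 * off with ~1UL before comparing pointers.
 */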
1365 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1366 {
1367 struct io_kiocb *req;
1368
1369 req = ctx->fallback_req;
1370 if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1371 return req;
1372
1373 return NULL;
1374 }
1375
1376 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1377 struct io_submit_state *state)
1378 {
1379 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1380 struct io_kiocb *req;
1381
1382 if (!state) {
1383 req = kmem_cache_alloc(req_cachep, gfp);
1384 if (unlikely(!req))
1385 goto fallback;
1386 } else if (!state->free_reqs) {
1387 size_t sz;
1388 int ret;
1389
1390 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1391 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1392
1393 /*
1394 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1395 * retry single alloc to be on the safe side.
1396 */
1397 if (unlikely(ret <= 0)) {
1398 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1399 if (!state->reqs[0])
1400 goto fallback;
1401 ret = 1;
1402 }
1403 state->free_reqs = ret - 1;
1404 req = state->reqs[ret - 1];
1405 } else {
1406 state->free_reqs--;
1407 req = state->reqs[state->free_reqs];
1408 }
1409
1410 return req;
1411 fallback:
1412 return io_get_fallback_req(ctx);
1413 }
1414
1415 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1416 bool fixed)
1417 {
1418 if (fixed)
1419 percpu_ref_put(req->fixed_file_refs);
1420 else
1421 fput(file);
1422 }
1423
1424 static void __io_req_aux_free(struct io_kiocb *req)
1425 {
1426 if (req->flags & REQ_F_NEED_CLEANUP)
1427 io_cleanup_req(req);
1428
1429 kfree(req->io);
1430 if (req->file)
1431 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1432 __io_put_req_task(req);
1433 io_req_work_drop_env(req);
1434 }
1435
1436 static void __io_free_req(struct io_kiocb *req)
1437 {
1438 __io_req_aux_free(req);
1439
1440 if (req->flags & REQ_F_INFLIGHT) {
1441 struct io_ring_ctx *ctx = req->ctx;
1442 unsigned long flags;
1443
1444 spin_lock_irqsave(&ctx->inflight_lock, flags);
1445 list_del(&req->inflight_entry);
1446 if (waitqueue_active(&ctx->inflight_wait))
1447 wake_up(&ctx->inflight_wait);
1448 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1449 }
1450
1451 percpu_ref_put(&req->ctx->refs);
1452 if (likely(!io_is_fallback_req(req)))
1453 kmem_cache_free(req_cachep, req);
1454 else
1455 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1456 }
1457
1458 struct req_batch {
1459 void *reqs[IO_IOPOLL_BATCH];
1460 int to_free;
1461 int need_iter;
1462 };
1463
1464 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1465 {
1466 if (!rb->to_free)
1467 return;
1468 if (rb->need_iter) {
1469 int i, inflight = 0;
1470 unsigned long flags;
1471
1472 for (i = 0; i < rb->to_free; i++) {
1473 struct io_kiocb *req = rb->reqs[i];
1474
1475 if (req->flags & REQ_F_INFLIGHT)
1476 inflight++;
1477 __io_req_aux_free(req);
1478 }
1479 if (!inflight)
1480 goto do_free;
1481
1482 spin_lock_irqsave(&ctx->inflight_lock, flags);
1483 for (i = 0; i < rb->to_free; i++) {
1484 struct io_kiocb *req = rb->reqs[i];
1485
1486 if (req->flags & REQ_F_INFLIGHT) {
1487 list_del(&req->inflight_entry);
1488 if (!--inflight)
1489 break;
1490 }
1491 }
1492 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1493
1494 if (waitqueue_active(&ctx->inflight_wait))
1495 wake_up(&ctx->inflight_wait);
1496 }
1497 do_free:
1498 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1499 percpu_ref_put_many(&ctx->refs, rb->to_free);
1500 rb->to_free = rb->need_iter = 0;
1501 }
1502
1503 static bool io_link_cancel_timeout(struct io_kiocb *req)
1504 {
1505 struct io_ring_ctx *ctx = req->ctx;
1506 int ret;
1507
1508 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1509 if (ret != -1) {
1510 io_cqring_fill_event(req, -ECANCELED);
1511 io_commit_cqring(ctx);
1512 req->flags &= ~REQ_F_LINK_HEAD;
1513 io_put_req(req);
1514 return true;
1515 }
1516
1517 return false;
1518 }
1519
1520 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1521 {
1522 struct io_ring_ctx *ctx = req->ctx;
1523 bool wake_ev = false;
1524
1525 /* Already got next link */
1526 if (req->flags & REQ_F_LINK_NEXT)
1527 return;
1528
1529 /*
1530          * The list should never be empty when we are called here. But it could
1531          * potentially happen if the chain is messed up, so check to be on the
1532          * safe side.
1533 */
1534 while (!list_empty(&req->link_list)) {
1535 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1536 struct io_kiocb, link_list);
1537
1538 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1539 (nxt->flags & REQ_F_TIMEOUT))) {
1540 list_del_init(&nxt->link_list);
1541 wake_ev |= io_link_cancel_timeout(nxt);
1542 req->flags &= ~REQ_F_LINK_TIMEOUT;
1543 continue;
1544 }
1545
1546 list_del_init(&req->link_list);
1547 if (!list_empty(&nxt->link_list))
1548 nxt->flags |= REQ_F_LINK_HEAD;
1549 *nxtptr = nxt;
1550 break;
1551 }
1552
1553 req->flags |= REQ_F_LINK_NEXT;
1554 if (wake_ev)
1555 io_cqring_ev_posted(ctx);
1556 }
1557
1558 /*
1559 * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1560 */
1561 static void io_fail_links(struct io_kiocb *req)
1562 {
1563 struct io_ring_ctx *ctx = req->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->completion_lock, flags);
1567
1568 while (!list_empty(&req->link_list)) {
1569 struct io_kiocb *link = list_first_entry(&req->link_list,
1570 struct io_kiocb, link_list);
1571
1572 list_del_init(&link->link_list);
1573 trace_io_uring_fail_link(req, link);
1574
1575 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1576 link->opcode == IORING_OP_LINK_TIMEOUT) {
1577 io_link_cancel_timeout(link);
1578 } else {
1579 io_cqring_fill_event(link, -ECANCELED);
1580 __io_double_put_req(link);
1581 }
1582 req->flags &= ~REQ_F_LINK_TIMEOUT;
1583 }
1584
1585 io_commit_cqring(ctx);
1586 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1587 io_cqring_ev_posted(ctx);
1588 }
1589
1590 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1591 {
1592 if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1593 return;
1594
1595 /*
1596 * If LINK is set, we have dependent requests in this chain. If we
1597 * didn't fail this request, queue the first one up, moving any other
1598 * dependencies to the next request. In case of failure, fail the rest
1599 * of the chain.
1600 */
1601 if (req->flags & REQ_F_FAIL_LINK) {
1602 io_fail_links(req);
1603 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1604 REQ_F_LINK_TIMEOUT) {
1605 struct io_ring_ctx *ctx = req->ctx;
1606 unsigned long flags;
1607
1608 /*
1609 * If this is a timeout link, we could be racing with the
1610 * timeout timer. Grab the completion lock for this case to
1611 * protect against that.
1612 */
1613 spin_lock_irqsave(&ctx->completion_lock, flags);
1614 io_req_link_next(req, nxt);
1615 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1616 } else {
1617 io_req_link_next(req, nxt);
1618 }
1619 }
1620
1621 static void io_free_req(struct io_kiocb *req)
1622 {
1623 struct io_kiocb *nxt = NULL;
1624
1625 io_req_find_next(req, &nxt);
1626 __io_free_req(req);
1627
1628 if (nxt)
1629 io_queue_async_work(nxt);
1630 }
1631
1632 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1633 {
1634 struct io_kiocb *link;
1635 const struct io_op_def *def = &io_op_defs[nxt->opcode];
1636
1637 if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
1638 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1639
1640 *workptr = &nxt->work;
1641 link = io_prep_linked_timeout(nxt);
1642 if (link)
1643 nxt->flags |= REQ_F_QUEUE_TIMEOUT;
1644 }
1645
1646 /*
1647 * Drop reference to request, return next in chain (if there is one) if this
1648 * was the last reference to this request.
1649 */
1650 __attribute__((nonnull))
1651 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1652 {
1653 if (refcount_dec_and_test(&req->refs)) {
1654 io_req_find_next(req, nxtptr);
1655 __io_free_req(req);
1656 }
1657 }
1658
1659 static void io_put_req(struct io_kiocb *req)
1660 {
1661 if (refcount_dec_and_test(&req->refs))
1662 io_free_req(req);
1663 }
1664
1665 static void io_steal_work(struct io_kiocb *req,
1666 struct io_wq_work **workptr)
1667 {
1668 /*
1669 * It's in an io-wq worker, so there always should be at least
1670 * one reference, which will be dropped in io_put_work() just
1671 * after the current handler returns.
1672 *
1673          * It also means that if the counter dropped to 1, then there are
1674          * no asynchronous users left, so it's safe to steal the next work.
1675 */
1676 if (refcount_read(&req->refs) == 1) {
1677 struct io_kiocb *nxt = NULL;
1678
1679 io_req_find_next(req, &nxt);
1680 if (nxt)
1681 io_wq_assign_next(workptr, nxt);
1682 }
1683 }
1684
1685 /*
1686 * Must only be used if we don't need to care about links, usually from
1687 * within the completion handling itself.
1688 */
1689 static void __io_double_put_req(struct io_kiocb *req)
1690 {
1691 /* drop both submit and complete references */
1692 if (refcount_sub_and_test(2, &req->refs))
1693 __io_free_req(req);
1694 }
1695
1696 static void io_double_put_req(struct io_kiocb *req)
1697 {
1698 /* drop both submit and complete references */
1699 if (refcount_sub_and_test(2, &req->refs))
1700 io_free_req(req);
1701 }
1702
1703 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1704 {
1705 struct io_rings *rings = ctx->rings;
1706
1707 if (test_bit(0, &ctx->cq_check_overflow)) {
1708 /*
1709 * noflush == true is from the waitqueue handler, just ensure
1710 * we wake up the task, and the next invocation will flush the
1711                  * entries. We cannot safely do it from here.
1712 */
1713 if (noflush && !list_empty(&ctx->cq_overflow_list))
1714 return -1U;
1715
1716 io_cqring_overflow_flush(ctx, false);
1717 }
1718
1719 /* See comment at the top of this file */
1720 smp_rmb();
1721 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1722 }
1723
1724 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1725 {
1726 struct io_rings *rings = ctx->rings;
1727
1728 /* make sure SQ entry isn't read before tail */
1729 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1730 }
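/*
 * Illustrative sketch (editorial addition): the smp_load_acquire() above
 * pairs with the application publishing new submissions roughly like this
 * (userspace pseudo-code against the mmap'ed rings, as liburing does; the
 * names mirror the shared ring fields):
 *
 *	sqes[idx] = ...;                          // fill the SQE
 *	sq_array[tail & sq_ring_mask] = idx;      // point the ring at it
 *	smp_store_release(&sq.tail, tail + 1);    // publish the new tail
 *
 * so the kernel never observes the new tail before the entry contents.
 */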
1731
1732 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
1733 {
1734 if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
1735 return false;
1736
1737 if (req->file || req->io)
1738 rb->need_iter++;
1739
1740 rb->reqs[rb->to_free++] = req;
1741 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1742 io_free_req_many(req->ctx, rb);
1743 return true;
1744 }
1745
1746 static int io_put_kbuf(struct io_kiocb *req)
1747 {
1748 struct io_buffer *kbuf;
1749 int cflags;
1750
1751 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1752 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1753 cflags |= IORING_CQE_F_BUFFER;
1754 req->rw.addr = 0;
1755 kfree(kbuf);
1756 return cflags;
1757 }
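/*
 * Editorial note: the cflags built above end up in cqe->flags for a
 * buffer-select request, so the application recovers the buffer id
 * roughly as:
 *
 *	if (cqe->flags & IORING_CQE_F_BUFFER)
 *		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 */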
1758
1759 static void io_iopoll_queue(struct list_head *again)
1760 {
1761 struct io_kiocb *req;
1762
1763 do {
1764 req = list_first_entry(again, struct io_kiocb, list);
1765 list_del(&req->list);
1766 refcount_inc(&req->refs);
1767 io_queue_async_work(req);
1768 } while (!list_empty(again));
1769 }
1770
1771 /*
1772 * Find and free completed poll iocbs
1773 */
1774 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1775 struct list_head *done)
1776 {
1777 struct req_batch rb;
1778 struct io_kiocb *req;
1779 LIST_HEAD(again);
1780
1781 /* order with ->result store in io_complete_rw_iopoll() */
1782 smp_rmb();
1783
1784 rb.to_free = rb.need_iter = 0;
1785 while (!list_empty(done)) {
1786 int cflags = 0;
1787
1788 req = list_first_entry(done, struct io_kiocb, list);
1789 if (READ_ONCE(req->result) == -EAGAIN) {
1790 req->iopoll_completed = 0;
1791 list_move_tail(&req->list, &again);
1792 continue;
1793 }
1794 list_del(&req->list);
1795
1796 if (req->flags & REQ_F_BUFFER_SELECTED)
1797 cflags = io_put_kbuf(req);
1798
1799 __io_cqring_fill_event(req, req->result, cflags);
1800 (*nr_events)++;
1801
1802 if (refcount_dec_and_test(&req->refs) &&
1803 !io_req_multi_free(&rb, req))
1804 io_free_req(req);
1805 }
1806
1807 io_commit_cqring(ctx);
1808 if (ctx->flags & IORING_SETUP_SQPOLL)
1809 io_cqring_ev_posted(ctx);
1810 io_free_req_many(ctx, &rb);
1811
1812 if (!list_empty(&again))
1813 io_iopoll_queue(&again);
1814 }
1815
1816 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1817 long min)
1818 {
1819 struct io_kiocb *req, *tmp;
1820 LIST_HEAD(done);
1821 bool spin;
1822 int ret;
1823
1824 /*
1825 * Only spin for completions if we don't have multiple devices hanging
1826 * off our complete list, and we're under the requested amount.
1827 */
1828 spin = !ctx->poll_multi_file && *nr_events < min;
1829
1830 ret = 0;
1831 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1832 struct kiocb *kiocb = &req->rw.kiocb;
1833
1834 /*
1835 * Move completed and retryable entries to our local lists.
1836 * If we find a request that requires polling, break out
1837 * and complete those lists first, if we have entries there.
1838 */
1839 if (READ_ONCE(req->iopoll_completed)) {
1840 list_move_tail(&req->list, &done);
1841 continue;
1842 }
1843 if (!list_empty(&done))
1844 break;
1845
1846 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1847 if (ret < 0)
1848 break;
1849
1850 if (ret && spin)
1851 spin = false;
1852 ret = 0;
1853 }
1854
1855 if (!list_empty(&done))
1856 io_iopoll_complete(ctx, nr_events, &done);
1857
1858 return ret;
1859 }
1860
1861 /*
1862 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1863 * non-spinning poll check - we'll still enter the driver poll loop, but only
1864 * as a non-spinning completion check.
1865 */
1866 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1867 long min)
1868 {
1869 while (!list_empty(&ctx->poll_list) && !need_resched()) {
1870 int ret;
1871
1872 ret = io_do_iopoll(ctx, nr_events, min);
1873 if (ret < 0)
1874 return ret;
1875 if (!min || *nr_events >= min)
1876 return 0;
1877 }
1878
1879 return 1;
1880 }
1881
1882 /*
1883 * We can't just wait for polled events to come to us, we have to actively
1884 * find and complete them.
1885 */
1886 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1887 {
1888 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1889 return;
1890
1891 mutex_lock(&ctx->uring_lock);
1892 while (!list_empty(&ctx->poll_list)) {
1893 unsigned int nr_events = 0;
1894
1895 io_iopoll_getevents(ctx, &nr_events, 1);
1896
1897 /*
1898 * Ensure we allow local-to-the-cpu processing to take place,
1899 * in this case we need to ensure that we reap all events.
1900 */
1901 cond_resched();
1902 }
1903 mutex_unlock(&ctx->uring_lock);
1904 }
1905
1906 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1907 long min)
1908 {
1909 int iters = 0, ret = 0;
1910
1911 /*
1912 * We disallow the app entering submit/complete with polling, but we
1913 * still need to lock the ring to prevent racing with polled issue
1914 * that got punted to a workqueue.
1915 */
1916 mutex_lock(&ctx->uring_lock);
1917 do {
1918 int tmin = 0;
1919
1920 /*
1921 * Don't enter poll loop if we already have events pending.
1922 * If we do, we can potentially be spinning for commands that
1923 * already triggered a CQE (eg in error).
1924 */
1925 if (io_cqring_events(ctx, false))
1926 break;
1927
1928 /*
1929 * If a submit got punted to a workqueue, we can have the
1930 * application entering polling for a command before it gets
1931 * issued. That app will hold the uring_lock for the duration
1932 * of the poll right here, so we need to take a breather every
1933 * now and then to ensure that the issue has a chance to add
1934 * the poll to the issued list. Otherwise we can spin here
1935 * forever, while the workqueue is stuck trying to acquire the
1936 * very same mutex.
1937 */
1938 if (!(++iters & 7)) {
1939 mutex_unlock(&ctx->uring_lock);
1940 mutex_lock(&ctx->uring_lock);
1941 }
1942
1943 if (*nr_events < min)
1944 tmin = min - *nr_events;
1945
1946 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1947 if (ret <= 0)
1948 break;
1949 ret = 0;
1950 } while (min && !*nr_events && !need_resched());
1951
1952 mutex_unlock(&ctx->uring_lock);
1953 return ret;
1954 }
1955
1956 static void kiocb_end_write(struct io_kiocb *req)
1957 {
1958 /*
1959 * Tell lockdep we inherited freeze protection from submission
1960 * thread.
1961 */
1962 if (req->flags & REQ_F_ISREG) {
1963 struct inode *inode = file_inode(req->file);
1964
1965 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1966 }
1967 file_end_write(req->file);
1968 }
1969
1970 static inline void req_set_fail_links(struct io_kiocb *req)
1971 {
1972 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1973 req->flags |= REQ_F_FAIL_LINK;
1974 }
1975
1976 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1977 {
1978 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1979 int cflags = 0;
1980
1981 if (kiocb->ki_flags & IOCB_WRITE)
1982 kiocb_end_write(req);
1983
1984 if (res != req->result)
1985 req_set_fail_links(req);
1986 if (req->flags & REQ_F_BUFFER_SELECTED)
1987 cflags = io_put_kbuf(req);
1988 __io_cqring_add_event(req, res, cflags);
1989 }
1990
1991 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
1992 {
1993 struct mm_struct *mm = current->mm;
1994
1995 if (mm) {
1996 kthread_unuse_mm(mm);
1997 mmput(mm);
1998 }
1999 }
2000
2001 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
2002 struct io_kiocb *req)
2003 {
2004 if (io_op_defs[req->opcode].needs_mm && !current->mm) {
2005 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
2006 return -EFAULT;
2007 kthread_use_mm(ctx->sqo_mm);
2008 }
2009
2010 return 0;
2011 }
2012
2013 #ifdef CONFIG_BLOCK
2014 static bool io_resubmit_prep(struct io_kiocb *req, int error)
2015 {
2016 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2017 ssize_t ret = -ECANCELED;
2018 struct iov_iter iter;
2019 int rw;
2020
2021 if (error) {
2022 ret = error;
2023 goto end_req;
2024 }
2025
2026 switch (req->opcode) {
2027 case IORING_OP_READV:
2028 case IORING_OP_READ_FIXED:
2029 case IORING_OP_READ:
2030 rw = READ;
2031 break;
2032 case IORING_OP_WRITEV:
2033 case IORING_OP_WRITE_FIXED:
2034 case IORING_OP_WRITE:
2035 rw = WRITE;
2036 break;
2037 default:
2038 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2039 req->opcode);
2040 goto end_req;
2041 }
2042
2043 ret = io_import_iovec(rw, req, &iovec, &iter, false);
2044 if (ret < 0)
2045 goto end_req;
2046 ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
2047 if (!ret)
2048 return true;
2049 kfree(iovec);
2050 end_req:
2051 io_cqring_add_event(req, ret);
2052 req_set_fail_links(req);
2053 io_put_req(req);
2054 return false;
2055 }
2056
2057 static void io_rw_resubmit(struct callback_head *cb)
2058 {
2059 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2060 struct io_ring_ctx *ctx = req->ctx;
2061 int err;
2062
2063 __set_current_state(TASK_RUNNING);
2064
2065 err = io_sq_thread_acquire_mm(ctx, req);
2066
2067 if (io_resubmit_prep(req, err)) {
2068 refcount_inc(&req->refs);
2069 io_queue_async_work(req);
2070 }
2071 }
2072 #endif
2073
2074 static bool io_rw_reissue(struct io_kiocb *req, long res)
2075 {
2076 #ifdef CONFIG_BLOCK
2077 struct task_struct *tsk;
2078 int ret;
2079
2080 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
2081 return false;
2082
2083 tsk = req->task;
2084 init_task_work(&req->task_work, io_rw_resubmit);
2085 ret = task_work_add(tsk, &req->task_work, true);
2086 if (!ret)
2087 return true;
2088 #endif
2089 return false;
2090 }
2091
2092 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2093 {
2094 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2095
2096 if (!io_rw_reissue(req, res)) {
2097 io_complete_rw_common(kiocb, res);
2098 io_put_req(req);
2099 }
2100 }
2101
2102 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2103 {
2104 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2105
2106 if (kiocb->ki_flags & IOCB_WRITE)
2107 kiocb_end_write(req);
2108
2109 if (res != -EAGAIN && res != req->result)
2110 req_set_fail_links(req);
2111
2112 WRITE_ONCE(req->result, res);
2113 /* order with io_poll_complete() checking ->result */
2114 if (res != -EAGAIN) {
2115 smp_wmb();
2116 WRITE_ONCE(req->iopoll_completed, 1);
2117 }
2118 }
2119
2120 /*
2121 * After the iocb has been issued, it's safe to be found on the poll list.
2122 * Adding the kiocb to the list AFTER submission ensures that we don't
2123 * find it from an io_iopoll_getevents() thread before the issuer is done
2124 * accessing the kiocb cookie.
2125 */
2126 static void io_iopoll_req_issued(struct io_kiocb *req)
2127 {
2128 struct io_ring_ctx *ctx = req->ctx;
2129
2130 /*
2131 * Track whether we have multiple files in our lists. This will impact
2132 * how we do polling eventually: we avoid spinning if we may be polling
2133 * potentially different devices.
2134 */
2135 if (list_empty(&ctx->poll_list)) {
2136 ctx->poll_multi_file = false;
2137 } else if (!ctx->poll_multi_file) {
2138 struct io_kiocb *list_req;
2139
2140 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
2141 list);
2142 if (list_req->file != req->file)
2143 ctx->poll_multi_file = true;
2144 }
2145
2146 /*
2147 * For fast devices, IO may have already completed. If it has, add
2148 * it to the front so we find it first.
2149 */
2150 if (READ_ONCE(req->iopoll_completed))
2151 list_add(&req->list, &ctx->poll_list);
2152 else
2153 list_add_tail(&req->list, &ctx->poll_list);
2154
2155 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2156 wq_has_sleeper(&ctx->sqo_wait))
2157 wake_up(&ctx->sqo_wait);
2158 }
2159
2160 static void __io_state_file_put(struct io_submit_state *state)
2161 {
2162 int diff = state->has_refs - state->used_refs;
2163
2164 if (diff)
2165 fput_many(state->file, diff);
2166 state->file = NULL;
2167 }
2168
2169 static inline void io_state_file_put(struct io_submit_state *state)
2170 {
2171 if (state->file)
2172 __io_state_file_put(state);
2173 }
2174
2175 /*
2176 * Get as many references to a file as we have IOs left in this submission,
2177 * assuming most submissions are for one file, or at least that each file
2178 * has more than one submission.
2179 */
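/*
 * For example (illustrative numbers): with state->ios_left == 8 and all
 * eight SQEs in the batch targeting the same fd, a single
 * fget_many(fd, 8) takes all the references up front, and
 * __io_state_file_put() later drops whatever wasn't consumed via
 * fput_many().
 */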
2180 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2181 {
2182 if (!state)
2183 return fget(fd);
2184
2185 if (state->file) {
2186 if (state->fd == fd) {
2187 state->used_refs++;
2188 state->ios_left--;
2189 return state->file;
2190 }
2191 __io_state_file_put(state);
2192 }
2193 state->file = fget_many(fd, state->ios_left);
2194 if (!state->file)
2195 return NULL;
2196
2197 state->fd = fd;
2198 state->has_refs = state->ios_left;
2199 state->used_refs = 1;
2200 state->ios_left--;
2201 return state->file;
2202 }
2203
2204 static bool io_bdev_nowait(struct block_device *bdev)
2205 {
2206 #ifdef CONFIG_BLOCK
2207 return !bdev || queue_is_mq(bdev_get_queue(bdev));
2208 #else
2209 return true;
2210 #endif
2211 }
2212
2213 /*
2214 * If we tracked the file through the SCM inflight mechanism, we could support
2215 * any file. For now, just ensure that anything potentially problematic is done
2216 * inline.
2217 */
2218 static bool io_file_supports_async(struct file *file, int rw)
2219 {
2220 umode_t mode = file_inode(file)->i_mode;
2221
2222 if (S_ISBLK(mode)) {
2223 if (io_bdev_nowait(file->f_inode->i_bdev))
2224 return true;
2225 return false;
2226 }
2227 if (S_ISCHR(mode) || S_ISSOCK(mode))
2228 return true;
2229 if (S_ISREG(mode)) {
2230 if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2231 file->f_op != &io_uring_fops)
2232 return true;
2233 return false;
2234 }
2235
2236 /* any ->read/write should understand O_NONBLOCK */
2237 if (file->f_flags & O_NONBLOCK)
2238 return true;
2239
2240 if (!(file->f_mode & FMODE_NOWAIT))
2241 return false;
2242
2243 if (rw == READ)
2244 return file->f_op->read_iter != NULL;
2245
2246 return file->f_op->write_iter != NULL;
2247 }
2248
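/*
 * Common read/write preparation: fill in the kiocb from the SQE (file
 * position, rw flags, ioprio), honor force_nonblock by setting
 * IOCB_NOWAIT, and pick the completion handler - io_complete_rw_iopoll
 * for IORING_SETUP_IOPOLL rings (which also require IOCB_DIRECT and
 * ->iopoll support), io_complete_rw otherwise.
 */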
2249 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2250 bool force_nonblock)
2251 {
2252 struct io_ring_ctx *ctx = req->ctx;
2253 struct kiocb *kiocb = &req->rw.kiocb;
2254 unsigned ioprio;
2255 int ret;
2256
2257 if (S_ISREG(file_inode(req->file)->i_mode))
2258 req->flags |= REQ_F_ISREG;
2259
2260 kiocb->ki_pos = READ_ONCE(sqe->off);
2261 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2262 req->flags |= REQ_F_CUR_POS;
2263 kiocb->ki_pos = req->file->f_pos;
2264 }
2265 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2266 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2267 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2268 if (unlikely(ret))
2269 return ret;
2270
2271 ioprio = READ_ONCE(sqe->ioprio);
2272 if (ioprio) {
2273 ret = ioprio_check_cap(ioprio);
2274 if (ret)
2275 return ret;
2276
2277 kiocb->ki_ioprio = ioprio;
2278 } else
2279 kiocb->ki_ioprio = get_current_ioprio();
2280
2281 /* don't allow async punt if RWF_NOWAIT was requested */
2282 if (kiocb->ki_flags & IOCB_NOWAIT)
2283 req->flags |= REQ_F_NOWAIT;
2284
2285 if (kiocb->ki_flags & IOCB_DIRECT)
2286 io_get_req_task(req);
2287
2288 if (force_nonblock)
2289 kiocb->ki_flags |= IOCB_NOWAIT;
2290
2291 if (ctx->flags & IORING_SETUP_IOPOLL) {
2292 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2293 !kiocb->ki_filp->f_op->iopoll)
2294 return -EOPNOTSUPP;
2295
2296 kiocb->ki_flags |= IOCB_HIPRI;
2297 kiocb->ki_complete = io_complete_rw_iopoll;
2298 req->result = 0;
2299 req->iopoll_completed = 0;
2300 } else {
2301 if (kiocb->ki_flags & IOCB_HIPRI)
2302 return -EINVAL;
2303 kiocb->ki_complete = io_complete_rw;
2304 }
2305
2306 req->rw.addr = READ_ONCE(sqe->addr);
2307 req->rw.len = READ_ONCE(sqe->len);
2308 req->buf_index = READ_ONCE(sqe->buf_index);
2309 return 0;
2310 }
2311
2312 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2313 {
2314 switch (ret) {
2315 case -EIOCBQUEUED:
2316 break;
2317 case -ERESTARTSYS:
2318 case -ERESTARTNOINTR:
2319 case -ERESTARTNOHAND:
2320 case -ERESTART_RESTARTBLOCK:
2321 /*
2322 * We can't just restart the syscall, since previously
2323 * submitted sqes may already be in progress. Just fail this
2324 * IO with EINTR.
2325 */
2326 ret = -EINTR;
2327 /* fall through */
2328 default:
2329 kiocb->ki_complete(kiocb, ret, 0);
2330 }
2331 }
2332
2333 static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
2334 {
2335 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2336
2337 if (req->flags & REQ_F_CUR_POS)
2338 req->file->f_pos = kiocb->ki_pos;
2339 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2340 io_complete_rw(kiocb, ret, 0);
2341 else
2342 io_rw_done(kiocb, ret);
2343 }
2344
2345 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2346 struct iov_iter *iter)
2347 {
2348 struct io_ring_ctx *ctx = req->ctx;
2349 size_t len = req->rw.len;
2350 struct io_mapped_ubuf *imu;
2351 u16 index, buf_index;
2352 size_t offset;
2353 u64 buf_addr;
2354
2355 /* attempt to use fixed buffers without having provided iovecs */
2356 if (unlikely(!ctx->user_bufs))
2357 return -EFAULT;
2358
2359 buf_index = req->buf_index;
2360 if (unlikely(buf_index >= ctx->nr_user_bufs))
2361 return -EFAULT;
2362
2363 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2364 imu = &ctx->user_bufs[index];
2365 buf_addr = req->rw.addr;
2366
2367 /* overflow */
2368 if (buf_addr + len < buf_addr)
2369 return -EFAULT;
2370 /* not inside the mapped region */
2371 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2372 return -EFAULT;
2373
2374 /*
2375 * May not be the start of the buffer; set the size appropriately
2376 * and advance us to the beginning.
2377 */
2378 offset = buf_addr - imu->ubuf;
2379 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2380
2381 if (offset) {
2382 /*
2383 * Don't use iov_iter_advance() here, as it's really slow for
2384 * using the latter parts of a big fixed buffer - it iterates
2385 * over each segment manually. We can cheat a bit here, because
2386 * we know that:
2387 *
2388 * 1) it's a BVEC iter, we set it up
2389 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2390 * first and last bvec
2391 *
2392 * So just find our index, and adjust the iterator afterwards.
2393 * If the offset is within the first bvec (or the whole first
2394 * bvec), just use iov_iter_advance(). This makes it easier
2395 * since we can just skip the first segment, which may not
2396 * be PAGE_SIZE aligned.
2397 */
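/*
 * Rough worked example, assuming 4K pages and a page-sized first bvec:
 * for offset == 9000 we skip the first bvec (4096 bytes, leaving 4904),
 * seg_skip = 1 + (4904 >> 12) = 2, and iov_offset = 4904 & 4095 = 808,
 * i.e. the iterator resumes 808 bytes into the third bvec.
 */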
2398 const struct bio_vec *bvec = imu->bvec;
2399
2400 if (offset <= bvec->bv_len) {
2401 iov_iter_advance(iter, offset);
2402 } else {
2403 unsigned long seg_skip;
2404
2405 /* skip first vec */
2406 offset -= bvec->bv_len;
2407 seg_skip = 1 + (offset >> PAGE_SHIFT);
2408
2409 iter->bvec = bvec + seg_skip;
2410 iter->nr_segs -= seg_skip;
2411 iter->count -= bvec->bv_len + offset;
2412 iter->iov_offset = offset & ~PAGE_MASK;
2413 }
2414 }
2415
2416 return len;
2417 }
2418
2419 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2420 {
2421 if (needs_lock)
2422 mutex_unlock(&ctx->uring_lock);
2423 }
2424
2425 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2426 {
2427 /*
2428 * "Normal" inline submissions always hold the uring_lock, since we
2429 * grab it from the system call. Same is true for the SQPOLL offload.
2430 * The only exception is when we've detached the request and issue it
2431 * from an async worker thread; grab the lock for that case.
2432 */
2433 if (needs_lock)
2434 mutex_lock(&ctx->uring_lock);
2435 }
2436
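/*
 * Pick a buffer from the provided-buffer group 'bgid': look the group up
 * in io_buffer_idr, pop the most recently added entry from its list (or
 * take the head itself if it's the last one, removing the group), and
 * clamp *len to the size of the chosen buffer. Returns ERR_PTR(-ENOBUFS)
 * if the group is empty or doesn't exist.
 */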
2437 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2438 int bgid, struct io_buffer *kbuf,
2439 bool needs_lock)
2440 {
2441 struct io_buffer *head;
2442
2443 if (req->flags & REQ_F_BUFFER_SELECTED)
2444 return kbuf;
2445
2446 io_ring_submit_lock(req->ctx, needs_lock);
2447
2448 lockdep_assert_held(&req->ctx->uring_lock);
2449
2450 head = idr_find(&req->ctx->io_buffer_idr, bgid);
2451 if (head) {
2452 if (!list_empty(&head->list)) {
2453 kbuf = list_last_entry(&head->list, struct io_buffer,
2454 list);
2455 list_del(&kbuf->list);
2456 } else {
2457 kbuf = head;
2458 idr_remove(&req->ctx->io_buffer_idr, bgid);
2459 }
2460 if (*len > kbuf->len)
2461 *len = kbuf->len;
2462 } else {
2463 kbuf = ERR_PTR(-ENOBUFS);
2464 }
2465
2466 io_ring_submit_unlock(req->ctx, needs_lock);
2467
2468 return kbuf;
2469 }
2470
2471 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2472 bool needs_lock)
2473 {
2474 struct io_buffer *kbuf;
2475 u16 bgid;
2476
2477 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2478 bgid = req->buf_index;
2479 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2480 if (IS_ERR(kbuf))
2481 return kbuf;
2482 req->rw.addr = (u64) (unsigned long) kbuf;
2483 req->flags |= REQ_F_BUFFER_SELECTED;
2484 return u64_to_user_ptr(kbuf->addr);
2485 }
2486
2487 #ifdef CONFIG_COMPAT
2488 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2489 bool needs_lock)
2490 {
2491 struct compat_iovec __user *uiov;
2492 compat_ssize_t clen;
2493 void __user *buf;
2494 ssize_t len;
2495
2496 uiov = u64_to_user_ptr(req->rw.addr);
2497 if (!access_ok(uiov, sizeof(*uiov)))
2498 return -EFAULT;
2499 if (__get_user(clen, &uiov->iov_len))
2500 return -EFAULT;
2501 if (clen < 0)
2502 return -EINVAL;
2503
2504 len = clen;
2505 buf = io_rw_buffer_select(req, &len, needs_lock);
2506 if (IS_ERR(buf))
2507 return PTR_ERR(buf);
2508 iov[0].iov_base = buf;
2509 iov[0].iov_len = (compat_size_t) len;
2510 return 0;
2511 }
2512 #endif
2513
2514 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2515 bool needs_lock)
2516 {
2517 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2518 void __user *buf;
2519 ssize_t len;
2520
2521 if (copy_from_user(iov, uiov, sizeof(*uiov)))
2522 return -EFAULT;
2523
2524 len = iov[0].iov_len;
2525 if (len < 0)
2526 return -EINVAL;
2527 buf = io_rw_buffer_select(req, &len, needs_lock);
2528 if (IS_ERR(buf))
2529 return PTR_ERR(buf);
2530 iov[0].iov_base = buf;
2531 iov[0].iov_len = len;
2532 return 0;
2533 }
2534
2535 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2536 bool needs_lock)
2537 {
2538 if (req->flags & REQ_F_BUFFER_SELECTED) {
2539 struct io_buffer *kbuf;
2540
2541 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2542 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2543 iov[0].iov_len = kbuf->len;
2544 return 0;
2545 }
2546 if (!req->rw.len)
2547 return 0;
2548 else if (req->rw.len > 1)
2549 return -EINVAL;
2550
2551 #ifdef CONFIG_COMPAT
2552 if (req->ctx->compat)
2553 return io_compat_import(req, iov, needs_lock);
2554 #endif
2555
2556 return __io_iov_buffer_select(req, iov, needs_lock);
2557 }
2558
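/*
 * Resolve the request's data into an iov_iter. Fixed read/write uses the
 * registered buffers via io_import_fixed(); IORING_OP_READ/WRITE import a
 * single range (optionally picking a provided buffer first); otherwise a
 * full iovec is imported from userspace, unless req->io already holds one
 * prepared earlier for async retry, or REQ_F_BUFFER_SELECT picks a single
 * provided buffer.
 */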
2559 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2560 struct iovec **iovec, struct iov_iter *iter,
2561 bool needs_lock)
2562 {
2563 void __user *buf = u64_to_user_ptr(req->rw.addr);
2564 size_t sqe_len = req->rw.len;
2565 ssize_t ret;
2566 u8 opcode;
2567
2568 opcode = req->opcode;
2569 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2570 *iovec = NULL;
2571 return io_import_fixed(req, rw, iter);
2572 }
2573
2574 /* buffer index only valid with fixed read/write, or buffer select */
2575 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2576 return -EINVAL;
2577
2578 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2579 if (req->flags & REQ_F_BUFFER_SELECT) {
2580 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2581 if (IS_ERR(buf)) {
2582 *iovec = NULL;
2583 return PTR_ERR(buf);
2584 }
2585 req->rw.len = sqe_len;
2586 }
2587
2588 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2589 *iovec = NULL;
2590 return ret < 0 ? ret : sqe_len;
2591 }
2592
2593 if (req->io) {
2594 struct io_async_rw *iorw = &req->io->rw;
2595
2596 *iovec = iorw->iov;
2597 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2598 if (iorw->iov == iorw->fast_iov)
2599 *iovec = NULL;
2600 return iorw->size;
2601 }
2602
2603 if (req->flags & REQ_F_BUFFER_SELECT) {
2604 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2605 if (!ret) {
2606 ret = (*iovec)->iov_len;
2607 iov_iter_init(iter, rw, *iovec, 1, ret);
2608 }
2609 *iovec = NULL;
2610 return ret;
2611 }
2612
2613 #ifdef CONFIG_COMPAT
2614 if (req->ctx->compat)
2615 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2616 iovec, iter);
2617 #endif
2618
2619 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2620 }
2621
2622 /*
2623 * For files that don't have ->read_iter() and ->write_iter(), handle them
2624 * by looping over ->read() or ->write() manually.
2625 */
2626 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2627 struct iov_iter *iter)
2628 {
2629 ssize_t ret = 0;
2630
2631 /*
2632 * Don't support polled IO through this interface, and we can't
2633 * support non-blocking either. For the latter, this just causes
2634 * the kiocb to be handled from an async context.
2635 */
2636 if (kiocb->ki_flags & IOCB_HIPRI)
2637 return -EOPNOTSUPP;
2638 if (kiocb->ki_flags & IOCB_NOWAIT)
2639 return -EAGAIN;
2640
2641 while (iov_iter_count(iter)) {
2642 struct iovec iovec;
2643 ssize_t nr;
2644
2645 if (!iov_iter_is_bvec(iter)) {
2646 iovec = iov_iter_iovec(iter);
2647 } else {
2648 /* fixed buffers import bvec */
2649 iovec.iov_base = kmap(iter->bvec->bv_page)
2650 + iter->iov_offset;
2651 iovec.iov_len = min(iter->count,
2652 iter->bvec->bv_len - iter->iov_offset);
2653 }
2654
2655 if (rw == READ) {
2656 nr = file->f_op->read(file, iovec.iov_base,
2657 iovec.iov_len, &kiocb->ki_pos);
2658 } else {
2659 nr = file->f_op->write(file, iovec.iov_base,
2660 iovec.iov_len, &kiocb->ki_pos);
2661 }
2662
2663 if (iov_iter_is_bvec(iter))
2664 kunmap(iter->bvec->bv_page);
2665
2666 if (nr < 0) {
2667 if (!ret)
2668 ret = nr;
2669 break;
2670 }
2671 ret += nr;
2672 if (nr != iovec.iov_len)
2673 break;
2674 iov_iter_advance(iter, nr);
2675 }
2676
2677 return ret;
2678 }
2679
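/*
 * Stash the iterator state in req->io so the request can be retried from
 * async context: record segment count and size, and either point at the
 * caller's allocated iovec (marking it for cleanup) or copy the on-stack
 * fast_iov into req->io.
 */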
2680 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2681 struct iovec *iovec, struct iovec *fast_iov,
2682 struct iov_iter *iter)
2683 {
2684 req->io->rw.nr_segs = iter->nr_segs;
2685 req->io->rw.size = io_size;
2686 req->io->rw.iov = iovec;
2687 if (!req->io->rw.iov) {
2688 req->io->rw.iov = req->io->rw.fast_iov;
2689 if (req->io->rw.iov != fast_iov)
2690 memcpy(req->io->rw.iov, fast_iov,
2691 sizeof(struct iovec) * iter->nr_segs);
2692 } else {
2693 req->flags |= REQ_F_NEED_CLEANUP;
2694 }
2695 }
2696
2697 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2698 {
2699 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2700 return req->io == NULL;
2701 }
2702
2703 static int io_alloc_async_ctx(struct io_kiocb *req)
2704 {
2705 if (!io_op_defs[req->opcode].async_ctx)
2706 return 0;
2707
2708 return __io_alloc_async_ctx(req);
2709 }
2710
2711 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2712 struct iovec *iovec, struct iovec *fast_iov,
2713 struct iov_iter *iter)
2714 {
2715 if (!io_op_defs[req->opcode].async_ctx)
2716 return 0;
2717 if (!req->io) {
2718 if (__io_alloc_async_ctx(req))
2719 return -ENOMEM;
2720
2721 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2722 }
2723 return 0;
2724 }
2725
2726 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2727 bool force_nonblock)
2728 {
2729 struct io_async_ctx *io;
2730 struct iov_iter iter;
2731 ssize_t ret;
2732
2733 ret = io_prep_rw(req, sqe, force_nonblock);
2734 if (ret)
2735 return ret;
2736
2737 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2738 return -EBADF;
2739
2740 /* either we don't need the iovec imported, or we already have it */
2741 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2742 return 0;
2743
2744 io = req->io;
2745 io->rw.iov = io->rw.fast_iov;
2746 req->io = NULL;
2747 ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2748 req->io = io;
2749 if (ret < 0)
2750 return ret;
2751
2752 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2753 return 0;
2754 }
2755
2756 static void __io_async_buf_error(struct io_kiocb *req, int error)
2757 {
2758 struct io_ring_ctx *ctx = req->ctx;
2759
2760 spin_lock_irq(&ctx->completion_lock);
2761 io_cqring_fill_event(req, error);
2762 io_commit_cqring(ctx);
2763 spin_unlock_irq(&ctx->completion_lock);
2764
2765 io_cqring_ev_posted(ctx);
2766 req_set_fail_links(req);
2767 io_double_put_req(req);
2768 }
2769
2770 static void io_async_buf_cancel(struct callback_head *cb)
2771 {
2772 struct io_async_rw *rw;
2773 struct io_kiocb *req;
2774
2775 rw = container_of(cb, struct io_async_rw, task_work);
2776 req = rw->wpq.wait.private;
2777 __io_async_buf_error(req, -ECANCELED);
2778 }
2779
2780 static void io_async_buf_retry(struct callback_head *cb)
2781 {
2782 struct io_async_rw *rw;
2783 struct io_ring_ctx *ctx;
2784 struct io_kiocb *req;
2785
2786 rw = container_of(cb, struct io_async_rw, task_work);
2787 req = rw->wpq.wait.private;
2788 ctx = req->ctx;
2789
2790 __set_current_state(TASK_RUNNING);
2791 if (!io_sq_thread_acquire_mm(ctx, req)) {
2792 mutex_lock(&ctx->uring_lock);
2793 __io_queue_sqe(req, NULL);
2794 mutex_unlock(&ctx->uring_lock);
2795 } else {
2796 __io_async_buf_error(req, -EFAULT);
2797 }
2798 }
2799
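/*
 * Wait-queue callback for buffered retries: invoked when the page we
 * blocked on is unlocked. If the wake matches our page, detach from the
 * waitqueue and queue io_async_buf_retry() as task_work on the original
 * task so the request is reissued from there; if that fails, fall back to
 * queueing io_async_buf_cancel() on the io-wq task to complete the
 * request with -ECANCELED.
 */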
2800 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
2801 int sync, void *arg)
2802 {
2803 struct wait_page_queue *wpq;
2804 struct io_kiocb *req = wait->private;
2805 struct io_async_rw *rw = &req->io->rw;
2806 struct wait_page_key *key = arg;
2807 struct task_struct *tsk;
2808 int ret;
2809
2810 wpq = container_of(wait, struct wait_page_queue, wait);
2811
2812 ret = wake_page_match(wpq, key);
2813 if (ret != 1)
2814 return ret;
2815
2816 list_del_init(&wait->entry);
2817
2818 init_task_work(&rw->task_work, io_async_buf_retry);
2819 /* submit ref gets dropped, acquire a new one */
2820 refcount_inc(&req->refs);
2821 tsk = req->task;
2822 ret = task_work_add(tsk, &rw->task_work, true);
2823 if (unlikely(ret)) {
2824 /* queue just for cancelation */
2825 init_task_work(&rw->task_work, io_async_buf_cancel);
2826 tsk = io_wq_get_task(req->ctx->io_wq);
2827 task_work_add(tsk, &rw->task_work, true);
2828 }
2829 wake_up_process(tsk);
2830 return 1;
2831 }
2832
2833 static bool io_rw_should_retry(struct io_kiocb *req)
2834 {
2835 struct kiocb *kiocb = &req->rw.kiocb;
2836 int ret;
2837
2838 /* never retry for NOWAIT, we just complete with -EAGAIN */
2839 if (req->flags & REQ_F_NOWAIT)
2840 return false;
2841
2842 /* already tried, or we're doing O_DIRECT */
2843 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
2844 return false;
2845 /*
2846 * Just use poll if we can, and don't attempt this if the fs doesn't
2847 * support callback-based unlocks.
2848 */
2849 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
2850 return false;
2851
2852 /*
2853 * If the request type doesn't require req->io for deferral in
2854 * general, we need to allocate it here.
2855 */
2856 if (!req->io && __io_alloc_async_ctx(req))
2857 return false;
2858
2859 ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
2860 io_async_buf_func, req);
2861 if (!ret) {
2862 io_get_req_task(req);
2863 return true;
2864 }
2865
2866 return false;
2867 }
2868
2869 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
2870 {
2871 if (req->file->f_op->read_iter)
2872 return call_read_iter(req->file, &req->rw.kiocb, iter);
2873 return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
2874 }
2875
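/*
 * Issue a read. The fast path tries a (possibly non-blocking) ->read_iter;
 * on -EAGAIN under force_nonblock we set up the async context and, if the
 * file supports callback-based page unlocks, retry immediately with the
 * waitqueue armed (io_rw_should_retry); otherwise -EAGAIN is returned so
 * the request gets punted to io-wq.
 */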
2876 static int io_read(struct io_kiocb *req, bool force_nonblock)
2877 {
2878 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2879 struct kiocb *kiocb = &req->rw.kiocb;
2880 struct iov_iter iter;
2881 size_t iov_count;
2882 ssize_t io_size, ret;
2883
2884 ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
2885 if (ret < 0)
2886 return ret;
2887
2888 /* Ensure we clear previously set non-block flag */
2889 if (!force_nonblock)
2890 kiocb->ki_flags &= ~IOCB_NOWAIT;
2891
2892 req->result = 0;
2893 io_size = ret;
2894 if (req->flags & REQ_F_LINK_HEAD)
2895 req->result = io_size;
2896
2897 /*
2898 * If the file doesn't support async, async punt it so it's retried
2899 * from a blocking context, even if it was opened O_NONBLOCK
2900 */
2901 if (force_nonblock && !io_file_supports_async(req->file, READ))
2902 goto copy_iov;
2903
2904 iov_count = iov_iter_count(&iter);
2905 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2906 if (!ret) {
2907 unsigned long nr_segs = iter.nr_segs;
2908 ssize_t ret2 = 0;
2909
2910 ret2 = io_iter_do_read(req, &iter);
2911
2912 /* Catch -EAGAIN return for forced non-blocking submission */
2913 if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
2914 kiocb_done(kiocb, ret2);
2915 } else {
2916 iter.count = iov_count;
2917 iter.nr_segs = nr_segs;
2918 copy_iov:
2919 ret = io_setup_async_rw(req, io_size, iovec,
2920 inline_vecs, &iter);
2921 if (ret)
2922 goto out_free;
2923 /* if we can retry, do so with the callbacks armed */
2924 if (io_rw_should_retry(req)) {
2925 ret2 = io_iter_do_read(req, &iter);
2926 if (ret2 == -EIOCBQUEUED) {
2927 goto out_free;
2928 } else if (ret2 != -EAGAIN) {
2929 kiocb_done(kiocb, ret2);
2930 goto out_free;
2931 }
2932 }
2933 kiocb->ki_flags &= ~IOCB_WAITQ;
2934 return -EAGAIN;
2935 }
2936 }
2937 out_free:
2938 if (!(req->flags & REQ_F_NEED_CLEANUP))
2939 kfree(iovec);
2940 return ret;
2941 }
2942
2943 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2944 bool force_nonblock)
2945 {
2946 struct io_async_ctx *io;
2947 struct iov_iter iter;
2948 ssize_t ret;
2949
2950 ret = io_prep_rw(req, sqe, force_nonblock);
2951 if (ret)
2952 return ret;
2953
2954 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2955 return -EBADF;
2956
2957 req->fsize = rlimit(RLIMIT_FSIZE);
2958
2959 /* either we don't need the iovec imported, or we already have it */
2960 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2961 return 0;
2962
2963 io = req->io;
2964 io->rw.iov = io->rw.fast_iov;
2965 req->io = NULL;
2966 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
2967 req->io = io;
2968 if (ret < 0)
2969 return ret;
2970
2971 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2972 return 0;
2973 }
2974
2975 static int io_write(struct io_kiocb *req, bool force_nonblock)
2976 {
2977 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2978 struct kiocb *kiocb = &req->rw.kiocb;
2979 struct iov_iter iter;
2980 size_t iov_count;
2981 ssize_t ret, io_size;
2982
2983 ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
2984 if (ret < 0)
2985 return ret;
2986
2987 /* Ensure we clear previously set non-block flag */
2988 if (!force_nonblock)
2989 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2990
2991 req->result = 0;
2992 io_size = ret;
2993 if (req->flags & REQ_F_LINK_HEAD)
2994 req->result = io_size;
2995
2996 /*
2997 * If the file doesn't support async, async punt it so it's retried
2998 * from a blocking context, even if it was opened O_NONBLOCK
2999 */
3000 if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3001 goto copy_iov;
3002
3003 /* the file path doesn't support NOWAIT for non-direct IO */
3004 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3005 (req->flags & REQ_F_ISREG))
3006 goto copy_iov;
3007
3008 iov_count = iov_iter_count(&iter);
3009 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
3010 if (!ret) {
3011 unsigned long nr_segs = iter.nr_segs;
3012 ssize_t ret2;
3013
3014 /*
3015 * Open-code file_start_write here to grab freeze protection,
3016 * which will be released by another thread in
3017 * io_complete_rw(). Fool lockdep by telling it the lock got
3018 * released so that it doesn't complain about the held lock when
3019 * we return to userspace.
3020 */
3021 if (req->flags & REQ_F_ISREG) {
3022 __sb_start_write(file_inode(req->file)->i_sb,
3023 SB_FREEZE_WRITE, true);
3024 __sb_writers_release(file_inode(req->file)->i_sb,
3025 SB_FREEZE_WRITE);
3026 }
3027 kiocb->ki_flags |= IOCB_WRITE;
3028
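/*
 * req->fsize is the submitter's RLIMIT_FSIZE, captured at prep time. The
 * blocking path may be running from an io-wq worker with its own limits,
 * so apply the captured value around the actual write and set it back to
 * RLIM_INFINITY afterwards.
 */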
3029 if (!force_nonblock)
3030 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
3031
3032 if (req->file->f_op->write_iter)
3033 ret2 = call_write_iter(req->file, kiocb, &iter);
3034 else
3035 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
3036
3037 if (!force_nonblock)
3038 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
3039
3040 /*
3041 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3042 * retry them without IOCB_NOWAIT.
3043 */
3044 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3045 ret2 = -EAGAIN;
3046 if (!force_nonblock || ret2 != -EAGAIN) {
3047 kiocb_done(kiocb, ret2);
3048 } else {
3049 iter.count = iov_count;
3050 iter.nr_segs = nr_segs;
3051 copy_iov:
3052 ret = io_setup_async_rw(req, io_size, iovec,
3053 inline_vecs, &iter);
3054 if (ret)
3055 goto out_free;
3056 return -EAGAIN;
3057 }
3058 }
3059 out_free:
3060 if (!(req->flags & REQ_F_NEED_CLEANUP))
3061 kfree(iovec);
3062 return ret;
3063 }
3064
3065 static int __io_splice_prep(struct io_kiocb *req,
3066 const struct io_uring_sqe *sqe)
3067 {
3068 struct io_splice* sp = &req->splice;
3069 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3070 int ret;
3071
3072 if (req->flags & REQ_F_NEED_CLEANUP)
3073 return 0;
3074 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3075 return -EINVAL;
3076
3077 sp->file_in = NULL;
3078 sp->len = READ_ONCE(sqe->len);
3079 sp->flags = READ_ONCE(sqe->splice_flags);
3080
3081 if (unlikely(sp->flags & ~valid_flags))
3082 return -EINVAL;
3083
3084 ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
3085 (sp->flags & SPLICE_F_FD_IN_FIXED));
3086 if (ret)
3087 return ret;
3088 req->flags |= REQ_F_NEED_CLEANUP;
3089
3090 if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3091 /*
3092 * The splice operation will be punted async; since we need to
3093 * modify io_wq_work.flags here, initialize io_wq_work first.
3094 */
3095 io_req_init_async(req);
3096 req->work.flags |= IO_WQ_WORK_UNBOUND;
3097 }
3098
3099 return 0;
3100 }
3101
3102 static int io_tee_prep(struct io_kiocb *req,
3103 const struct io_uring_sqe *sqe)
3104 {
3105 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3106 return -EINVAL;
3107 return __io_splice_prep(req, sqe);
3108 }
3109
3110 static int io_tee(struct io_kiocb *req, bool force_nonblock)
3111 {
3112 struct io_splice *sp = &req->splice;
3113 struct file *in = sp->file_in;
3114 struct file *out = sp->file_out;
3115 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3116 long ret = 0;
3117
3118 if (force_nonblock)
3119 return -EAGAIN;
3120 if (sp->len)
3121 ret = do_tee(in, out, sp->len, flags);
3122
3123 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3124 req->flags &= ~REQ_F_NEED_CLEANUP;
3125
3126 io_cqring_add_event(req, ret);
3127 if (ret != sp->len)
3128 req_set_fail_links(req);
3129 io_put_req(req);
3130 return 0;
3131 }
3132
3133 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3134 {
3135 struct io_splice* sp = &req->splice;
3136
3137 sp->off_in = READ_ONCE(sqe->splice_off_in);
3138 sp->off_out = READ_ONCE(sqe->off);
3139 return __io_splice_prep(req, sqe);
3140 }
3141
3142 static int io_splice(struct io_kiocb *req, bool force_nonblock)
3143 {
3144 struct io_splice *sp = &req->splice;
3145 struct file *in = sp->file_in;
3146 struct file *out = sp->file_out;
3147 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3148 loff_t *poff_in, *poff_out;
3149 long ret = 0;
3150
3151 if (force_nonblock)
3152 return -EAGAIN;
3153
3154 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3155 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3156
3157 if (sp->len)
3158 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3159
3160 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3161 req->flags &= ~REQ_F_NEED_CLEANUP;
3162
3163 io_cqring_add_event(req, ret);
3164 if (ret != sp->len)
3165 req_set_fail_links(req);
3166 io_put_req(req);
3167 return 0;
3168 }
3169
3170 /*
3171 * IORING_OP_NOP just posts a completion event, nothing else.
3172 */
3173 static int io_nop(struct io_kiocb *req)
3174 {
3175 struct io_ring_ctx *ctx = req->ctx;
3176
3177 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3178 return -EINVAL;
3179
3180 io_cqring_add_event(req, 0);
3181 io_put_req(req);
3182 return 0;
3183 }
3184
3185 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3186 {
3187 struct io_ring_ctx *ctx = req->ctx;
3188
3189 if (!req->file)
3190 return -EBADF;
3191
3192 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3193 return -EINVAL;
3194 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3195 return -EINVAL;
3196
3197 req->sync.flags = READ_ONCE(sqe->fsync_flags);
3198 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3199 return -EINVAL;
3200
3201 req->sync.off = READ_ONCE(sqe->off);
3202 req->sync.len = READ_ONCE(sqe->len);
3203 return 0;
3204 }
3205
3206 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
3207 {
3208 loff_t end = req->sync.off + req->sync.len;
3209 int ret;
3210
3211 /* fsync always requires a blocking context */
3212 if (force_nonblock)
3213 return -EAGAIN;
3214
3215 ret = vfs_fsync_range(req->file, req->sync.off,
3216 end > 0 ? end : LLONG_MAX,
3217 req->sync.flags & IORING_FSYNC_DATASYNC);
3218 if (ret < 0)
3219 req_set_fail_links(req);
3220 io_cqring_add_event(req, ret);
3221 io_put_req(req);
3222 return 0;
3223 }
3224
3225 static int io_fallocate_prep(struct io_kiocb *req,
3226 const struct io_uring_sqe *sqe)
3227 {
3228 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3229 return -EINVAL;
3230 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3231 return -EINVAL;
3232
3233 req->sync.off = READ_ONCE(sqe->off);
3234 req->sync.len = READ_ONCE(sqe->addr);
3235 req->sync.mode = READ_ONCE(sqe->len);
3236 req->fsize = rlimit(RLIMIT_FSIZE);
3237 return 0;
3238 }
3239
3240 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
3241 {
3242 int ret;
3243
3244 /* fallocate always requires a blocking context */
3245 if (force_nonblock)
3246 return -EAGAIN;
3247
3248 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
3249 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3250 req->sync.len);
3251 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
3252 if (ret < 0)
3253 req_set_fail_links(req);
3254 io_cqring_add_event(req, ret);
3255 io_put_req(req);
3256 return 0;
3257 }
3258
3259 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3260 {
3261 const char __user *fname;
3262 int ret;
3263
3264 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3265 return -EINVAL;
3266 if (unlikely(sqe->ioprio || sqe->buf_index))
3267 return -EINVAL;
3268 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3269 return -EBADF;
3270
3271 /* open.how should already be initialised */
3272 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3273 req->open.how.flags |= O_LARGEFILE;
3274
3275 req->open.dfd = READ_ONCE(sqe->fd);
3276 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3277 req->open.filename = getname(fname);
3278 if (IS_ERR(req->open.filename)) {
3279 ret = PTR_ERR(req->open.filename);
3280 req->open.filename = NULL;
3281 return ret;
3282 }
3283 req->open.nofile = rlimit(RLIMIT_NOFILE);
3284 req->flags |= REQ_F_NEED_CLEANUP;
3285 return 0;
3286 }
3287
3288 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3289 {
3290 u64 flags, mode;
3291
3292 if (req->flags & REQ_F_NEED_CLEANUP)
3293 return 0;
3294 mode = READ_ONCE(sqe->len);
3295 flags = READ_ONCE(sqe->open_flags);
3296 req->open.how = build_open_how(flags, mode);
3297 return __io_openat_prep(req, sqe);
3298 }
3299
3300 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3301 {
3302 struct open_how __user *how;
3303 size_t len;
3304 int ret;
3305
3306 if (req->flags & REQ_F_NEED_CLEANUP)
3307 return 0;
3308 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3309 len = READ_ONCE(sqe->len);
3310 if (len < OPEN_HOW_SIZE_VER0)
3311 return -EINVAL;
3312
3313 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3314 len);
3315 if (ret)
3316 return ret;
3317
3318 return __io_openat_prep(req, sqe);
3319 }
3320
3321 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3322 {
3323 struct open_flags op;
3324 struct file *file;
3325 int ret;
3326
3327 if (force_nonblock)
3328 return -EAGAIN;
3329
3330 ret = build_open_flags(&req->open.how, &op);
3331 if (ret)
3332 goto err;
3333
3334 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3335 if (ret < 0)
3336 goto err;
3337
3338 file = do_filp_open(req->open.dfd, req->open.filename, &op);
3339 if (IS_ERR(file)) {
3340 put_unused_fd(ret);
3341 ret = PTR_ERR(file);
3342 } else {
3343 fsnotify_open(file);
3344 fd_install(ret, file);
3345 }
3346 err:
3347 putname(req->open.filename);
3348 req->flags &= ~REQ_F_NEED_CLEANUP;
3349 if (ret < 0)
3350 req_set_fail_links(req);
3351 io_cqring_add_event(req, ret);
3352 io_put_req(req);
3353 return 0;
3354 }
3355
3356 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3357 {
3358 return io_openat2(req, force_nonblock);
3359 }
3360
3361 static int io_remove_buffers_prep(struct io_kiocb *req,
3362 const struct io_uring_sqe *sqe)
3363 {
3364 struct io_provide_buf *p = &req->pbuf;
3365 u64 tmp;
3366
3367 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3368 return -EINVAL;
3369
3370 tmp = READ_ONCE(sqe->fd);
3371 if (!tmp || tmp > USHRT_MAX)
3372 return -EINVAL;
3373
3374 memset(p, 0, sizeof(*p));
3375 p->nbufs = tmp;
3376 p->bgid = READ_ONCE(sqe->buf_group);
3377 return 0;
3378 }
3379
3380 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3381 int bgid, unsigned nbufs)
3382 {
3383 unsigned i = 0;
3384
3385 /* shouldn't happen */
3386 if (!nbufs)
3387 return 0;
3388
3389 /* the head kbuf is the list itself */
3390 while (!list_empty(&buf->list)) {
3391 struct io_buffer *nxt;
3392
3393 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3394 list_del(&nxt->list);
3395 kfree(nxt);
3396 if (++i == nbufs)
3397 return i;
3398 }
3399 i++;
3400 kfree(buf);
3401 idr_remove(&ctx->io_buffer_idr, bgid);
3402
3403 return i;
3404 }
3405
3406 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3407 {
3408 struct io_provide_buf *p = &req->pbuf;
3409 struct io_ring_ctx *ctx = req->ctx;
3410 struct io_buffer *head;
3411 int ret = 0;
3412
3413 io_ring_submit_lock(ctx, !force_nonblock);
3414
3415 lockdep_assert_held(&ctx->uring_lock);
3416
3417 ret = -ENOENT;
3418 head = idr_find(&ctx->io_buffer_idr, p->bgid);
3419 if (head)
3420 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3421
3422 io_ring_submit_unlock(ctx, !force_nonblock);
3423 if (ret < 0)
3424 req_set_fail_links(req);
3425 io_cqring_add_event(req, ret);
3426 io_put_req(req);
3427 return 0;
3428 }
3429
3430 static int io_provide_buffers_prep(struct io_kiocb *req,
3431 const struct io_uring_sqe *sqe)
3432 {
3433 struct io_provide_buf *p = &req->pbuf;
3434 u64 tmp;
3435
3436 if (sqe->ioprio || sqe->rw_flags)
3437 return -EINVAL;
3438
3439 tmp = READ_ONCE(sqe->fd);
3440 if (!tmp || tmp > USHRT_MAX)
3441 return -E2BIG;
3442 p->nbufs = tmp;
3443 p->addr = READ_ONCE(sqe->addr);
3444 p->len = READ_ONCE(sqe->len);
3445
3446 if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3447 return -EFAULT;
3448
3449 p->bgid = READ_ONCE(sqe->buf_group);
3450 tmp = READ_ONCE(sqe->off);
3451 if (tmp > USHRT_MAX)
3452 return -E2BIG;
3453 p->bid = tmp;
3454 return 0;
3455 }
3456
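/*
 * Allocate up to pbuf->nbufs io_buffer descriptors covering consecutive
 * pbuf->len-sized chunks starting at pbuf->addr, with increasing buffer
 * IDs from pbuf->bid. If *head is NULL the first new buffer becomes the
 * list head; otherwise buffers are appended to the existing group.
 * Returns how many were added, or -ENOMEM if none could be allocated.
 */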
3457 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3458 {
3459 struct io_buffer *buf;
3460 u64 addr = pbuf->addr;
3461 int i, bid = pbuf->bid;
3462
3463 for (i = 0; i < pbuf->nbufs; i++) {
3464 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3465 if (!buf)
3466 break;
3467
3468 buf->addr = addr;
3469 buf->len = pbuf->len;
3470 buf->bid = bid;
3471 addr += pbuf->len;
3472 bid++;
3473 if (!*head) {
3474 INIT_LIST_HEAD(&buf->list);
3475 *head = buf;
3476 } else {
3477 list_add_tail(&buf->list, &(*head)->list);
3478 }
3479 }
3480
3481 return i ? i : -ENOMEM;
3482 }
3483
3484 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3485 {
3486 struct io_provide_buf *p = &req->pbuf;
3487 struct io_ring_ctx *ctx = req->ctx;
3488 struct io_buffer *head, *list;
3489 int ret = 0;
3490
3491 io_ring_submit_lock(ctx, !force_nonblock);
3492
3493 lockdep_assert_held(&ctx->uring_lock);
3494
3495 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3496
3497 ret = io_add_buffers(p, &head);
3498 if (ret < 0)
3499 goto out;
3500
3501 if (!list) {
3502 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3503 GFP_KERNEL);
3504 if (ret < 0) {
3505 __io_remove_buffers(ctx, head, p->bgid, -1U);
3506 goto out;
3507 }
3508 }
3509 out:
3510 io_ring_submit_unlock(ctx, !force_nonblock);
3511 if (ret < 0)
3512 req_set_fail_links(req);
3513 io_cqring_add_event(req, ret);
3514 io_put_req(req);
3515 return 0;
3516 }
3517
3518 static int io_epoll_ctl_prep(struct io_kiocb *req,
3519 const struct io_uring_sqe *sqe)
3520 {
3521 #if defined(CONFIG_EPOLL)
3522 if (sqe->ioprio || sqe->buf_index)
3523 return -EINVAL;
3524 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3525 return -EINVAL;
3526
3527 req->epoll.epfd = READ_ONCE(sqe->fd);
3528 req->epoll.op = READ_ONCE(sqe->len);
3529 req->epoll.fd = READ_ONCE(sqe->off);
3530
3531 if (ep_op_has_event(req->epoll.op)) {
3532 struct epoll_event __user *ev;
3533
3534 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3535 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3536 return -EFAULT;
3537 }
3538
3539 return 0;
3540 #else
3541 return -EOPNOTSUPP;
3542 #endif
3543 }
3544
3545 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3546 {
3547 #if defined(CONFIG_EPOLL)
3548 struct io_epoll *ie = &req->epoll;
3549 int ret;
3550
3551 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3552 if (force_nonblock && ret == -EAGAIN)
3553 return -EAGAIN;
3554
3555 if (ret < 0)
3556 req_set_fail_links(req);
3557 io_cqring_add_event(req, ret);
3558 io_put_req(req);
3559 return 0;
3560 #else
3561 return -EOPNOTSUPP;
3562 #endif
3563 }
3564
3565 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3566 {
3567 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3568 if (sqe->ioprio || sqe->buf_index || sqe->off)
3569 return -EINVAL;
3570 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3571 return -EINVAL;
3572
3573 req->madvise.addr = READ_ONCE(sqe->addr);
3574 req->madvise.len = READ_ONCE(sqe->len);
3575 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3576 return 0;
3577 #else
3578 return -EOPNOTSUPP;
3579 #endif
3580 }
3581
3582 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3583 {
3584 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3585 struct io_madvise *ma = &req->madvise;
3586 int ret;
3587
3588 if (force_nonblock)
3589 return -EAGAIN;
3590
3591 ret = do_madvise(ma->addr, ma->len, ma->advice);
3592 if (ret < 0)
3593 req_set_fail_links(req);
3594 io_cqring_add_event(req, ret);
3595 io_put_req(req);
3596 return 0;
3597 #else
3598 return -EOPNOTSUPP;
3599 #endif
3600 }
3601
3602 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3603 {
3604 if (sqe->ioprio || sqe->buf_index || sqe->addr)
3605 return -EINVAL;
3606 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3607 return -EINVAL;
3608
3609 req->fadvise.offset = READ_ONCE(sqe->off);
3610 req->fadvise.len = READ_ONCE(sqe->len);
3611 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3612 return 0;
3613 }
3614
3615 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3616 {
3617 struct io_fadvise *fa = &req->fadvise;
3618 int ret;
3619
3620 if (force_nonblock) {
3621 switch (fa->advice) {
3622 case POSIX_FADV_NORMAL:
3623 case POSIX_FADV_RANDOM:
3624 case POSIX_FADV_SEQUENTIAL:
3625 break;
3626 default:
3627 return -EAGAIN;
3628 }
3629 }
3630
3631 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3632 if (ret < 0)
3633 req_set_fail_links(req);
3634 io_cqring_add_event(req, ret);
3635 io_put_req(req);
3636 return 0;
3637 }
3638
3639 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3640 {
3641 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3642 return -EINVAL;
3643 if (sqe->ioprio || sqe->buf_index)
3644 return -EINVAL;
3645 if (req->flags & REQ_F_FIXED_FILE)
3646 return -EBADF;
3647
3648 req->statx.dfd = READ_ONCE(sqe->fd);
3649 req->statx.mask = READ_ONCE(sqe->len);
3650 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
3651 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3652 req->statx.flags = READ_ONCE(sqe->statx_flags);
3653
3654 return 0;
3655 }
3656
3657 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3658 {
3659 struct io_statx *ctx = &req->statx;
3660 int ret;
3661
3662 if (force_nonblock) {
3663 /* only need file table for an actual valid fd */
3664 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3665 req->flags |= REQ_F_NO_FILE_TABLE;
3666 return -EAGAIN;
3667 }
3668
3669 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
3670 ctx->buffer);
3671
3672 if (ret < 0)
3673 req_set_fail_links(req);
3674 io_cqring_add_event(req, ret);
3675 io_put_req(req);
3676 return 0;
3677 }
3678
3679 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3680 {
3681 /*
3682 * If we queue this for async, it must not be cancellable. That would
3683 * leave the 'file' in an indeterminate state. Since we need to modify
3684 * io_wq_work.flags here, initialize io_wq_work first.
3685 */
3686 io_req_init_async(req);
3687 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3688
3689 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3690 return -EINVAL;
3691 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3692 sqe->rw_flags || sqe->buf_index)
3693 return -EINVAL;
3694 if (req->flags & REQ_F_FIXED_FILE)
3695 return -EBADF;
3696
3697 req->close.fd = READ_ONCE(sqe->fd);
3698 if ((req->file && req->file->f_op == &io_uring_fops) ||
3699 req->close.fd == req->ctx->ring_fd)
3700 return -EBADF;
3701
3702 req->close.put_file = NULL;
3703 return 0;
3704 }
3705
3706 static int io_close(struct io_kiocb *req, bool force_nonblock)
3707 {
3708 struct io_close *close = &req->close;
3709 int ret;
3710
3711 /* might already be done during nonblock submission */
3712 if (!close->put_file) {
3713 ret = __close_fd_get_file(close->fd, &close->put_file);
3714 if (ret < 0)
3715 return (ret == -ENOENT) ? -EBADF : ret;
3716 }
3717
3718 /* if the file has a flush method, be safe and punt to async */
3719 if (close->put_file->f_op->flush && force_nonblock) {
3720 /* avoid grabbing files - we don't need them */
3721 req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
3722 return -EAGAIN;
3723 }
3724
3725 /* No ->flush() or already async, safely close from here */
3726 ret = filp_close(close->put_file, req->work.files);
3727 if (ret < 0)
3728 req_set_fail_links(req);
3729 io_cqring_add_event(req, ret);
3730 fput(close->put_file);
3731 close->put_file = NULL;
3732 io_put_req(req);
3733 return 0;
3734 }
3735
3736 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3737 {
3738 struct io_ring_ctx *ctx = req->ctx;
3739
3740 if (!req->file)
3741 return -EBADF;
3742
3743 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3744 return -EINVAL;
3745 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3746 return -EINVAL;
3747
3748 req->sync.off = READ_ONCE(sqe->off);
3749 req->sync.len = READ_ONCE(sqe->len);
3750 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3751 return 0;
3752 }
3753
3754 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3755 {
3756 int ret;
3757
3758 /* sync_file_range always requires a blocking context */
3759 if (force_nonblock)
3760 return -EAGAIN;
3761
3762 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3763 req->sync.flags);
3764 if (ret < 0)
3765 req_set_fail_links(req);
3766 io_cqring_add_event(req, ret);
3767 io_put_req(req);
3768 return 0;
3769 }
3770
3771 #if defined(CONFIG_NET)
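/*
 * A send/recv that can't complete non-blocking needs its msghdr and iovec
 * preserved for the async retry: copy the (possibly on-stack) kmsg state
 * into req->io and return -EAGAIN so the caller punts the request.
 */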
3772 static int io_setup_async_msg(struct io_kiocb *req,
3773 struct io_async_msghdr *kmsg)
3774 {
3775 if (req->io)
3776 return -EAGAIN;
3777 if (io_alloc_async_ctx(req)) {
3778 if (kmsg->iov != kmsg->fast_iov)
3779 kfree(kmsg->iov);
3780 return -ENOMEM;
3781 }
3782 req->flags |= REQ_F_NEED_CLEANUP;
3783 memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3784 return -EAGAIN;
3785 }
3786
3787 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3788 {
3789 struct io_sr_msg *sr = &req->sr_msg;
3790 struct io_async_ctx *io = req->io;
3791 int ret;
3792
3793 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3794 return -EINVAL;
3795
3796 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3797 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3798 sr->len = READ_ONCE(sqe->len);
3799
3800 #ifdef CONFIG_COMPAT
3801 if (req->ctx->compat)
3802 sr->msg_flags |= MSG_CMSG_COMPAT;
3803 #endif
3804
3805 if (!io || req->opcode == IORING_OP_SEND)
3806 return 0;
3807 /* iovec is already imported */
3808 if (req->flags & REQ_F_NEED_CLEANUP)
3809 return 0;
3810
3811 io->msg.iov = io->msg.fast_iov;
3812 ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3813 &io->msg.iov);
3814 if (!ret)
3815 req->flags |= REQ_F_NEED_CLEANUP;
3816 return ret;
3817 }
3818
3819 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
3820 {
3821 struct io_async_msghdr *kmsg = NULL;
3822 struct socket *sock;
3823 int ret;
3824
3825 sock = sock_from_file(req->file, &ret);
3826 if (sock) {
3827 struct io_async_ctx io;
3828 unsigned flags;
3829
3830 if (req->io) {
3831 kmsg = &req->io->msg;
3832 kmsg->msg.msg_name = &req->io->msg.addr;
3833 /* if iov is set, it's allocated already */
3834 if (!kmsg->iov)
3835 kmsg->iov = kmsg->fast_iov;
3836 kmsg->msg.msg_iter.iov = kmsg->iov;
3837 } else {
3838 struct io_sr_msg *sr = &req->sr_msg;
3839
3840 kmsg = &io.msg;
3841 kmsg->msg.msg_name = &io.msg.addr;
3842
3843 io.msg.iov = io.msg.fast_iov;
3844 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3845 sr->msg_flags, &io.msg.iov);
3846 if (ret)
3847 return ret;
3848 }
3849
3850 flags = req->sr_msg.msg_flags;
3851 if (flags & MSG_DONTWAIT)
3852 req->flags |= REQ_F_NOWAIT;
3853 else if (force_nonblock)
3854 flags |= MSG_DONTWAIT;
3855
3856 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3857 if (force_nonblock && ret == -EAGAIN)
3858 return io_setup_async_msg(req, kmsg);
3859 if (ret == -ERESTARTSYS)
3860 ret = -EINTR;
3861 }
3862
3863 if (kmsg && kmsg->iov != kmsg->fast_iov)
3864 kfree(kmsg->iov);
3865 req->flags &= ~REQ_F_NEED_CLEANUP;
3866 io_cqring_add_event(req, ret);
3867 if (ret < 0)
3868 req_set_fail_links(req);
3869 io_put_req(req);
3870 return 0;
3871 }
3872
3873 static int io_send(struct io_kiocb *req, bool force_nonblock)
3874 {
3875 struct socket *sock;
3876 int ret;
3877
3878 sock = sock_from_file(req->file, &ret);
3879 if (sock) {
3880 struct io_sr_msg *sr = &req->sr_msg;
3881 struct msghdr msg;
3882 struct iovec iov;
3883 unsigned flags;
3884
3885 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3886 &msg.msg_iter);
3887 if (ret)
3888 return ret;
3889
3890 msg.msg_name = NULL;
3891 msg.msg_control = NULL;
3892 msg.msg_controllen = 0;
3893 msg.msg_namelen = 0;
3894
3895 flags = req->sr_msg.msg_flags;
3896 if (flags & MSG_DONTWAIT)
3897 req->flags |= REQ_F_NOWAIT;
3898 else if (force_nonblock)
3899 flags |= MSG_DONTWAIT;
3900
3901 msg.msg_flags = flags;
3902 ret = sock_sendmsg(sock, &msg);
3903 if (force_nonblock && ret == -EAGAIN)
3904 return -EAGAIN;
3905 if (ret == -ERESTARTSYS)
3906 ret = -EINTR;
3907 }
3908
3909 io_cqring_add_event(req, ret);
3910 if (ret < 0)
3911 req_set_fail_links(req);
3912 io_put_req(req);
3913 return 0;
3914 }
3915
3916 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3917 {
3918 struct io_sr_msg *sr = &req->sr_msg;
3919 struct iovec __user *uiov;
3920 size_t iov_len;
3921 int ret;
3922
3923 ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3924 &uiov, &iov_len);
3925 if (ret)
3926 return ret;
3927
3928 if (req->flags & REQ_F_BUFFER_SELECT) {
3929 if (iov_len > 1)
3930 return -EINVAL;
3931 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3932 return -EFAULT;
3933 sr->len = io->msg.iov[0].iov_len;
3934 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3935 sr->len);
3936 io->msg.iov = NULL;
3937 } else {
3938 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3939 &io->msg.iov, &io->msg.msg.msg_iter);
3940 if (ret > 0)
3941 ret = 0;
3942 }
3943
3944 return ret;
3945 }
3946
3947 #ifdef CONFIG_COMPAT
3948 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3949 struct io_async_ctx *io)
3950 {
3951 struct compat_msghdr __user *msg_compat;
3952 struct io_sr_msg *sr = &req->sr_msg;
3953 struct compat_iovec __user *uiov;
3954 compat_uptr_t ptr;
3955 compat_size_t len;
3956 int ret;
3957
3958 msg_compat = (struct compat_msghdr __user *) sr->msg;
3959 ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
3960 &ptr, &len);
3961 if (ret)
3962 return ret;
3963
3964 uiov = compat_ptr(ptr);
3965 if (req->flags & REQ_F_BUFFER_SELECT) {
3966 compat_ssize_t clen;
3967
3968 if (len > 1)
3969 return -EINVAL;
3970 if (!access_ok(uiov, sizeof(*uiov)))
3971 return -EFAULT;
3972 if (__get_user(clen, &uiov->iov_len))
3973 return -EFAULT;
3974 if (clen < 0)
3975 return -EINVAL;
3976 sr->len = io->msg.iov[0].iov_len;
3977 io->msg.iov = NULL;
3978 } else {
3979 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
3980 &io->msg.iov,
3981 &io->msg.msg.msg_iter);
3982 if (ret < 0)
3983 return ret;
3984 }
3985
3986 return 0;
3987 }
3988 #endif
3989
3990 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3991 {
3992 io->msg.iov = io->msg.fast_iov;
3993
3994 #ifdef CONFIG_COMPAT
3995 if (req->ctx->compat)
3996 return __io_compat_recvmsg_copy_hdr(req, io);
3997 #endif
3998
3999 return __io_recvmsg_copy_hdr(req, io);
4000 }
4001
4002 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4003 int *cflags, bool needs_lock)
4004 {
4005 struct io_sr_msg *sr = &req->sr_msg;
4006 struct io_buffer *kbuf;
4007
4008 if (!(req->flags & REQ_F_BUFFER_SELECT))
4009 return NULL;
4010
4011 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4012 if (IS_ERR(kbuf))
4013 return kbuf;
4014
4015 sr->kbuf = kbuf;
4016 req->flags |= REQ_F_BUFFER_SELECTED;
4017
4018 *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
4019 *cflags |= IORING_CQE_F_BUFFER;
4020 return kbuf;
4021 }
4022
4023 static int io_recvmsg_prep(struct io_kiocb *req,
4024 const struct io_uring_sqe *sqe)
4025 {
4026 struct io_sr_msg *sr = &req->sr_msg;
4027 struct io_async_ctx *io = req->io;
4028 int ret;
4029
4030 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4031 return -EINVAL;
4032
4033 sr->msg_flags = READ_ONCE(sqe->msg_flags);
4034 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4035 sr->len = READ_ONCE(sqe->len);
4036 sr->bgid = READ_ONCE(sqe->buf_group);
4037
4038 #ifdef CONFIG_COMPAT
4039 if (req->ctx->compat)
4040 sr->msg_flags |= MSG_CMSG_COMPAT;
4041 #endif
4042
4043 if (!io || req->opcode == IORING_OP_RECV)
4044 return 0;
4045 /* iovec is already imported */
4046 if (req->flags & REQ_F_NEED_CLEANUP)
4047 return 0;
4048
4049 ret = io_recvmsg_copy_hdr(req, io);
4050 if (!ret)
4051 req->flags |= REQ_F_NEED_CLEANUP;
4052 return ret;
4053 }
4054
4055 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4056 {
4057 struct io_async_msghdr *kmsg = NULL;
4058 struct socket *sock;
4059 int ret, cflags = 0;
4060
4061 sock = sock_from_file(req->file, &ret);
4062 if (sock) {
4063 struct io_buffer *kbuf;
4064 struct io_async_ctx io;
4065 unsigned flags;
4066
4067 if (req->io) {
4068 kmsg = &req->io->msg;
4069 kmsg->msg.msg_name = &req->io->msg.addr;
4070 /* if iov is set, it's allocated already */
4071 if (!kmsg->iov)
4072 kmsg->iov = kmsg->fast_iov;
4073 kmsg->msg.msg_iter.iov = kmsg->iov;
4074 } else {
4075 kmsg = &io.msg;
4076 kmsg->msg.msg_name = &io.msg.addr;
4077
4078 ret = io_recvmsg_copy_hdr(req, &io);
4079 if (ret)
4080 return ret;
4081 }
4082
4083 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
4084 if (IS_ERR(kbuf)) {
4085 return PTR_ERR(kbuf);
4086 } else if (kbuf) {
4087 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4088 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
4089 1, req->sr_msg.len);
4090 }
4091
4092 flags = req->sr_msg.msg_flags;
4093 if (flags & MSG_DONTWAIT)
4094 req->flags |= REQ_F_NOWAIT;
4095 else if (force_nonblock)
4096 flags |= MSG_DONTWAIT;
4097
4098 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
4099 kmsg->uaddr, flags);
4100 if (force_nonblock && ret == -EAGAIN)
4101 return io_setup_async_msg(req, kmsg);
4102 if (ret == -ERESTARTSYS)
4103 ret = -EINTR;
4104 }
4105
4106 if (kmsg && kmsg->iov != kmsg->fast_iov)
4107 kfree(kmsg->iov);
4108 req->flags &= ~REQ_F_NEED_CLEANUP;
4109 __io_cqring_add_event(req, ret, cflags);
4110 if (ret < 0)
4111 req_set_fail_links(req);
4112 io_put_req(req);
4113 return 0;
4114 }
4115
4116 static int io_recv(struct io_kiocb *req, bool force_nonblock)
4117 {
4118 struct io_buffer *kbuf = NULL;
4119 struct socket *sock;
4120 int ret, cflags = 0;
4121
4122 sock = sock_from_file(req->file, &ret);
4123 if (sock) {
4124 struct io_sr_msg *sr = &req->sr_msg;
4125 void __user *buf = sr->buf;
4126 struct msghdr msg;
4127 struct iovec iov;
4128 unsigned flags;
4129
4130 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
4131 if (IS_ERR(kbuf))
4132 return PTR_ERR(kbuf);
4133 else if (kbuf)
4134 buf = u64_to_user_ptr(kbuf->addr);
4135
4136 ret = import_single_range(READ, buf, sr->len, &iov,
4137 &msg.msg_iter);
4138 if (ret) {
4139 kfree(kbuf);
4140 return ret;
4141 }
4142
4143 req->flags |= REQ_F_NEED_CLEANUP;
4144 msg.msg_name = NULL;
4145 msg.msg_control = NULL;
4146 msg.msg_controllen = 0;
4147 msg.msg_namelen = 0;
4148 msg.msg_iocb = NULL;
4149 msg.msg_flags = 0;
4150
4151 flags = req->sr_msg.msg_flags;
4152 if (flags & MSG_DONTWAIT)
4153 req->flags |= REQ_F_NOWAIT;
4154 else if (force_nonblock)
4155 flags |= MSG_DONTWAIT;
4156
4157 ret = sock_recvmsg(sock, &msg, flags);
4158 if (force_nonblock && ret == -EAGAIN)
4159 return -EAGAIN;
4160 if (ret == -ERESTARTSYS)
4161 ret = -EINTR;
4162 }
4163
4164 kfree(kbuf);
4165 req->flags &= ~REQ_F_NEED_CLEANUP;
4166 __io_cqring_add_event(req, ret, cflags);
4167 if (ret < 0)
4168 req_set_fail_links(req);
4169 io_put_req(req);
4170 return 0;
4171 }
4172
4173 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4174 {
4175 struct io_accept *accept = &req->accept;
4176
4177 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4178 return -EINVAL;
4179 if (sqe->ioprio || sqe->len || sqe->buf_index)
4180 return -EINVAL;
4181
4182 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4183 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4184 accept->flags = READ_ONCE(sqe->accept_flags);
4185 accept->nofile = rlimit(RLIMIT_NOFILE);
4186 return 0;
4187 }
4188
4189 static int io_accept(struct io_kiocb *req, bool force_nonblock)
4190 {
4191 struct io_accept *accept = &req->accept;
4192 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4193 int ret;
4194
4195 if (req->file->f_flags & O_NONBLOCK)
4196 req->flags |= REQ_F_NOWAIT;
4197
4198 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4199 accept->addr_len, accept->flags,
4200 accept->nofile);
4201 if (ret == -EAGAIN && force_nonblock)
4202 return -EAGAIN;
4203 if (ret < 0) {
4204 if (ret == -ERESTARTSYS)
4205 ret = -EINTR;
4206 req_set_fail_links(req);
4207 }
4208 io_cqring_add_event(req, ret);
4209 io_put_req(req);
4210 return 0;
4211 }
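/*
 * Note: like the other network handlers, io_accept() returns -EAGAIN to
 * the caller when force_nonblock is set and no connection is pending, so
 * the request can be retried from poll or io-wq context instead of
 * completing with an error.
 */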
4212
4213 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4214 {
4215 struct io_connect *conn = &req->connect;
4216 struct io_async_ctx *io = req->io;
4217
4218 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4219 return -EINVAL;
4220 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4221 return -EINVAL;
4222
4223 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4224 conn->addr_len = READ_ONCE(sqe->addr2);
4225
4226 if (!io)
4227 return 0;
4228
4229 return move_addr_to_kernel(conn->addr, conn->addr_len,
4230 &io->connect.address);
4231 }
4232
4233 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4234 {
4235 struct io_async_ctx __io, *io;
4236 unsigned file_flags;
4237 int ret;
4238
4239 if (req->io) {
4240 io = req->io;
4241 } else {
4242 ret = move_addr_to_kernel(req->connect.addr,
4243 req->connect.addr_len,
4244 &__io.connect.address);
4245 if (ret)
4246 goto out;
4247 io = &__io;
4248 }
4249
4250 file_flags = force_nonblock ? O_NONBLOCK : 0;
4251
4252 ret = __sys_connect_file(req->file, &io->connect.address,
4253 req->connect.addr_len, file_flags);
4254 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4255 if (req->io)
4256 return -EAGAIN;
4257 if (io_alloc_async_ctx(req)) {
4258 ret = -ENOMEM;
4259 goto out;
4260 }
4261 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4262 return -EAGAIN;
4263 }
4264 if (ret == -ERESTARTSYS)
4265 ret = -EINTR;
4266 out:
4267 if (ret < 0)
4268 req_set_fail_links(req);
4269 io_cqring_add_event(req, ret);
4270 io_put_req(req);
4271 return 0;
4272 }
4273 #else /* !CONFIG_NET */
4274 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4275 {
4276 return -EOPNOTSUPP;
4277 }
4278
4279 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
4280 {
4281 return -EOPNOTSUPP;
4282 }
4283
4284 static int io_send(struct io_kiocb *req, bool force_nonblock)
4285 {
4286 return -EOPNOTSUPP;
4287 }
4288
4289 static int io_recvmsg_prep(struct io_kiocb *req,
4290 const struct io_uring_sqe *sqe)
4291 {
4292 return -EOPNOTSUPP;
4293 }
4294
4295 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4296 {
4297 return -EOPNOTSUPP;
4298 }
4299
4300 static int io_recv(struct io_kiocb *req, bool force_nonblock)
4301 {
4302 return -EOPNOTSUPP;
4303 }
4304
4305 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4306 {
4307 return -EOPNOTSUPP;
4308 }
4309
4310 static int io_accept(struct io_kiocb *req, bool force_nonblock)
4311 {
4312 return -EOPNOTSUPP;
4313 }
4314
4315 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4316 {
4317 return -EOPNOTSUPP;
4318 }
4319
4320 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4321 {
4322 return -EOPNOTSUPP;
4323 }
4324 #endif /* CONFIG_NET */
4325
4326 struct io_poll_table {
4327 struct poll_table_struct pt;
4328 struct io_kiocb *req;
4329 int error;
4330 };
4331
4332 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4333 __poll_t mask, task_work_func_t func)
4334 {
4335 struct task_struct *tsk;
4336 int ret;
4337
4338 /* for instances that support it, check for an event match first */
4339 if (mask && !(mask & poll->events))
4340 return 0;
4341
4342 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4343
4344 list_del_init(&poll->wait.entry);
4345
4346 tsk = req->task;
4347 req->result = mask;
4348 init_task_work(&req->task_work, func);
4349 /*
4350 * If this fails, then the task is exiting. When a task exits, the
4351 * work gets canceled, so just cancel this request as well instead
4352 * of executing it. We can't safely execute it anyway, as we may not
4353 * have the state needed for it.
4354 */
4355 ret = task_work_add(tsk, &req->task_work, true);
4356 if (unlikely(ret)) {
4357 WRITE_ONCE(poll->canceled, true);
4358 tsk = io_wq_get_task(req->ctx->io_wq);
4359 task_work_add(tsk, &req->task_work, true);
4360 }
4361 wake_up_process(tsk);
4362 return 1;
4363 }
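/*
 * __io_async_wake() runs from a waitqueue wakeup: it removes the wait
 * entry, records the triggering mask in req->result and punts the rest of
 * the completion to task_work on the submitting task. If task_work_add()
 * fails because that task is exiting, the poll is marked canceled and the
 * work is queued to the io-wq manager task instead, so the request still
 * gets reaped.
 */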
4364
4365 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4366 __acquires(&req->ctx->completion_lock)
4367 {
4368 struct io_ring_ctx *ctx = req->ctx;
4369
4370 if (!req->result && !READ_ONCE(poll->canceled)) {
4371 struct poll_table_struct pt = { ._key = poll->events };
4372
4373 req->result = vfs_poll(req->file, &pt) & poll->events;
4374 }
4375
4376 spin_lock_irq(&ctx->completion_lock);
4377 if (!req->result && !READ_ONCE(poll->canceled)) {
4378 add_wait_queue(poll->head, &poll->wait);
4379 return true;
4380 }
4381
4382 return false;
4383 }
4384
4385 static void io_poll_remove_double(struct io_kiocb *req)
4386 {
4387 struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4388
4389 lockdep_assert_held(&req->ctx->completion_lock);
4390
4391 if (poll && poll->head) {
4392 struct wait_queue_head *head = poll->head;
4393
4394 spin_lock(&head->lock);
4395 list_del_init(&poll->wait.entry);
4396 if (poll->wait.private)
4397 refcount_dec(&req->refs);
4398 poll->head = NULL;
4399 spin_unlock(&head->lock);
4400 }
4401 }
4402
4403 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4404 {
4405 struct io_ring_ctx *ctx = req->ctx;
4406
4407 io_poll_remove_double(req);
4408 req->poll.done = true;
4409 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4410 io_commit_cqring(ctx);
4411 }
4412
4413 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4414 {
4415 struct io_ring_ctx *ctx = req->ctx;
4416
4417 if (io_poll_rewait(req, &req->poll)) {
4418 spin_unlock_irq(&ctx->completion_lock);
4419 return;
4420 }
4421
4422 hash_del(&req->hash_node);
4423 io_poll_complete(req, req->result, 0);
4424 req->flags |= REQ_F_COMP_LOCKED;
4425 io_put_req_find_next(req, nxt);
4426 spin_unlock_irq(&ctx->completion_lock);
4427
4428 io_cqring_ev_posted(ctx);
4429 }
4430
4431 static void io_poll_task_func(struct callback_head *cb)
4432 {
4433 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4434 struct io_kiocb *nxt = NULL;
4435
4436 io_poll_task_handler(req, &nxt);
4437 if (nxt) {
4438 struct io_ring_ctx *ctx = nxt->ctx;
4439
4440 mutex_lock(&ctx->uring_lock);
4441 __io_queue_sqe(nxt, NULL);
4442 mutex_unlock(&ctx->uring_lock);
4443 }
4444 }
4445
4446 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4447 int sync, void *key)
4448 {
4449 struct io_kiocb *req = wait->private;
4450 struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4451 __poll_t mask = key_to_poll(key);
4452
4453 /* for instances that support it, check for an event match first */
4454 if (mask && !(mask & poll->events))
4455 return 0;
4456
4457 if (req->poll.head) {
4458 bool done;
4459
4460 spin_lock(&req->poll.head->lock);
4461 done = list_empty(&req->poll.wait.entry);
4462 if (!done)
4463 list_del_init(&req->poll.wait.entry);
4464 spin_unlock(&req->poll.head->lock);
4465 if (!done)
4466 __io_async_wake(req, poll, mask, io_poll_task_func);
4467 }
4468 refcount_dec(&req->refs);
4469 return 1;
4470 }
4471
4472 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4473 wait_queue_func_t wake_func)
4474 {
4475 poll->head = NULL;
4476 poll->done = false;
4477 poll->canceled = false;
4478 poll->events = events;
4479 INIT_LIST_HEAD(&poll->wait.entry);
4480 init_waitqueue_func_entry(&poll->wait, wake_func);
4481 }
4482
4483 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4484 struct wait_queue_head *head)
4485 {
4486 struct io_kiocb *req = pt->req;
4487
4488 /*
4489 * If poll->head is already set, it's because the file being polled
4490 * uses multiple waitqueues for poll handling (e.g. one for read, one
4491 * for write). Set up a separate io_poll_iocb if this happens.
4492 */
4493 if (unlikely(poll->head)) {
4494 /* already have a 2nd entry, fail a third attempt */
4495 if (req->io) {
4496 pt->error = -EINVAL;
4497 return;
4498 }
4499 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4500 if (!poll) {
4501 pt->error = -ENOMEM;
4502 return;
4503 }
4504 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4505 refcount_inc(&req->refs);
4506 poll->wait.private = req;
4507 req->io = (void *) poll;
4508 }
4509
4510 pt->error = 0;
4511 poll->head = head;
4512
4513 if (poll->events & EPOLLEXCLUSIVE)
4514 add_wait_queue_exclusive(head, &poll->wait);
4515 else
4516 add_wait_queue(head, &poll->wait);
4517 }
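/*
 * The second io_poll_iocb allocated above is reachable through req->io and
 * uses io_poll_double_wake() as its wakeup function; io_poll_remove_double()
 * is the matching teardown, dropping the extra reference taken here.
 */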
4518
4519 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4520 struct poll_table_struct *p)
4521 {
4522 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4523
4524 __io_queue_proc(&pt->req->apoll->poll, pt, head);
4525 }
4526
4527 static void io_async_task_func(struct callback_head *cb)
4528 {
4529 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4530 struct async_poll *apoll = req->apoll;
4531 struct io_ring_ctx *ctx = req->ctx;
4532 bool canceled = false;
4533
4534 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4535
4536 if (io_poll_rewait(req, &apoll->poll)) {
4537 spin_unlock_irq(&ctx->completion_lock);
4538 return;
4539 }
4540
4541 /* If req is still hashed, it cannot have been canceled. Don't check. */
4542 if (hash_hashed(&req->hash_node)) {
4543 hash_del(&req->hash_node);
4544 } else {
4545 canceled = READ_ONCE(apoll->poll.canceled);
4546 if (canceled) {
4547 io_cqring_fill_event(req, -ECANCELED);
4548 io_commit_cqring(ctx);
4549 }
4550 }
4551
4552 spin_unlock_irq(&ctx->completion_lock);
4553
4554 /* restore ->work in case we need to retry */
4555 if (req->flags & REQ_F_WORK_INITIALIZED)
4556 memcpy(&req->work, &apoll->work, sizeof(req->work));
4557 kfree(apoll);
4558
4559 if (!canceled) {
4560 __set_current_state(TASK_RUNNING);
4561 if (io_sq_thread_acquire_mm(ctx, req)) {
4562 io_cqring_add_event(req, -EFAULT);
4563 goto end_req;
4564 }
4565 mutex_lock(&ctx->uring_lock);
4566 __io_queue_sqe(req, NULL);
4567 mutex_unlock(&ctx->uring_lock);
4568 } else {
4569 io_cqring_ev_posted(ctx);
4570 end_req:
4571 req_set_fail_links(req);
4572 io_double_put_req(req);
4573 }
4574 }
4575
4576 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4577 void *key)
4578 {
4579 struct io_kiocb *req = wait->private;
4580 struct io_poll_iocb *poll = &req->apoll->poll;
4581
4582 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4583 key_to_poll(key));
4584
4585 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4586 }
4587
4588 static void io_poll_req_insert(struct io_kiocb *req)
4589 {
4590 struct io_ring_ctx *ctx = req->ctx;
4591 struct hlist_head *list;
4592
4593 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4594 hlist_add_head(&req->hash_node, list);
4595 }
4596
4597 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4598 struct io_poll_iocb *poll,
4599 struct io_poll_table *ipt, __poll_t mask,
4600 wait_queue_func_t wake_func)
4601 __acquires(&ctx->completion_lock)
4602 {
4603 struct io_ring_ctx *ctx = req->ctx;
4604 bool cancel = false;
4605
4606 poll->file = req->file;
4607 io_init_poll_iocb(poll, mask, wake_func);
4608 poll->wait.private = req;
4609
4610 ipt->pt._key = mask;
4611 ipt->req = req;
4612 ipt->error = -EINVAL;
4613
4614 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4615
4616 spin_lock_irq(&ctx->completion_lock);
4617 if (likely(poll->head)) {
4618 spin_lock(&poll->head->lock);
4619 if (unlikely(list_empty(&poll->wait.entry))) {
4620 if (ipt->error)
4621 cancel = true;
4622 ipt->error = 0;
4623 mask = 0;
4624 }
4625 if (mask || ipt->error)
4626 list_del_init(&poll->wait.entry);
4627 else if (cancel)
4628 WRITE_ONCE(poll->canceled, true);
4629 else if (!poll->done) /* actually waiting for an event */
4630 io_poll_req_insert(req);
4631 spin_unlock(&poll->head->lock);
4632 }
4633
4634 return mask;
4635 }
4636
4637 static bool io_arm_poll_handler(struct io_kiocb *req)
4638 {
4639 const struct io_op_def *def = &io_op_defs[req->opcode];
4640 struct io_ring_ctx *ctx = req->ctx;
4641 struct async_poll *apoll;
4642 struct io_poll_table ipt;
4643 __poll_t mask, ret;
4644 bool had_io;
4645
4646 if (!req->file || !file_can_poll(req->file))
4647 return false;
4648 if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4649 return false;
4650 if (!def->pollin && !def->pollout)
4651 return false;
4652
4653 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4654 if (unlikely(!apoll))
4655 return false;
4656
4657 req->flags |= REQ_F_POLLED;
4658 if (req->flags & REQ_F_WORK_INITIALIZED)
4659 memcpy(&apoll->work, &req->work, sizeof(req->work));
4660 had_io = req->io != NULL;
4661
4662 io_get_req_task(req);
4663 req->apoll = apoll;
4664 INIT_HLIST_NODE(&req->hash_node);
4665
4666 mask = 0;
4667 if (def->pollin)
4668 mask |= POLLIN | POLLRDNORM;
4669 if (def->pollout)
4670 mask |= POLLOUT | POLLWRNORM;
4671 mask |= POLLERR | POLLPRI;
4672
4673 ipt.pt._qproc = io_async_queue_proc;
4674
4675 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4676 io_async_wake);
4677 if (ret) {
4678 ipt.error = 0;
4679 /* only remove double add if we did it here */
4680 if (!had_io)
4681 io_poll_remove_double(req);
4682 spin_unlock_irq(&ctx->completion_lock);
4683 if (req->flags & REQ_F_WORK_INITIALIZED)
4684 memcpy(&req->work, &apoll->work, sizeof(req->work));
4685 kfree(apoll);
4686 return false;
4687 }
4688 spin_unlock_irq(&ctx->completion_lock);
4689 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4690 apoll->poll.events);
4691 return true;
4692 }
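/*
 * io_arm_poll_handler() is the fast-path alternative to punting a request
 * to io-wq: instead of handing the blocking operation to a worker thread,
 * the request is armed as an internal poll (REQ_F_POLLED) and retried from
 * task_work once the file signals readiness. Returning false means the
 * caller falls back to the regular async punt.
 */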
4693
4694 static bool __io_poll_remove_one(struct io_kiocb *req,
4695 struct io_poll_iocb *poll)
4696 {
4697 bool do_complete = false;
4698
4699 spin_lock(&poll->head->lock);
4700 WRITE_ONCE(poll->canceled, true);
4701 if (!list_empty(&poll->wait.entry)) {
4702 list_del_init(&poll->wait.entry);
4703 do_complete = true;
4704 }
4705 spin_unlock(&poll->head->lock);
4706 hash_del(&req->hash_node);
4707 return do_complete;
4708 }
4709
4710 static bool io_poll_remove_one(struct io_kiocb *req)
4711 {
4712 bool do_complete;
4713
4714 if (req->opcode == IORING_OP_POLL_ADD) {
4715 io_poll_remove_double(req);
4716 do_complete = __io_poll_remove_one(req, &req->poll);
4717 } else {
4718 struct async_poll *apoll = req->apoll;
4719
4720 /* non-poll requests have submit ref still */
4721 do_complete = __io_poll_remove_one(req, &apoll->poll);
4722 if (do_complete) {
4723 io_put_req(req);
4724 /*
4725 * restore ->work because we will call
4726 * io_req_work_drop_env below when dropping the
4727 * final reference.
4728 */
4729 if (req->flags & REQ_F_WORK_INITIALIZED)
4730 memcpy(&req->work, &apoll->work,
4731 sizeof(req->work));
4732 kfree(apoll);
4733 }
4734 }
4735
4736 if (do_complete) {
4737 io_cqring_fill_event(req, -ECANCELED);
4738 io_commit_cqring(req->ctx);
4739 req->flags |= REQ_F_COMP_LOCKED;
4740 io_put_req(req);
4741 }
4742
4743 return do_complete;
4744 }
4745
4746 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4747 {
4748 struct hlist_node *tmp;
4749 struct io_kiocb *req;
4750 int posted = 0, i;
4751
4752 spin_lock_irq(&ctx->completion_lock);
4753 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4754 struct hlist_head *list;
4755
4756 list = &ctx->cancel_hash[i];
4757 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4758 posted += io_poll_remove_one(req);
4759 }
4760 spin_unlock_irq(&ctx->completion_lock);
4761
4762 if (posted)
4763 io_cqring_ev_posted(ctx);
4764 }
4765
4766 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4767 {
4768 struct hlist_head *list;
4769 struct io_kiocb *req;
4770
4771 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4772 hlist_for_each_entry(req, list, hash_node) {
4773 if (sqe_addr != req->user_data)
4774 continue;
4775 if (io_poll_remove_one(req))
4776 return 0;
4777 return -EALREADY;
4778 }
4779
4780 return -ENOENT;
4781 }
4782
4783 static int io_poll_remove_prep(struct io_kiocb *req,
4784 const struct io_uring_sqe *sqe)
4785 {
4786 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4787 return -EINVAL;
4788 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4789 sqe->poll_events)
4790 return -EINVAL;
4791
4792 req->poll.addr = READ_ONCE(sqe->addr);
4793 return 0;
4794 }
4795
4796 /*
4797 * Find a running poll command that matches one specified in sqe->addr,
4798 * and remove it if found.
4799 */
4800 static int io_poll_remove(struct io_kiocb *req)
4801 {
4802 struct io_ring_ctx *ctx = req->ctx;
4803 u64 addr;
4804 int ret;
4805
4806 addr = req->poll.addr;
4807 spin_lock_irq(&ctx->completion_lock);
4808 ret = io_poll_cancel(ctx, addr);
4809 spin_unlock_irq(&ctx->completion_lock);
4810
4811 io_cqring_add_event(req, ret);
4812 if (ret < 0)
4813 req_set_fail_links(req);
4814 io_put_req(req);
4815 return 0;
4816 }
4817
4818 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4819 void *key)
4820 {
4821 struct io_kiocb *req = wait->private;
4822 struct io_poll_iocb *poll = &req->poll;
4823
4824 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4825 }
4826
4827 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4828 struct poll_table_struct *p)
4829 {
4830 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4831
4832 __io_queue_proc(&pt->req->poll, pt, head);
4833 }
4834
4835 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4836 {
4837 struct io_poll_iocb *poll = &req->poll;
4838 u32 events;
4839
4840 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4841 return -EINVAL;
4842 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4843 return -EINVAL;
4844 if (!poll->file)
4845 return -EBADF;
4846
4847 events = READ_ONCE(sqe->poll32_events);
4848 #ifdef __BIG_ENDIAN
4849 events = swahw32(events);
4850 #endif
4851 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
4852 (events & EPOLLEXCLUSIVE);
4853
4854 io_get_req_task(req);
4855 return 0;
4856 }
4857
4858 static int io_poll_add(struct io_kiocb *req)
4859 {
4860 struct io_poll_iocb *poll = &req->poll;
4861 struct io_ring_ctx *ctx = req->ctx;
4862 struct io_poll_table ipt;
4863 __poll_t mask;
4864
4865 INIT_HLIST_NODE(&req->hash_node);
4866 INIT_LIST_HEAD(&req->list);
4867 ipt.pt._qproc = io_poll_queue_proc;
4868
4869 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4870 io_poll_wake);
4871
4872 if (mask) { /* no async wait needed, the event was already ready */
4873 ipt.error = 0;
4874 io_poll_complete(req, mask, 0);
4875 }
4876 spin_unlock_irq(&ctx->completion_lock);
4877
4878 if (mask) {
4879 io_cqring_ev_posted(ctx);
4880 io_put_req(req);
4881 }
4882 return ipt.error;
4883 }
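/*
 * Illustrative userspace sketch (not part of this file): queuing a
 * one-shot poll request with liburing; `ring` and `fd` are assumed to be
 * set up elsewhere.
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_poll_add(sqe, fd, POLLIN);
 *	io_uring_submit(&ring);
 *
 * io_poll_add_prep() converts the poll mask with demangle_poll() and
 * always adds EPOLLERR/EPOLLHUP, matching regular poll(2) semantics.
 */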
4884
4885 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4886 {
4887 struct io_timeout_data *data = container_of(timer,
4888 struct io_timeout_data, timer);
4889 struct io_kiocb *req = data->req;
4890 struct io_ring_ctx *ctx = req->ctx;
4891 unsigned long flags;
4892
4893 atomic_inc(&ctx->cq_timeouts);
4894
4895 spin_lock_irqsave(&ctx->completion_lock, flags);
4896 /*
4897 * We could be racing with timeout deletion. If the list is empty,
4898 * then timeout lookup already found it and will be handling it.
4899 */
4900 if (!list_empty(&req->list))
4901 list_del_init(&req->list);
4902
4903 io_cqring_fill_event(req, -ETIME);
4904 io_commit_cqring(ctx);
4905 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4906
4907 io_cqring_ev_posted(ctx);
4908 req_set_fail_links(req);
4909 io_put_req(req);
4910 return HRTIMER_NORESTART;
4911 }
4912
4913 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4914 {
4915 struct io_kiocb *req;
4916 int ret = -ENOENT;
4917
4918 list_for_each_entry(req, &ctx->timeout_list, list) {
4919 if (user_data == req->user_data) {
4920 list_del_init(&req->list);
4921 ret = 0;
4922 break;
4923 }
4924 }
4925
4926 if (ret == -ENOENT)
4927 return ret;
4928
4929 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4930 if (ret == -1)
4931 return -EALREADY;
4932
4933 req_set_fail_links(req);
4934 io_cqring_fill_event(req, -ECANCELED);
4935 io_put_req(req);
4936 return 0;
4937 }
4938
4939 static int io_timeout_remove_prep(struct io_kiocb *req,
4940 const struct io_uring_sqe *sqe)
4941 {
4942 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4943 return -EINVAL;
4944 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4945 return -EINVAL;
4946
4947 req->timeout.addr = READ_ONCE(sqe->addr);
4948 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4949 if (req->timeout.flags)
4950 return -EINVAL;
4951
4952 return 0;
4953 }
4954
4955 /*
4956 * Remove or update an existing timeout command
4957 */
4958 static int io_timeout_remove(struct io_kiocb *req)
4959 {
4960 struct io_ring_ctx *ctx = req->ctx;
4961 int ret;
4962
4963 spin_lock_irq(&ctx->completion_lock);
4964 ret = io_timeout_cancel(ctx, req->timeout.addr);
4965
4966 io_cqring_fill_event(req, ret);
4967 io_commit_cqring(ctx);
4968 spin_unlock_irq(&ctx->completion_lock);
4969 io_cqring_ev_posted(ctx);
4970 if (ret < 0)
4971 req_set_fail_links(req);
4972 io_put_req(req);
4973 return 0;
4974 }
4975
4976 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4977 bool is_timeout_link)
4978 {
4979 struct io_timeout_data *data;
4980 unsigned flags;
4981 u32 off = READ_ONCE(sqe->off);
4982
4983 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4984 return -EINVAL;
4985 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
4986 return -EINVAL;
4987 if (off && is_timeout_link)
4988 return -EINVAL;
4989 flags = READ_ONCE(sqe->timeout_flags);
4990 if (flags & ~IORING_TIMEOUT_ABS)
4991 return -EINVAL;
4992
4993 req->timeout.off = off;
4994
4995 if (!req->io && io_alloc_async_ctx(req))
4996 return -ENOMEM;
4997
4998 data = &req->io->timeout;
4999 data->req = req;
5000 req->flags |= REQ_F_TIMEOUT;
5001
5002 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5003 return -EFAULT;
5004
5005 if (flags & IORING_TIMEOUT_ABS)
5006 data->mode = HRTIMER_MODE_ABS;
5007 else
5008 data->mode = HRTIMER_MODE_REL;
5009
5010 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5011 return 0;
5012 }
5013
5014 static int io_timeout(struct io_kiocb *req)
5015 {
5016 struct io_ring_ctx *ctx = req->ctx;
5017 struct io_timeout_data *data = &req->io->timeout;
5018 struct list_head *entry;
5019 u32 tail, off = req->timeout.off;
5020
5021 spin_lock_irq(&ctx->completion_lock);
5022
5023 /*
5024 * sqe->off holds how many events need to occur for this
5025 * timeout event to be satisfied. If it isn't set, then this is
5026 * a pure timeout request, sequence isn't used.
5027 */
5028 if (!off) {
5029 req->flags |= REQ_F_TIMEOUT_NOSEQ;
5030 entry = ctx->timeout_list.prev;
5031 goto add;
5032 }
5033
5034 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5035 req->timeout.target_seq = tail + off;
5036
5037 /*
5038 * Insertion sort, ensuring the first entry in the list is always
5039 * the one we need first.
5040 */
5041 list_for_each_prev(entry, &ctx->timeout_list) {
5042 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5043
5044 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
5045 continue;
5046 /* nxt.seq is behind @tail, otherwise would've been completed */
5047 if (off >= nxt->timeout.target_seq - tail)
5048 break;
5049 }
5050 add:
5051 list_add(&req->list, entry);
5052 data->timer.function = io_timeout_fn;
5053 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5054 spin_unlock_irq(&ctx->completion_lock);
5055 return 0;
5056 }
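/*
 * Example of the sequence math above: with cached_cq_tail == 100,
 * cq_timeouts == 2 and sqe->off == 8, tail is 98 and target_seq becomes
 * 106, i.e. the timeout fires after 8 more non-timeout completions unless
 * the timer expires first. A zero offset skips the sequence logic entirely
 * (REQ_F_TIMEOUT_NOSEQ) and behaves as a plain relative/absolute timer.
 */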
5057
5058 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5059 {
5060 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5061
5062 return req->user_data == (unsigned long) data;
5063 }
5064
5065 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5066 {
5067 enum io_wq_cancel cancel_ret;
5068 int ret = 0;
5069
5070 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5071 switch (cancel_ret) {
5072 case IO_WQ_CANCEL_OK:
5073 ret = 0;
5074 break;
5075 case IO_WQ_CANCEL_RUNNING:
5076 ret = -EALREADY;
5077 break;
5078 case IO_WQ_CANCEL_NOTFOUND:
5079 ret = -ENOENT;
5080 break;
5081 }
5082
5083 return ret;
5084 }
5085
5086 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5087 struct io_kiocb *req, __u64 sqe_addr,
5088 int success_ret)
5089 {
5090 unsigned long flags;
5091 int ret;
5092
5093 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5094 if (ret != -ENOENT) {
5095 spin_lock_irqsave(&ctx->completion_lock, flags);
5096 goto done;
5097 }
5098
5099 spin_lock_irqsave(&ctx->completion_lock, flags);
5100 ret = io_timeout_cancel(ctx, sqe_addr);
5101 if (ret != -ENOENT)
5102 goto done;
5103 ret = io_poll_cancel(ctx, sqe_addr);
5104 done:
5105 if (!ret)
5106 ret = success_ret;
5107 io_cqring_fill_event(req, ret);
5108 io_commit_cqring(ctx);
5109 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5110 io_cqring_ev_posted(ctx);
5111
5112 if (ret < 0)
5113 req_set_fail_links(req);
5114 io_put_req(req);
5115 }
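/*
 * Cancelation is attempted in three places in turn: the io-wq work queues
 * (io_async_cancel_one()), then pending timeouts, then armed poll
 * requests. The first result other than -ENOENT is used and posted as
 * this request's CQE (or @success_ret if the cancel succeeded).
 */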
5116
5117 static int io_async_cancel_prep(struct io_kiocb *req,
5118 const struct io_uring_sqe *sqe)
5119 {
5120 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5121 return -EINVAL;
5122 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
5123 sqe->cancel_flags)
5124 return -EINVAL;
5125
5126 req->cancel.addr = READ_ONCE(sqe->addr);
5127 return 0;
5128 }
5129
5130 static int io_async_cancel(struct io_kiocb *req)
5131 {
5132 struct io_ring_ctx *ctx = req->ctx;
5133
5134 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5135 return 0;
5136 }
5137
5138 static int io_files_update_prep(struct io_kiocb *req,
5139 const struct io_uring_sqe *sqe)
5140 {
5141 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
5142 return -EINVAL;
5143
5144 req->files_update.offset = READ_ONCE(sqe->off);
5145 req->files_update.nr_args = READ_ONCE(sqe->len);
5146 if (!req->files_update.nr_args)
5147 return -EINVAL;
5148 req->files_update.arg = READ_ONCE(sqe->addr);
5149 return 0;
5150 }
5151
5152 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
5153 {
5154 struct io_ring_ctx *ctx = req->ctx;
5155 struct io_uring_files_update up;
5156 int ret;
5157
5158 if (force_nonblock)
5159 return -EAGAIN;
5160
5161 up.offset = req->files_update.offset;
5162 up.fds = req->files_update.arg;
5163
5164 mutex_lock(&ctx->uring_lock);
5165 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
5166 mutex_unlock(&ctx->uring_lock);
5167
5168 if (ret < 0)
5169 req_set_fail_links(req);
5170 io_cqring_add_event(req, ret);
5171 io_put_req(req);
5172 return 0;
5173 }
5174
5175 static int io_req_defer_prep(struct io_kiocb *req,
5176 const struct io_uring_sqe *sqe)
5177 {
5178 ssize_t ret = 0;
5179
5180 if (!sqe)
5181 return 0;
5182
5183 io_req_init_async(req);
5184
5185 if (io_op_defs[req->opcode].file_table) {
5186 ret = io_grab_files(req);
5187 if (unlikely(ret))
5188 return ret;
5189 }
5190
5191 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
5192
5193 switch (req->opcode) {
5194 case IORING_OP_NOP:
5195 break;
5196 case IORING_OP_READV:
5197 case IORING_OP_READ_FIXED:
5198 case IORING_OP_READ:
5199 ret = io_read_prep(req, sqe, true);
5200 break;
5201 case IORING_OP_WRITEV:
5202 case IORING_OP_WRITE_FIXED:
5203 case IORING_OP_WRITE:
5204 ret = io_write_prep(req, sqe, true);
5205 break;
5206 case IORING_OP_POLL_ADD:
5207 ret = io_poll_add_prep(req, sqe);
5208 break;
5209 case IORING_OP_POLL_REMOVE:
5210 ret = io_poll_remove_prep(req, sqe);
5211 break;
5212 case IORING_OP_FSYNC:
5213 ret = io_prep_fsync(req, sqe);
5214 break;
5215 case IORING_OP_SYNC_FILE_RANGE:
5216 ret = io_prep_sfr(req, sqe);
5217 break;
5218 case IORING_OP_SENDMSG:
5219 case IORING_OP_SEND:
5220 ret = io_sendmsg_prep(req, sqe);
5221 break;
5222 case IORING_OP_RECVMSG:
5223 case IORING_OP_RECV:
5224 ret = io_recvmsg_prep(req, sqe);
5225 break;
5226 case IORING_OP_CONNECT:
5227 ret = io_connect_prep(req, sqe);
5228 break;
5229 case IORING_OP_TIMEOUT:
5230 ret = io_timeout_prep(req, sqe, false);
5231 break;
5232 case IORING_OP_TIMEOUT_REMOVE:
5233 ret = io_timeout_remove_prep(req, sqe);
5234 break;
5235 case IORING_OP_ASYNC_CANCEL:
5236 ret = io_async_cancel_prep(req, sqe);
5237 break;
5238 case IORING_OP_LINK_TIMEOUT:
5239 ret = io_timeout_prep(req, sqe, true);
5240 break;
5241 case IORING_OP_ACCEPT:
5242 ret = io_accept_prep(req, sqe);
5243 break;
5244 case IORING_OP_FALLOCATE:
5245 ret = io_fallocate_prep(req, sqe);
5246 break;
5247 case IORING_OP_OPENAT:
5248 ret = io_openat_prep(req, sqe);
5249 break;
5250 case IORING_OP_CLOSE:
5251 ret = io_close_prep(req, sqe);
5252 break;
5253 case IORING_OP_FILES_UPDATE:
5254 ret = io_files_update_prep(req, sqe);
5255 break;
5256 case IORING_OP_STATX:
5257 ret = io_statx_prep(req, sqe);
5258 break;
5259 case IORING_OP_FADVISE:
5260 ret = io_fadvise_prep(req, sqe);
5261 break;
5262 case IORING_OP_MADVISE:
5263 ret = io_madvise_prep(req, sqe);
5264 break;
5265 case IORING_OP_OPENAT2:
5266 ret = io_openat2_prep(req, sqe);
5267 break;
5268 case IORING_OP_EPOLL_CTL:
5269 ret = io_epoll_ctl_prep(req, sqe);
5270 break;
5271 case IORING_OP_SPLICE:
5272 ret = io_splice_prep(req, sqe);
5273 break;
5274 case IORING_OP_PROVIDE_BUFFERS:
5275 ret = io_provide_buffers_prep(req, sqe);
5276 break;
5277 case IORING_OP_REMOVE_BUFFERS:
5278 ret = io_remove_buffers_prep(req, sqe);
5279 break;
5280 case IORING_OP_TEE:
5281 ret = io_tee_prep(req, sqe);
5282 break;
5283 default:
5284 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5285 req->opcode);
5286 ret = -EINVAL;
5287 break;
5288 }
5289
5290 return ret;
5291 }
5292
5293 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5294 {
5295 struct io_ring_ctx *ctx = req->ctx;
5296 int ret;
5297
5298 /* Still need to defer if there are pending reqs in the defer list. */
5299 if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5300 return 0;
5301
5302 if (!req->io) {
5303 if (io_alloc_async_ctx(req))
5304 return -EAGAIN;
5305 ret = io_req_defer_prep(req, sqe);
5306 if (ret < 0)
5307 return ret;
5308 }
5309
5310 spin_lock_irq(&ctx->completion_lock);
5311 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5312 spin_unlock_irq(&ctx->completion_lock);
5313 return 0;
5314 }
5315
5316 trace_io_uring_defer(ctx, req, req->user_data);
5317 list_add_tail(&req->list, &ctx->defer_list);
5318 spin_unlock_irq(&ctx->completion_lock);
5319 return -EIOCBQUEUED;
5320 }
5321
5322 static void io_cleanup_req(struct io_kiocb *req)
5323 {
5324 struct io_async_ctx *io = req->io;
5325
5326 switch (req->opcode) {
5327 case IORING_OP_READV:
5328 case IORING_OP_READ_FIXED:
5329 case IORING_OP_READ:
5330 if (req->flags & REQ_F_BUFFER_SELECTED)
5331 kfree((void *)(unsigned long)req->rw.addr);
5332 /* fallthrough */
5333 case IORING_OP_WRITEV:
5334 case IORING_OP_WRITE_FIXED:
5335 case IORING_OP_WRITE:
5336 if (io->rw.iov != io->rw.fast_iov)
5337 kfree(io->rw.iov);
5338 break;
5339 case IORING_OP_RECVMSG:
5340 if (req->flags & REQ_F_BUFFER_SELECTED)
5341 kfree(req->sr_msg.kbuf);
5342 /* fallthrough */
5343 case IORING_OP_SENDMSG:
5344 if (io->msg.iov != io->msg.fast_iov)
5345 kfree(io->msg.iov);
5346 break;
5347 case IORING_OP_RECV:
5348 if (req->flags & REQ_F_BUFFER_SELECTED)
5349 kfree(req->sr_msg.kbuf);
5350 break;
5351 case IORING_OP_OPENAT:
5352 case IORING_OP_OPENAT2:
5353 break;
5354 case IORING_OP_SPLICE:
5355 case IORING_OP_TEE:
5356 io_put_file(req, req->splice.file_in,
5357 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5358 break;
5359 }
5360
5361 req->flags &= ~REQ_F_NEED_CLEANUP;
5362 }
5363
5364 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5365 bool force_nonblock)
5366 {
5367 struct io_ring_ctx *ctx = req->ctx;
5368 int ret;
5369
5370 switch (req->opcode) {
5371 case IORING_OP_NOP:
5372 ret = io_nop(req);
5373 break;
5374 case IORING_OP_READV:
5375 case IORING_OP_READ_FIXED:
5376 case IORING_OP_READ:
5377 if (sqe) {
5378 ret = io_read_prep(req, sqe, force_nonblock);
5379 if (ret < 0)
5380 break;
5381 }
5382 ret = io_read(req, force_nonblock);
5383 break;
5384 case IORING_OP_WRITEV:
5385 case IORING_OP_WRITE_FIXED:
5386 case IORING_OP_WRITE:
5387 if (sqe) {
5388 ret = io_write_prep(req, sqe, force_nonblock);
5389 if (ret < 0)
5390 break;
5391 }
5392 ret = io_write(req, force_nonblock);
5393 break;
5394 case IORING_OP_FSYNC:
5395 if (sqe) {
5396 ret = io_prep_fsync(req, sqe);
5397 if (ret < 0)
5398 break;
5399 }
5400 ret = io_fsync(req, force_nonblock);
5401 break;
5402 case IORING_OP_POLL_ADD:
5403 if (sqe) {
5404 ret = io_poll_add_prep(req, sqe);
5405 if (ret)
5406 break;
5407 }
5408 ret = io_poll_add(req);
5409 break;
5410 case IORING_OP_POLL_REMOVE:
5411 if (sqe) {
5412 ret = io_poll_remove_prep(req, sqe);
5413 if (ret < 0)
5414 break;
5415 }
5416 ret = io_poll_remove(req);
5417 break;
5418 case IORING_OP_SYNC_FILE_RANGE:
5419 if (sqe) {
5420 ret = io_prep_sfr(req, sqe);
5421 if (ret < 0)
5422 break;
5423 }
5424 ret = io_sync_file_range(req, force_nonblock);
5425 break;
5426 case IORING_OP_SENDMSG:
5427 case IORING_OP_SEND:
5428 if (sqe) {
5429 ret = io_sendmsg_prep(req, sqe);
5430 if (ret < 0)
5431 break;
5432 }
5433 if (req->opcode == IORING_OP_SENDMSG)
5434 ret = io_sendmsg(req, force_nonblock);
5435 else
5436 ret = io_send(req, force_nonblock);
5437 break;
5438 case IORING_OP_RECVMSG:
5439 case IORING_OP_RECV:
5440 if (sqe) {
5441 ret = io_recvmsg_prep(req, sqe);
5442 if (ret)
5443 break;
5444 }
5445 if (req->opcode == IORING_OP_RECVMSG)
5446 ret = io_recvmsg(req, force_nonblock);
5447 else
5448 ret = io_recv(req, force_nonblock);
5449 break;
5450 case IORING_OP_TIMEOUT:
5451 if (sqe) {
5452 ret = io_timeout_prep(req, sqe, false);
5453 if (ret)
5454 break;
5455 }
5456 ret = io_timeout(req);
5457 break;
5458 case IORING_OP_TIMEOUT_REMOVE:
5459 if (sqe) {
5460 ret = io_timeout_remove_prep(req, sqe);
5461 if (ret)
5462 break;
5463 }
5464 ret = io_timeout_remove(req);
5465 break;
5466 case IORING_OP_ACCEPT:
5467 if (sqe) {
5468 ret = io_accept_prep(req, sqe);
5469 if (ret)
5470 break;
5471 }
5472 ret = io_accept(req, force_nonblock);
5473 break;
5474 case IORING_OP_CONNECT:
5475 if (sqe) {
5476 ret = io_connect_prep(req, sqe);
5477 if (ret)
5478 break;
5479 }
5480 ret = io_connect(req, force_nonblock);
5481 break;
5482 case IORING_OP_ASYNC_CANCEL:
5483 if (sqe) {
5484 ret = io_async_cancel_prep(req, sqe);
5485 if (ret)
5486 break;
5487 }
5488 ret = io_async_cancel(req);
5489 break;
5490 case IORING_OP_FALLOCATE:
5491 if (sqe) {
5492 ret = io_fallocate_prep(req, sqe);
5493 if (ret)
5494 break;
5495 }
5496 ret = io_fallocate(req, force_nonblock);
5497 break;
5498 case IORING_OP_OPENAT:
5499 if (sqe) {
5500 ret = io_openat_prep(req, sqe);
5501 if (ret)
5502 break;
5503 }
5504 ret = io_openat(req, force_nonblock);
5505 break;
5506 case IORING_OP_CLOSE:
5507 if (sqe) {
5508 ret = io_close_prep(req, sqe);
5509 if (ret)
5510 break;
5511 }
5512 ret = io_close(req, force_nonblock);
5513 break;
5514 case IORING_OP_FILES_UPDATE:
5515 if (sqe) {
5516 ret = io_files_update_prep(req, sqe);
5517 if (ret)
5518 break;
5519 }
5520 ret = io_files_update(req, force_nonblock);
5521 break;
5522 case IORING_OP_STATX:
5523 if (sqe) {
5524 ret = io_statx_prep(req, sqe);
5525 if (ret)
5526 break;
5527 }
5528 ret = io_statx(req, force_nonblock);
5529 break;
5530 case IORING_OP_FADVISE:
5531 if (sqe) {
5532 ret = io_fadvise_prep(req, sqe);
5533 if (ret)
5534 break;
5535 }
5536 ret = io_fadvise(req, force_nonblock);
5537 break;
5538 case IORING_OP_MADVISE:
5539 if (sqe) {
5540 ret = io_madvise_prep(req, sqe);
5541 if (ret)
5542 break;
5543 }
5544 ret = io_madvise(req, force_nonblock);
5545 break;
5546 case IORING_OP_OPENAT2:
5547 if (sqe) {
5548 ret = io_openat2_prep(req, sqe);
5549 if (ret)
5550 break;
5551 }
5552 ret = io_openat2(req, force_nonblock);
5553 break;
5554 case IORING_OP_EPOLL_CTL:
5555 if (sqe) {
5556 ret = io_epoll_ctl_prep(req, sqe);
5557 if (ret)
5558 break;
5559 }
5560 ret = io_epoll_ctl(req, force_nonblock);
5561 break;
5562 case IORING_OP_SPLICE:
5563 if (sqe) {
5564 ret = io_splice_prep(req, sqe);
5565 if (ret < 0)
5566 break;
5567 }
5568 ret = io_splice(req, force_nonblock);
5569 break;
5570 case IORING_OP_PROVIDE_BUFFERS:
5571 if (sqe) {
5572 ret = io_provide_buffers_prep(req, sqe);
5573 if (ret)
5574 break;
5575 }
5576 ret = io_provide_buffers(req, force_nonblock);
5577 break;
5578 case IORING_OP_REMOVE_BUFFERS:
5579 if (sqe) {
5580 ret = io_remove_buffers_prep(req, sqe);
5581 if (ret)
5582 break;
5583 }
5584 ret = io_remove_buffers(req, force_nonblock);
5585 break;
5586 case IORING_OP_TEE:
5587 if (sqe) {
5588 ret = io_tee_prep(req, sqe);
5589 if (ret < 0)
5590 break;
5591 }
5592 ret = io_tee(req, force_nonblock);
5593 break;
5594 default:
5595 ret = -EINVAL;
5596 break;
5597 }
5598
5599 if (ret)
5600 return ret;
5601
5602 /* If the op doesn't have a file, we're not polling for it */
5603 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
5604 const bool in_async = io_wq_current_is_worker();
5605
5606 if (req->result == -EAGAIN)
5607 return -EAGAIN;
5608
5609 /* workqueue context doesn't hold uring_lock, grab it now */
5610 if (in_async)
5611 mutex_lock(&ctx->uring_lock);
5612
5613 io_iopoll_req_issued(req);
5614
5615 if (in_async)
5616 mutex_unlock(&ctx->uring_lock);
5617 }
5618
5619 return 0;
5620 }
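/*
 * Note on the sqe argument of io_issue_sqe(): it is non-NULL only on the
 * direct inline submission path, where the prep handlers still need to
 * run. Deferred requests, linked requests and io-wq retries
 * (io_wq_submit_work() below) pass NULL because io_req_defer_prep() has
 * already consumed the sqe, so only the issue step runs.
 */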
5621
5622 static void io_arm_async_linked_timeout(struct io_kiocb *req)
5623 {
5624 struct io_kiocb *link;
5625
5626 /* link head's timeout is queued in io_queue_async_work() */
5627 if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
5628 return;
5629
5630 link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
5631 io_queue_linked_timeout(link);
5632 }
5633
5634 static void io_wq_submit_work(struct io_wq_work **workptr)
5635 {
5636 struct io_wq_work *work = *workptr;
5637 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5638 int ret = 0;
5639
5640 io_arm_async_linked_timeout(req);
5641
5642 /* if NO_CANCEL is set, we must still run the work */
5643 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5644 IO_WQ_WORK_CANCEL) {
5645 ret = -ECANCELED;
5646 }
5647
5648 if (!ret) {
5649 do {
5650 ret = io_issue_sqe(req, NULL, false);
5651 /*
5652 * We can get EAGAIN for polled IO even though we're
5653 * forcing a sync submission from here, since we can't
5654 * wait for request slots on the block side.
5655 */
5656 if (ret != -EAGAIN)
5657 break;
5658 cond_resched();
5659 } while (1);
5660 }
5661
5662 if (ret) {
5663 req_set_fail_links(req);
5664 io_cqring_add_event(req, ret);
5665 io_put_req(req);
5666 }
5667
5668 io_steal_work(req, workptr);
5669 }
5670
5671 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5672 int index)
5673 {
5674 struct fixed_file_table *table;
5675
5676 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5677 return table->files[index & IORING_FILE_TABLE_MASK];
5678 }
5679
5680 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5681 int fd, struct file **out_file, bool fixed)
5682 {
5683 struct io_ring_ctx *ctx = req->ctx;
5684 struct file *file;
5685
5686 if (fixed) {
5687 if (unlikely(!ctx->file_data ||
5688 (unsigned) fd >= ctx->nr_user_files))
5689 return -EBADF;
5690 fd = array_index_nospec(fd, ctx->nr_user_files);
5691 file = io_file_from_index(ctx, fd);
5692 if (file) {
5693 req->fixed_file_refs = ctx->file_data->cur_refs;
5694 percpu_ref_get(req->fixed_file_refs);
5695 }
5696 } else {
5697 trace_io_uring_file_get(ctx, fd);
5698 file = __io_file_get(state, fd);
5699 }
5700
5701 if (file || io_op_defs[req->opcode].needs_file_no_error) {
5702 *out_file = file;
5703 return 0;
5704 }
5705 return -EBADF;
5706 }
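/*
 * For fixed files the descriptor is an index into ctx->file_data rather
 * than a process fd: the index is sanitized with array_index_nospec() and
 * the request pins the current fixed-file ref node, so the table can be
 * changed safely (IORING_OP_FILES_UPDATE) while requests are in flight.
 */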
5707
5708 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5709 int fd)
5710 {
5711 bool fixed;
5712
5713 fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5714 if (unlikely(!fixed && io_async_submit(req->ctx)))
5715 return -EBADF;
5716
5717 return io_file_get(state, req, fd, &req->file, fixed);
5718 }
5719
5720 static int io_grab_files(struct io_kiocb *req)
5721 {
5722 int ret = -EBADF;
5723 struct io_ring_ctx *ctx = req->ctx;
5724
5725 if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5726 return 0;
5727 if (!ctx->ring_file)
5728 return -EBADF;
5729
5730 rcu_read_lock();
5731 spin_lock_irq(&ctx->inflight_lock);
5732 /*
5733 * We use the f_ops->flush() handler to ensure that we can flush
5734 * out work accessing these files if the fd is closed. Check if
5735 * the fd has changed since we started down this path, and disallow
5736 * this operation if it has.
5737 */
5738 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5739 list_add(&req->inflight_entry, &ctx->inflight_list);
5740 req->flags |= REQ_F_INFLIGHT;
5741 req->work.files = current->files;
5742 ret = 0;
5743 }
5744 spin_unlock_irq(&ctx->inflight_lock);
5745 rcu_read_unlock();
5746
5747 return ret;
5748 }
5749
5750 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
5751 {
5752 struct io_timeout_data *data = container_of(timer,
5753 struct io_timeout_data, timer);
5754 struct io_kiocb *req = data->req;
5755 struct io_ring_ctx *ctx = req->ctx;
5756 struct io_kiocb *prev = NULL;
5757 unsigned long flags;
5758
5759 spin_lock_irqsave(&ctx->completion_lock, flags);
5760
5761 /*
5762 * We don't expect the list to be empty; that will only happen if we
5763 * race with the completion of the linked work.
5764 */
5765 if (!list_empty(&req->link_list)) {
5766 prev = list_entry(req->link_list.prev, struct io_kiocb,
5767 link_list);
5768 if (refcount_inc_not_zero(&prev->refs)) {
5769 list_del_init(&req->link_list);
5770 prev->flags &= ~REQ_F_LINK_TIMEOUT;
5771 } else
5772 prev = NULL;
5773 }
5774
5775 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5776
5777 if (prev) {
5778 req_set_fail_links(prev);
5779 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
5780 io_put_req(prev);
5781 } else {
5782 io_cqring_add_event(req, -ETIME);
5783 io_put_req(req);
5784 }
5785 return HRTIMER_NORESTART;
5786 }
5787
5788 static void io_queue_linked_timeout(struct io_kiocb *req)
5789 {
5790 struct io_ring_ctx *ctx = req->ctx;
5791
5792 /*
5793 * If the list is now empty, then our linked request finished before
5794 * we got a chance to set up the timer.
5795 */
5796 spin_lock_irq(&ctx->completion_lock);
5797 if (!list_empty(&req->link_list)) {
5798 struct io_timeout_data *data = &req->io->timeout;
5799
5800 data->timer.function = io_link_timeout_fn;
5801 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5802 data->mode);
5803 }
5804 spin_unlock_irq(&ctx->completion_lock);
5805
5806 /* drop submission reference */
5807 io_put_req(req);
5808 }
5809
5810 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5811 {
5812 struct io_kiocb *nxt;
5813
5814 if (!(req->flags & REQ_F_LINK_HEAD))
5815 return NULL;
5816 /* for polled retry, if flag is set, we already went through here */
5817 if (req->flags & REQ_F_POLLED)
5818 return NULL;
5819
5820 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5821 link_list);
5822 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5823 return NULL;
5824
5825 req->flags |= REQ_F_LINK_TIMEOUT;
5826 return nxt;
5827 }
5828
5829 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5830 {
5831 struct io_kiocb *linked_timeout;
5832 struct io_kiocb *nxt;
5833 const struct cred *old_creds = NULL;
5834 int ret;
5835
5836 again:
5837 linked_timeout = io_prep_linked_timeout(req);
5838
5839 if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
5840 req->work.creds != current_cred()) {
5841 if (old_creds)
5842 revert_creds(old_creds);
5843 if (old_creds == req->work.creds)
5844 old_creds = NULL; /* restored original creds */
5845 else
5846 old_creds = override_creds(req->work.creds);
5847 }
5848
5849 ret = io_issue_sqe(req, sqe, true);
5850
5851 /*
5852 * We async punt it if the file wasn't marked NOWAIT, or if the file
5853 * doesn't support non-blocking read/write attempts
5854 */
5855 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5856 (req->flags & REQ_F_MUST_PUNT))) {
5857 if (io_arm_poll_handler(req)) {
5858 if (linked_timeout)
5859 io_queue_linked_timeout(linked_timeout);
5860 goto exit;
5861 }
5862 punt:
5863 io_req_init_async(req);
5864
5865 if (io_op_defs[req->opcode].file_table) {
5866 ret = io_grab_files(req);
5867 if (ret)
5868 goto err;
5869 }
5870
5871 /*
5872 * Queued up for async execution, worker will release
5873 * submit reference when the iocb is actually submitted.
5874 */
5875 io_queue_async_work(req);
5876 goto exit;
5877 }
5878
5879 err:
5880 nxt = NULL;
5881 /* drop submission reference */
5882 io_put_req_find_next(req, &nxt);
5883
5884 if (linked_timeout) {
5885 if (!ret)
5886 io_queue_linked_timeout(linked_timeout);
5887 else
5888 io_put_req(linked_timeout);
5889 }
5890
5891 /* and drop final reference, if we failed */
5892 if (ret) {
5893 io_cqring_add_event(req, ret);
5894 req_set_fail_links(req);
5895 io_put_req(req);
5896 }
5897 if (nxt) {
5898 req = nxt;
5899
5900 if (req->flags & REQ_F_FORCE_ASYNC)
5901 goto punt;
5902 goto again;
5903 }
5904 exit:
5905 if (old_creds)
5906 revert_creds(old_creds);
5907 }
5908
5909 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5910 {
5911 int ret;
5912
5913 ret = io_req_defer(req, sqe);
5914 if (ret) {
5915 if (ret != -EIOCBQUEUED) {
5916 fail_req:
5917 io_cqring_add_event(req, ret);
5918 req_set_fail_links(req);
5919 io_double_put_req(req);
5920 }
5921 } else if (req->flags & REQ_F_FORCE_ASYNC) {
5922 if (!req->io) {
5923 ret = -EAGAIN;
5924 if (io_alloc_async_ctx(req))
5925 goto fail_req;
5926 ret = io_req_defer_prep(req, sqe);
5927 if (unlikely(ret < 0))
5928 goto fail_req;
5929 }
5930
5931 /*
5932 * Never try inline submit if IOSQE_ASYNC is set, go straight
5933 * to async execution.
5934 */
5935 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5936 io_queue_async_work(req);
5937 } else {
5938 __io_queue_sqe(req, sqe);
5939 }
5940 }
5941
5942 static inline void io_queue_link_head(struct io_kiocb *req)
5943 {
5944 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5945 io_cqring_add_event(req, -ECANCELED);
5946 io_double_put_req(req);
5947 } else
5948 io_queue_sqe(req, NULL);
5949 }
5950
5951 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5952 struct io_kiocb **link)
5953 {
5954 struct io_ring_ctx *ctx = req->ctx;
5955 int ret;
5956
5957 /*
5958 * If we already have a head request, queue this one for async
5959 * submission once the head completes. If we don't have a head but
5960 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5961 * submitted sync once the chain is complete. If none of those
5962 * conditions are true (normal request), then just queue it.
5963 */
5964 if (*link) {
5965 struct io_kiocb *head = *link;
5966
5967 /*
5968 * Taking sequential execution of a link, draining both sides
5969 * of the link also fulfils IOSQE_IO_DRAIN semantics for all
5970 * requests in the link. So, it drains the head and the
5971 * next after the link request. The last one is done via
5972 * drain_next flag to persist the effect across calls.
5973 */
5974 if (req->flags & REQ_F_IO_DRAIN) {
5975 head->flags |= REQ_F_IO_DRAIN;
5976 ctx->drain_next = 1;
5977 }
5978 if (io_alloc_async_ctx(req))
5979 return -EAGAIN;
5980
5981 ret = io_req_defer_prep(req, sqe);
5982 if (ret) {
5983 /* fail even hard links since we don't submit */
5984 head->flags |= REQ_F_FAIL_LINK;
5985 return ret;
5986 }
5987 trace_io_uring_link(ctx, req, head);
5988 list_add_tail(&req->link_list, &head->link_list);
5989
5990 /* last request of a link, enqueue the link */
5991 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
5992 io_queue_link_head(head);
5993 *link = NULL;
5994 }
5995 } else {
5996 if (unlikely(ctx->drain_next)) {
5997 req->flags |= REQ_F_IO_DRAIN;
5998 ctx->drain_next = 0;
5999 }
6000 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6001 req->flags |= REQ_F_LINK_HEAD;
6002 INIT_LIST_HEAD(&req->link_list);
6003
6004 if (io_alloc_async_ctx(req))
6005 return -EAGAIN;
6006
6007 ret = io_req_defer_prep(req, sqe);
6008 if (ret)
6009 req->flags |= REQ_F_FAIL_LINK;
6010 *link = req;
6011 } else {
6012 io_queue_sqe(req, sqe);
6013 }
6014 }
6015
6016 return 0;
6017 }
6018
6019 /*
6020 * Batched submission is done, ensure local IO is flushed out.
6021 */
6022 static void io_submit_state_end(struct io_submit_state *state)
6023 {
6024 blk_finish_plug(&state->plug);
6025 io_state_file_put(state);
6026 if (state->free_reqs)
6027 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
6028 }
6029
6030 /*
6031 * Start submission side cache.
6032 */
6033 static void io_submit_state_start(struct io_submit_state *state,
6034 unsigned int max_ios)
6035 {
6036 blk_start_plug(&state->plug);
6037 #ifdef CONFIG_BLOCK
6038 state->plug.nowait = true;
6039 #endif
6040 state->free_reqs = 0;
6041 state->file = NULL;
6042 state->ios_left = max_ios;
6043 }
6044
6045 static void io_commit_sqring(struct io_ring_ctx *ctx)
6046 {
6047 struct io_rings *rings = ctx->rings;
6048
6049 /*
6050 * Ensure any loads from the SQEs are done at this point,
6051 * since once we write the new head, the application could
6052 * write new data to them.
6053 */
6054 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6055 }
6056
6057 /*
6058 * Fetch an sqe, if one is available. Note that the returned sqe points to memory
6059 * that is mapped by userspace. This means that care needs to be taken to
6060 * ensure that reads are stable, as we cannot rely on userspace always
6061 * being a good citizen. If members of the sqe are validated and then later
6062 * used, it's important that those reads are done through READ_ONCE() to
6063 * prevent a re-load down the line.
6064 */
6065 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6066 {
6067 u32 *sq_array = ctx->sq_array;
6068 unsigned head;
6069
6070 /*
6071 * The cached sq head (or cq tail) serves two purposes:
6072 *
6073 * 1) allows us to batch the cost of updating the user visible
6074 * head.
6075 * 2) allows the kernel side to track the head on its own, even
6076 * though the application is the one updating it.
6077 */
6078 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
6079 if (likely(head < ctx->sq_entries))
6080 return &ctx->sq_sqes[head];
6081
6082 /* drop invalid entries */
6083 ctx->cached_sq_dropped++;
6084 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6085 return NULL;
6086 }
6087
6088 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
6089 {
6090 ctx->cached_sq_head++;
6091 }
6092
6093 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
6094 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
6095 IOSQE_BUFFER_SELECT)
6096
6097 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6098 const struct io_uring_sqe *sqe,
6099 struct io_submit_state *state)
6100 {
6101 unsigned int sqe_flags;
6102 int id;
6103
6104 /*
6105 * All IO needs to record the previous position; for LINK vs DRAIN,
6106 * it can be used to mark the position of the first IO in the
6107 * link list.
6108 */
6109 req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
6110 req->opcode = READ_ONCE(sqe->opcode);
6111 req->user_data = READ_ONCE(sqe->user_data);
6112 req->io = NULL;
6113 req->file = NULL;
6114 req->ctx = ctx;
6115 req->flags = 0;
6116 /* one is dropped after submission, the other at completion */
6117 refcount_set(&req->refs, 2);
6118 req->task = current;
6119 req->result = 0;
6120
6121 if (unlikely(req->opcode >= IORING_OP_LAST))
6122 return -EINVAL;
6123
6124 if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
6125 return -EFAULT;
6126
6127 sqe_flags = READ_ONCE(sqe->flags);
6128 /* enforce forwards compatibility on users */
6129 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6130 return -EINVAL;
6131
6132 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6133 !io_op_defs[req->opcode].buffer_select)
6134 return -EOPNOTSUPP;
6135
6136 id = READ_ONCE(sqe->personality);
6137 if (id) {
6138 io_req_init_async(req);
6139 req->work.creds = idr_find(&ctx->personality_idr, id);
6140 if (unlikely(!req->work.creds))
6141 return -EINVAL;
6142 get_cred(req->work.creds);
6143 }
6144
6145 /* same numerical values as the corresponding REQ_F_*, safe to copy */
6146 req->flags |= sqe_flags;
6147
6148 if (!io_op_defs[req->opcode].needs_file)
6149 return 0;
6150
6151 return io_req_set_file(state, req, READ_ONCE(sqe->fd));
6152 }
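/*
 * io_init_req() is deliberately cheap: it copies only the fields every
 * request needs (opcode, user_data, flags, personality, file), rejects
 * unknown opcodes and flag bits for forwards compatibility, and leaves
 * opcode-specific sqe parsing to the later prep handlers.
 */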
6153
6154 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
6155 struct file *ring_file, int ring_fd)
6156 {
6157 struct io_submit_state state;
6158 struct io_kiocb *link = NULL;
6159 int i, submitted = 0;
6160
6161 /* if we have a backlog and couldn't flush it all, return BUSY */
6162 if (test_bit(0, &ctx->sq_check_overflow)) {
6163 if (!list_empty(&ctx->cq_overflow_list) &&
6164 !io_cqring_overflow_flush(ctx, false))
6165 return -EBUSY;
6166 }
6167
6168 /* make sure SQ entry isn't read before tail */
6169 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6170
6171 if (!percpu_ref_tryget_many(&ctx->refs, nr))
6172 return -EAGAIN;
6173
6174 io_submit_state_start(&state, nr);
6175
6176 ctx->ring_fd = ring_fd;
6177 ctx->ring_file = ring_file;
6178
6179 for (i = 0; i < nr; i++) {
6180 const struct io_uring_sqe *sqe;
6181 struct io_kiocb *req;
6182 int err;
6183
6184 sqe = io_get_sqe(ctx);
6185 if (unlikely(!sqe)) {
6186 io_consume_sqe(ctx);
6187 break;
6188 }
6189 req = io_alloc_req(ctx, &state);
6190 if (unlikely(!req)) {
6191 if (!submitted)
6192 submitted = -EAGAIN;
6193 break;
6194 }
6195
6196 err = io_init_req(ctx, req, sqe, &state);
6197 io_consume_sqe(ctx);
6198 /* will complete beyond this point, count as submitted */
6199 submitted++;
6200
6201 if (unlikely(err)) {
6202 fail_req:
6203 io_cqring_add_event(req, err);
6204 io_double_put_req(req);
6205 break;
6206 }
6207
6208 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6209 true, io_async_submit(ctx));
6210 err = io_submit_sqe(req, sqe, &link);
6211 if (err)
6212 goto fail_req;
6213 }
6214
6215 if (unlikely(submitted != nr)) {
6216 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6217
6218 percpu_ref_put_many(&ctx->refs, nr - ref_used);
6219 }
6220 if (link)
6221 io_queue_link_head(link);
6222 io_submit_state_end(&state);
6223
6224 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6225 io_commit_sqring(ctx);
6226
6227 return submitted;
6228 }
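/*
 * io_submit_sqes() is the core submission loop shared by the
 * io_uring_enter(2) syscall path and the SQPOLL thread below: it pulls up
 * to @nr entries off the SQ ring, initializes and queues each request,
 * and only commits the new SQ head once everything consumed has been
 * accounted for.
 */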
6229
6230 static int io_sq_thread(void *data)
6231 {
6232 struct io_ring_ctx *ctx = data;
6233 const struct cred *old_cred;
6234 DEFINE_WAIT(wait);
6235 unsigned long timeout;
6236 int ret = 0;
6237
6238 complete(&ctx->sq_thread_comp);
6239
6240 old_cred = override_creds(ctx->creds);
6241
6242 timeout = jiffies + ctx->sq_thread_idle;
6243 while (!kthread_should_park()) {
6244 unsigned int to_submit;
6245
6246 if (!list_empty(&ctx->poll_list)) {
6247 unsigned nr_events = 0;
6248
6249 mutex_lock(&ctx->uring_lock);
6250 if (!list_empty(&ctx->poll_list))
6251 io_iopoll_getevents(ctx, &nr_events, 0);
6252 else
6253 timeout = jiffies + ctx->sq_thread_idle;
6254 mutex_unlock(&ctx->uring_lock);
6255 }
6256
6257 to_submit = io_sqring_entries(ctx);
6258
6259 /*
6260 * If submit got -EBUSY, flag us as needing the application
6261 * to enter the kernel to reap and flush events.
6262 */
6263 if (!to_submit || ret == -EBUSY) {
6264 /*
6265 			 * Drop cur_mm before scheduling; we can't hold it for
6266 			 * long periods (or across schedule()). Do this before
6267 * adding ourselves to the waitqueue, as the unuse/drop
6268 * may sleep.
6269 */
6270 io_sq_thread_drop_mm(ctx);
6271
6272 /*
6273 * We're polling. If we're within the defined idle
6274 * period, then let us spin without work before going
6275 			 * to sleep. The exception is if we got -EBUSY submitting
6276 			 * more IO; in that case, wait for the application to
6277 			 * reap events and wake us up.
6278 */
6279 if (!list_empty(&ctx->poll_list) ||
6280 (!time_after(jiffies, timeout) && ret != -EBUSY &&
6281 !percpu_ref_is_dying(&ctx->refs))) {
6282 if (current->task_works)
6283 task_work_run();
6284 cond_resched();
6285 continue;
6286 }
6287
6288 prepare_to_wait(&ctx->sqo_wait, &wait,
6289 TASK_INTERRUPTIBLE);
6290
6291 /*
6292 			 * While doing polled IO, check poll_list again before going
6293 			 * to sleep. Requests may have been punted to an io worker
6294 			 * and will only be added to poll_list later, so without
6295 			 * this re-check a newly punted request could be missed
6296 			 * while we sleep.
6297 */
6298 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6299 !list_empty_careful(&ctx->poll_list)) {
6300 finish_wait(&ctx->sqo_wait, &wait);
6301 continue;
6302 }
6303
6304 /* Tell userspace we may need a wakeup call */
6305 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6306 /* make sure to read SQ tail after writing flags */
6307 smp_mb();
6308
6309 to_submit = io_sqring_entries(ctx);
6310 if (!to_submit || ret == -EBUSY) {
6311 if (kthread_should_park()) {
6312 finish_wait(&ctx->sqo_wait, &wait);
6313 break;
6314 }
6315 if (current->task_works) {
6316 task_work_run();
6317 finish_wait(&ctx->sqo_wait, &wait);
6318 continue;
6319 }
6320 if (signal_pending(current))
6321 flush_signals(current);
6322 schedule();
6323 finish_wait(&ctx->sqo_wait, &wait);
6324
6325 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6326 ret = 0;
6327 continue;
6328 }
6329 finish_wait(&ctx->sqo_wait, &wait);
6330
6331 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6332 }
6333
6334 mutex_lock(&ctx->uring_lock);
6335 if (likely(!percpu_ref_is_dying(&ctx->refs)))
6336 ret = io_submit_sqes(ctx, to_submit, NULL, -1);
6337 mutex_unlock(&ctx->uring_lock);
6338 timeout = jiffies + ctx->sq_thread_idle;
6339 }
6340
6341 if (current->task_works)
6342 task_work_run();
6343
6344 io_sq_thread_drop_mm(ctx);
6345 revert_creds(old_cred);
6346
6347 kthread_parkme();
6348
6349 return 0;
6350 }
6351
6352 struct io_wait_queue {
6353 struct wait_queue_entry wq;
6354 struct io_ring_ctx *ctx;
6355 unsigned to_wait;
6356 unsigned nr_timeouts;
6357 };
6358
6359 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6360 {
6361 struct io_ring_ctx *ctx = iowq->ctx;
6362
6363 /*
6364 * Wake up if we have enough events, or if a timeout occurred since we
6365 * started waiting. For timeouts, we always want to return to userspace,
6366 * regardless of event count.
6367 */
6368 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6369 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6370 }
6371
6372 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6373 int wake_flags, void *key)
6374 {
6375 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6376 wq);
6377
6378 /* use noflush == true, as we can't safely rely on locking context */
6379 if (!io_should_wake(iowq, true))
6380 return -1;
6381
6382 return autoremove_wake_function(curr, mode, wake_flags, key);
6383 }
6384
6385 /*
6386 * Wait until events become available, if we don't already have some. The
6387 * application must reap them itself, as they reside on the shared cq ring.
6388 */
6389 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6390 const sigset_t __user *sig, size_t sigsz)
6391 {
6392 struct io_wait_queue iowq = {
6393 .wq = {
6394 .private = current,
6395 .func = io_wake_function,
6396 .entry = LIST_HEAD_INIT(iowq.wq.entry),
6397 },
6398 .ctx = ctx,
6399 .to_wait = min_events,
6400 };
6401 struct io_rings *rings = ctx->rings;
6402 int ret = 0;
6403
6404 do {
6405 if (io_cqring_events(ctx, false) >= min_events)
6406 return 0;
6407 if (!current->task_works)
6408 break;
6409 task_work_run();
6410 } while (1);
6411
6412 if (sig) {
6413 #ifdef CONFIG_COMPAT
6414 if (in_compat_syscall())
6415 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6416 sigsz);
6417 else
6418 #endif
6419 ret = set_user_sigmask(sig, sigsz);
6420
6421 if (ret)
6422 return ret;
6423 }
6424
6425 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6426 trace_io_uring_cqring_wait(ctx, min_events);
6427 do {
6428 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6429 TASK_INTERRUPTIBLE);
6430 if (current->task_works)
6431 task_work_run();
6432 if (io_should_wake(&iowq, false))
6433 break;
6434 schedule();
6435 if (signal_pending(current)) {
6436 ret = -EINTR;
6437 break;
6438 }
6439 } while (1);
6440 finish_wait(&ctx->wait, &iowq.wq);
6441
6442 restore_saved_sigmask_unless(ret == -EINTR);
6443
6444 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6445 }
6446
6447 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6448 {
6449 #if defined(CONFIG_UNIX)
6450 if (ctx->ring_sock) {
6451 struct sock *sock = ctx->ring_sock->sk;
6452 struct sk_buff *skb;
6453
6454 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6455 kfree_skb(skb);
6456 }
6457 #else
6458 int i;
6459
6460 for (i = 0; i < ctx->nr_user_files; i++) {
6461 struct file *file;
6462
6463 file = io_file_from_index(ctx, i);
6464 if (file)
6465 fput(file);
6466 }
6467 #endif
6468 }
6469
6470 static void io_file_ref_kill(struct percpu_ref *ref)
6471 {
6472 struct fixed_file_data *data;
6473
6474 data = container_of(ref, struct fixed_file_data, refs);
6475 complete(&data->done);
6476 }
6477
6478 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6479 {
6480 struct fixed_file_data *data = ctx->file_data;
6481 struct fixed_file_ref_node *ref_node = NULL;
6482 unsigned nr_tables, i;
6483
6484 if (!data)
6485 return -ENXIO;
6486
6487 spin_lock(&data->lock);
6488 if (!list_empty(&data->ref_list))
6489 ref_node = list_first_entry(&data->ref_list,
6490 struct fixed_file_ref_node, node);
6491 spin_unlock(&data->lock);
6492 if (ref_node)
6493 percpu_ref_kill(&ref_node->refs);
6494
6495 percpu_ref_kill(&data->refs);
6496
6497 	/* wait for all ref nodes to complete */
6498 flush_delayed_work(&ctx->file_put_work);
6499 wait_for_completion(&data->done);
6500
6501 __io_sqe_files_unregister(ctx);
6502 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6503 for (i = 0; i < nr_tables; i++)
6504 kfree(data->table[i].files);
6505 kfree(data->table);
6506 percpu_ref_exit(&data->refs);
6507 kfree(data);
6508 ctx->file_data = NULL;
6509 ctx->nr_user_files = 0;
6510 return 0;
6511 }
6512
6513 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6514 {
6515 if (ctx->sqo_thread) {
6516 wait_for_completion(&ctx->sq_thread_comp);
6517 /*
6518 		 * The park is a bit of a workaround; without it we get
6519 		 * warning spew on shutdown with SQPOLL set and affinity
6520 * set to a single CPU.
6521 */
6522 kthread_park(ctx->sqo_thread);
6523 kthread_stop(ctx->sqo_thread);
6524 ctx->sqo_thread = NULL;
6525 }
6526 }
6527
6528 static void io_finish_async(struct io_ring_ctx *ctx)
6529 {
6530 io_sq_thread_stop(ctx);
6531
6532 if (ctx->io_wq) {
6533 io_wq_destroy(ctx->io_wq);
6534 ctx->io_wq = NULL;
6535 }
6536 }
6537
6538 #if defined(CONFIG_UNIX)
6539 /*
6540 * Ensure the UNIX gc is aware of our file set, so we are certain that
6541 * the io_uring can be safely unregistered on process exit, even if we have
6542 * loops in the file referencing.
6543 */
6544 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6545 {
6546 struct sock *sk = ctx->ring_sock->sk;
6547 struct scm_fp_list *fpl;
6548 struct sk_buff *skb;
6549 int i, nr_files;
6550
6551 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6552 if (!fpl)
6553 return -ENOMEM;
6554
6555 skb = alloc_skb(0, GFP_KERNEL);
6556 if (!skb) {
6557 kfree(fpl);
6558 return -ENOMEM;
6559 }
6560
6561 skb->sk = sk;
6562
6563 nr_files = 0;
6564 fpl->user = get_uid(ctx->user);
6565 for (i = 0; i < nr; i++) {
6566 struct file *file = io_file_from_index(ctx, i + offset);
6567
6568 if (!file)
6569 continue;
6570 fpl->fp[nr_files] = get_file(file);
6571 unix_inflight(fpl->user, fpl->fp[nr_files]);
6572 nr_files++;
6573 }
6574
6575 if (nr_files) {
6576 fpl->max = SCM_MAX_FD;
6577 fpl->count = nr_files;
6578 UNIXCB(skb).fp = fpl;
6579 skb->destructor = unix_destruct_scm;
6580 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6581 skb_queue_head(&sk->sk_receive_queue, skb);
6582
6583 for (i = 0; i < nr_files; i++)
6584 fput(fpl->fp[i]);
6585 } else {
6586 kfree_skb(skb);
6587 kfree(fpl);
6588 }
6589
6590 return 0;
6591 }
6592
6593 /*
6594 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6595 * causes regular reference counting to break down. We rely on the UNIX
6596 * garbage collection to take care of this problem for us.
6597 */
6598 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6599 {
6600 unsigned left, total;
6601 int ret = 0;
6602
6603 total = 0;
6604 left = ctx->nr_user_files;
6605 while (left) {
6606 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6607
6608 ret = __io_sqe_files_scm(ctx, this_files, total);
6609 if (ret)
6610 break;
6611 left -= this_files;
6612 total += this_files;
6613 }
6614
6615 if (!ret)
6616 return 0;
6617
6618 while (total < ctx->nr_user_files) {
6619 struct file *file = io_file_from_index(ctx, total);
6620
6621 if (file)
6622 fput(file);
6623 total++;
6624 }
6625
6626 return ret;
6627 }
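/*
 * Illustrative example (not from the original source): with SCM_MAX_FD
 * at 253, registering 600 files sends them in three SCM_RIGHTS chunks of
 * 253, 253 and 94 entries, at offsets 0, 253 and 506.
 */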
6628 #else
6629 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6630 {
6631 return 0;
6632 }
6633 #endif
6634
6635 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6636 unsigned nr_files)
6637 {
6638 int i;
6639
6640 for (i = 0; i < nr_tables; i++) {
6641 struct fixed_file_table *table = &ctx->file_data->table[i];
6642 unsigned this_files;
6643
6644 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6645 table->files = kcalloc(this_files, sizeof(struct file *),
6646 GFP_KERNEL);
6647 if (!table->files)
6648 break;
6649 nr_files -= this_files;
6650 }
6651
6652 if (i == nr_tables)
6653 return 0;
6654
6655 for (i = 0; i < nr_tables; i++) {
6656 struct fixed_file_table *table = &ctx->file_data->table[i];
6657 kfree(table->files);
6658 }
6659 return 1;
6660 }
6661
6662 static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
6663 {
6664 #if defined(CONFIG_UNIX)
6665 struct sock *sock = ctx->ring_sock->sk;
6666 struct sk_buff_head list, *head = &sock->sk_receive_queue;
6667 struct sk_buff *skb;
6668 int i;
6669
6670 __skb_queue_head_init(&list);
6671
6672 /*
6673 * Find the skb that holds this file in its SCM_RIGHTS. When found,
6674 * remove this entry and rearrange the file array.
6675 */
6676 skb = skb_dequeue(head);
6677 while (skb) {
6678 struct scm_fp_list *fp;
6679
6680 fp = UNIXCB(skb).fp;
6681 for (i = 0; i < fp->count; i++) {
6682 int left;
6683
6684 if (fp->fp[i] != file)
6685 continue;
6686
6687 unix_notinflight(fp->user, fp->fp[i]);
6688 left = fp->count - 1 - i;
6689 if (left) {
6690 memmove(&fp->fp[i], &fp->fp[i + 1],
6691 left * sizeof(struct file *));
6692 }
6693 fp->count--;
6694 if (!fp->count) {
6695 kfree_skb(skb);
6696 skb = NULL;
6697 } else {
6698 __skb_queue_tail(&list, skb);
6699 }
6700 fput(file);
6701 file = NULL;
6702 break;
6703 }
6704
6705 if (!file)
6706 break;
6707
6708 __skb_queue_tail(&list, skb);
6709
6710 skb = skb_dequeue(head);
6711 }
6712
6713 if (skb_peek(&list)) {
6714 spin_lock_irq(&head->lock);
6715 while ((skb = __skb_dequeue(&list)) != NULL)
6716 __skb_queue_tail(head, skb);
6717 spin_unlock_irq(&head->lock);
6718 }
6719 #else
6720 fput(file);
6721 #endif
6722 }
6723
6724 struct io_file_put {
6725 struct list_head list;
6726 struct file *file;
6727 };
6728
6729 static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
6730 {
6731 struct fixed_file_data *file_data = ref_node->file_data;
6732 struct io_ring_ctx *ctx = file_data->ctx;
6733 struct io_file_put *pfile, *tmp;
6734
6735 list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6736 list_del(&pfile->list);
6737 io_ring_file_put(ctx, pfile->file);
6738 kfree(pfile);
6739 }
6740
6741 spin_lock(&file_data->lock);
6742 list_del(&ref_node->node);
6743 spin_unlock(&file_data->lock);
6744
6745 percpu_ref_exit(&ref_node->refs);
6746 kfree(ref_node);
6747 percpu_ref_put(&file_data->refs);
6748 }
6749
6750 static void io_file_put_work(struct work_struct *work)
6751 {
6752 struct io_ring_ctx *ctx;
6753 struct llist_node *node;
6754
6755 ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
6756 node = llist_del_all(&ctx->file_put_llist);
6757
6758 while (node) {
6759 struct fixed_file_ref_node *ref_node;
6760 struct llist_node *next = node->next;
6761
6762 ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
6763 __io_file_put_work(ref_node);
6764 node = next;
6765 }
6766 }
6767
6768 static void io_file_data_ref_zero(struct percpu_ref *ref)
6769 {
6770 struct fixed_file_ref_node *ref_node;
6771 struct io_ring_ctx *ctx;
6772 bool first_add;
6773 int delay = HZ;
6774
6775 ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6776 ctx = ref_node->file_data->ctx;
6777
6778 if (percpu_ref_is_dying(&ctx->file_data->refs))
6779 delay = 0;
6780
6781 first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
6782 if (!delay)
6783 mod_delayed_work(system_wq, &ctx->file_put_work, 0);
6784 else if (first_add)
6785 queue_delayed_work(system_wq, &ctx->file_put_work, delay);
6786 }
6787
6788 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6789 struct io_ring_ctx *ctx)
6790 {
6791 struct fixed_file_ref_node *ref_node;
6792
6793 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6794 if (!ref_node)
6795 return ERR_PTR(-ENOMEM);
6796
6797 if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6798 0, GFP_KERNEL)) {
6799 kfree(ref_node);
6800 return ERR_PTR(-ENOMEM);
6801 }
6802 INIT_LIST_HEAD(&ref_node->node);
6803 INIT_LIST_HEAD(&ref_node->file_list);
6804 ref_node->file_data = ctx->file_data;
6805 return ref_node;
6806 }
6807
6808 static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
6809 {
6810 percpu_ref_exit(&ref_node->refs);
6811 kfree(ref_node);
6812 }
6813
6814 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6815 unsigned nr_args)
6816 {
6817 __s32 __user *fds = (__s32 __user *) arg;
6818 unsigned nr_tables;
6819 struct file *file;
6820 int fd, ret = 0;
6821 unsigned i;
6822 struct fixed_file_ref_node *ref_node;
6823
6824 if (ctx->file_data)
6825 return -EBUSY;
6826 if (!nr_args)
6827 return -EINVAL;
6828 if (nr_args > IORING_MAX_FIXED_FILES)
6829 return -EMFILE;
6830
6831 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6832 if (!ctx->file_data)
6833 return -ENOMEM;
6834 ctx->file_data->ctx = ctx;
6835 init_completion(&ctx->file_data->done);
6836 INIT_LIST_HEAD(&ctx->file_data->ref_list);
6837 spin_lock_init(&ctx->file_data->lock);
6838
6839 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6840 ctx->file_data->table = kcalloc(nr_tables,
6841 sizeof(struct fixed_file_table),
6842 GFP_KERNEL);
6843 if (!ctx->file_data->table) {
6844 kfree(ctx->file_data);
6845 ctx->file_data = NULL;
6846 return -ENOMEM;
6847 }
6848
6849 if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6850 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6851 kfree(ctx->file_data->table);
6852 kfree(ctx->file_data);
6853 ctx->file_data = NULL;
6854 return -ENOMEM;
6855 }
6856
6857 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6858 percpu_ref_exit(&ctx->file_data->refs);
6859 kfree(ctx->file_data->table);
6860 kfree(ctx->file_data);
6861 ctx->file_data = NULL;
6862 return -ENOMEM;
6863 }
6864
6865 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6866 struct fixed_file_table *table;
6867 unsigned index;
6868
6869 ret = -EFAULT;
6870 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6871 break;
6872 /* allow sparse sets */
6873 if (fd == -1) {
6874 ret = 0;
6875 continue;
6876 }
6877
6878 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6879 index = i & IORING_FILE_TABLE_MASK;
6880 file = fget(fd);
6881
6882 ret = -EBADF;
6883 if (!file)
6884 break;
6885
6886 /*
6887 * Don't allow io_uring instances to be registered. If UNIX
6888 * isn't enabled, then this causes a reference cycle and this
6889 * instance can never get freed. If UNIX is enabled we'll
6890 * handle it just fine, but there's still no point in allowing
6891 * a ring fd as it doesn't support regular read/write anyway.
6892 */
6893 if (file->f_op == &io_uring_fops) {
6894 fput(file);
6895 break;
6896 }
6897 ret = 0;
6898 table->files[index] = file;
6899 }
6900
6901 if (ret) {
6902 for (i = 0; i < ctx->nr_user_files; i++) {
6903 file = io_file_from_index(ctx, i);
6904 if (file)
6905 fput(file);
6906 }
6907 for (i = 0; i < nr_tables; i++)
6908 kfree(ctx->file_data->table[i].files);
6909
6910 kfree(ctx->file_data->table);
6911 kfree(ctx->file_data);
6912 ctx->file_data = NULL;
6913 ctx->nr_user_files = 0;
6914 return ret;
6915 }
6916
6917 ret = io_sqe_files_scm(ctx);
6918 if (ret) {
6919 io_sqe_files_unregister(ctx);
6920 return ret;
6921 }
6922
6923 ref_node = alloc_fixed_file_ref_node(ctx);
6924 if (IS_ERR(ref_node)) {
6925 io_sqe_files_unregister(ctx);
6926 return PTR_ERR(ref_node);
6927 }
6928
6929 ctx->file_data->cur_refs = &ref_node->refs;
6930 spin_lock(&ctx->file_data->lock);
6931 list_add(&ref_node->node, &ctx->file_data->ref_list);
6932 spin_unlock(&ctx->file_data->lock);
6933 percpu_ref_get(&ctx->file_data->refs);
6934 return ret;
6935 }
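/*
 * Hedged userspace sketch (not part of this file): registering a fixed
 * file set via the io_uring_register(2) syscall, with -1 marking a
 * sparse (unused) slot as allowed above. ring_fd, fd0, fd1 and fd3 are
 * assumed to be valid descriptors.
 *
 *	int fds[4] = { fd0, fd1, -1, fd3 };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES,
 *		fds, 4);
 */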
6936
6937 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6938 int index)
6939 {
6940 #if defined(CONFIG_UNIX)
6941 struct sock *sock = ctx->ring_sock->sk;
6942 struct sk_buff_head *head = &sock->sk_receive_queue;
6943 struct sk_buff *skb;
6944
6945 /*
6946 * See if we can merge this file into an existing skb SCM_RIGHTS
6947 * file set. If there's no room, fall back to allocating a new skb
6948 * and filling it in.
6949 */
6950 spin_lock_irq(&head->lock);
6951 skb = skb_peek(head);
6952 if (skb) {
6953 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6954
6955 if (fpl->count < SCM_MAX_FD) {
6956 __skb_unlink(skb, head);
6957 spin_unlock_irq(&head->lock);
6958 fpl->fp[fpl->count] = get_file(file);
6959 unix_inflight(fpl->user, fpl->fp[fpl->count]);
6960 fpl->count++;
6961 spin_lock_irq(&head->lock);
6962 __skb_queue_head(head, skb);
6963 } else {
6964 skb = NULL;
6965 }
6966 }
6967 spin_unlock_irq(&head->lock);
6968
6969 if (skb) {
6970 fput(file);
6971 return 0;
6972 }
6973
6974 return __io_sqe_files_scm(ctx, 1, index);
6975 #else
6976 return 0;
6977 #endif
6978 }
6979
6980 static int io_queue_file_removal(struct fixed_file_data *data,
6981 struct file *file)
6982 {
6983 struct io_file_put *pfile;
6984 struct percpu_ref *refs = data->cur_refs;
6985 struct fixed_file_ref_node *ref_node;
6986
6987 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6988 if (!pfile)
6989 return -ENOMEM;
6990
6991 ref_node = container_of(refs, struct fixed_file_ref_node, refs);
6992 pfile->file = file;
6993 list_add(&pfile->list, &ref_node->file_list);
6994
6995 return 0;
6996 }
6997
6998 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6999 struct io_uring_files_update *up,
7000 unsigned nr_args)
7001 {
7002 struct fixed_file_data *data = ctx->file_data;
7003 struct fixed_file_ref_node *ref_node;
7004 struct file *file;
7005 __s32 __user *fds;
7006 int fd, i, err;
7007 __u32 done;
7008 bool needs_switch = false;
7009
7010 if (check_add_overflow(up->offset, nr_args, &done))
7011 return -EOVERFLOW;
7012 if (done > ctx->nr_user_files)
7013 return -EINVAL;
7014
7015 ref_node = alloc_fixed_file_ref_node(ctx);
7016 if (IS_ERR(ref_node))
7017 return PTR_ERR(ref_node);
7018
7019 done = 0;
7020 fds = u64_to_user_ptr(up->fds);
7021 while (nr_args) {
7022 struct fixed_file_table *table;
7023 unsigned index;
7024
7025 err = 0;
7026 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7027 err = -EFAULT;
7028 break;
7029 }
7030 i = array_index_nospec(up->offset, ctx->nr_user_files);
7031 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7032 index = i & IORING_FILE_TABLE_MASK;
7033 if (table->files[index]) {
7034 file = io_file_from_index(ctx, index);
7035 err = io_queue_file_removal(data, file);
7036 if (err)
7037 break;
7038 table->files[index] = NULL;
7039 needs_switch = true;
7040 }
7041 if (fd != -1) {
7042 file = fget(fd);
7043 if (!file) {
7044 err = -EBADF;
7045 break;
7046 }
7047 /*
7048 * Don't allow io_uring instances to be registered. If
7049 * UNIX isn't enabled, then this causes a reference
7050 * cycle and this instance can never get freed. If UNIX
7051 * is enabled we'll handle it just fine, but there's
7052 * still no point in allowing a ring fd as it doesn't
7053 * support regular read/write anyway.
7054 */
7055 if (file->f_op == &io_uring_fops) {
7056 fput(file);
7057 err = -EBADF;
7058 break;
7059 }
7060 table->files[index] = file;
7061 err = io_sqe_file_register(ctx, file, i);
7062 if (err)
7063 break;
7064 }
7065 nr_args--;
7066 done++;
7067 up->offset++;
7068 }
7069
7070 if (needs_switch) {
7071 percpu_ref_kill(data->cur_refs);
7072 spin_lock(&data->lock);
7073 list_add(&ref_node->node, &data->ref_list);
7074 data->cur_refs = &ref_node->refs;
7075 spin_unlock(&data->lock);
7076 percpu_ref_get(&ctx->file_data->refs);
7077 } else
7078 destroy_fixed_file_ref_node(ref_node);
7079
7080 return done ? done : err;
7081 }
7082
7083 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7084 unsigned nr_args)
7085 {
7086 struct io_uring_files_update up;
7087
7088 if (!ctx->file_data)
7089 return -ENXIO;
7090 if (!nr_args)
7091 return -EINVAL;
7092 if (copy_from_user(&up, arg, sizeof(up)))
7093 return -EFAULT;
7094 if (up.resv)
7095 return -EINVAL;
7096
7097 return __io_sqe_files_update(ctx, &up, nr_args);
7098 }
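/*
 * Hedged userspace sketch: replacing slot 2 of a previously registered
 * file set. new_fd and ring_fd are assumed-open descriptors; an fd of -1
 * would instead just clear the slot, matching the code above.
 *
 *	__s32 fds[1] = { new_fd };
 *	struct io_uring_files_update up = {
 *		.offset = 2,
 *		.fds = (unsigned long) fds,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES_UPDATE,
 *		&up, 1);
 */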
7099
7100 static void io_free_work(struct io_wq_work *work)
7101 {
7102 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7103
7104 /* Consider that io_steal_work() relies on this ref */
7105 io_put_req(req);
7106 }
7107
7108 static int io_init_wq_offload(struct io_ring_ctx *ctx,
7109 struct io_uring_params *p)
7110 {
7111 struct io_wq_data data;
7112 struct fd f;
7113 struct io_ring_ctx *ctx_attach;
7114 unsigned int concurrency;
7115 int ret = 0;
7116
7117 data.user = ctx->user;
7118 data.free_work = io_free_work;
7119 data.do_work = io_wq_submit_work;
7120
7121 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
7122 		/* Do QD, or 4 * CPUS, whichever is smaller */
7123 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7124
7125 ctx->io_wq = io_wq_create(concurrency, &data);
7126 if (IS_ERR(ctx->io_wq)) {
7127 ret = PTR_ERR(ctx->io_wq);
7128 ctx->io_wq = NULL;
7129 }
7130 return ret;
7131 }
7132
7133 f = fdget(p->wq_fd);
7134 if (!f.file)
7135 return -EBADF;
7136
7137 if (f.file->f_op != &io_uring_fops) {
7138 ret = -EINVAL;
7139 goto out_fput;
7140 }
7141
7142 ctx_attach = f.file->private_data;
7143 /* @io_wq is protected by holding the fd */
7144 if (!io_wq_get(ctx_attach->io_wq, &data)) {
7145 ret = -EINVAL;
7146 goto out_fput;
7147 }
7148
7149 ctx->io_wq = ctx_attach->io_wq;
7150 out_fput:
7151 fdput(f);
7152 return ret;
7153 }
7154
7155 static int io_sq_offload_start(struct io_ring_ctx *ctx,
7156 struct io_uring_params *p)
7157 {
7158 int ret;
7159
7160 mmgrab(current->mm);
7161 ctx->sqo_mm = current->mm;
7162
7163 if (ctx->flags & IORING_SETUP_SQPOLL) {
7164 ret = -EPERM;
7165 if (!capable(CAP_SYS_ADMIN))
7166 goto err;
7167
7168 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7169 if (!ctx->sq_thread_idle)
7170 ctx->sq_thread_idle = HZ;
7171
7172 if (p->flags & IORING_SETUP_SQ_AFF) {
7173 int cpu = p->sq_thread_cpu;
7174
7175 ret = -EINVAL;
7176 if (cpu >= nr_cpu_ids)
7177 goto err;
7178 if (!cpu_online(cpu))
7179 goto err;
7180
7181 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
7182 ctx, cpu,
7183 "io_uring-sq");
7184 } else {
7185 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
7186 "io_uring-sq");
7187 }
7188 if (IS_ERR(ctx->sqo_thread)) {
7189 ret = PTR_ERR(ctx->sqo_thread);
7190 ctx->sqo_thread = NULL;
7191 goto err;
7192 }
7193 wake_up_process(ctx->sqo_thread);
7194 } else if (p->flags & IORING_SETUP_SQ_AFF) {
7195 /* Can't have SQ_AFF without SQPOLL */
7196 ret = -EINVAL;
7197 goto err;
7198 }
7199
7200 ret = io_init_wq_offload(ctx, p);
7201 if (ret)
7202 goto err;
7203
7204 return 0;
7205 err:
7206 io_finish_async(ctx);
7207 mmdrop(ctx->sqo_mm);
7208 ctx->sqo_mm = NULL;
7209 return ret;
7210 }
7211
7212 static inline void __io_unaccount_mem(struct user_struct *user,
7213 unsigned long nr_pages)
7214 {
7215 atomic_long_sub(nr_pages, &user->locked_vm);
7216 }
7217
7218 static inline int __io_account_mem(struct user_struct *user,
7219 unsigned long nr_pages)
7220 {
7221 unsigned long page_limit, cur_pages, new_pages;
7222
7223 /* Don't allow more pages than we can safely lock */
7224 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7225
7226 do {
7227 cur_pages = atomic_long_read(&user->locked_vm);
7228 new_pages = cur_pages + nr_pages;
7229 if (new_pages > page_limit)
7230 return -ENOMEM;
7231 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7232 new_pages) != cur_pages);
7233
7234 return 0;
7235 }
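/*
 * Worked example (illustrative): with RLIMIT_MEMLOCK at 64 KiB and 4 KiB
 * pages, page_limit above is 65536 >> 12 == 16 pages, so any request
 * that would push locked_vm past 16 pages fails with -ENOMEM.
 */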
7236
7237 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7238 enum io_mem_account acct)
7239 {
7240 if (ctx->limit_mem)
7241 __io_unaccount_mem(ctx->user, nr_pages);
7242
7243 if (ctx->sqo_mm) {
7244 if (acct == ACCT_LOCKED)
7245 ctx->sqo_mm->locked_vm -= nr_pages;
7246 else if (acct == ACCT_PINNED)
7247 atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
7248 }
7249 }
7250
7251 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7252 enum io_mem_account acct)
7253 {
7254 int ret;
7255
7256 if (ctx->limit_mem) {
7257 ret = __io_account_mem(ctx->user, nr_pages);
7258 if (ret)
7259 return ret;
7260 }
7261
7262 if (ctx->sqo_mm) {
7263 if (acct == ACCT_LOCKED)
7264 ctx->sqo_mm->locked_vm += nr_pages;
7265 else if (acct == ACCT_PINNED)
7266 atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
7267 }
7268
7269 return 0;
7270 }
7271
7272 static void io_mem_free(void *ptr)
7273 {
7274 struct page *page;
7275
7276 if (!ptr)
7277 return;
7278
7279 page = virt_to_head_page(ptr);
7280 if (put_page_testzero(page))
7281 free_compound_page(page);
7282 }
7283
7284 static void *io_mem_alloc(size_t size)
7285 {
7286 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7287 __GFP_NORETRY;
7288
7289 return (void *) __get_free_pages(gfp_flags, get_order(size));
7290 }
7291
7292 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7293 size_t *sq_offset)
7294 {
7295 struct io_rings *rings;
7296 size_t off, sq_array_size;
7297
7298 off = struct_size(rings, cqes, cq_entries);
7299 if (off == SIZE_MAX)
7300 return SIZE_MAX;
7301
7302 #ifdef CONFIG_SMP
7303 off = ALIGN(off, SMP_CACHE_BYTES);
7304 if (off == 0)
7305 return SIZE_MAX;
7306 #endif
7307
7308 sq_array_size = array_size(sizeof(u32), sq_entries);
7309 if (sq_array_size == SIZE_MAX)
7310 return SIZE_MAX;
7311
7312 if (check_add_overflow(off, sq_array_size, &off))
7313 return SIZE_MAX;
7314
7315 if (sq_offset)
7316 *sq_offset = off;
7317
7318 return off;
7319 }
7320
7321 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7322 {
7323 size_t pages;
7324
7325 pages = (size_t)1 << get_order(
7326 rings_size(sq_entries, cq_entries, NULL));
7327 pages += (size_t)1 << get_order(
7328 array_size(sizeof(struct io_uring_sqe), sq_entries));
7329
7330 return pages;
7331 }
7332
7333 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7334 {
7335 int i, j;
7336
7337 if (!ctx->user_bufs)
7338 return -ENXIO;
7339
7340 for (i = 0; i < ctx->nr_user_bufs; i++) {
7341 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7342
7343 for (j = 0; j < imu->nr_bvecs; j++)
7344 unpin_user_page(imu->bvec[j].bv_page);
7345
7346 io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
7347 kvfree(imu->bvec);
7348 imu->nr_bvecs = 0;
7349 }
7350
7351 kfree(ctx->user_bufs);
7352 ctx->user_bufs = NULL;
7353 ctx->nr_user_bufs = 0;
7354 return 0;
7355 }
7356
7357 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7358 void __user *arg, unsigned index)
7359 {
7360 struct iovec __user *src;
7361
7362 #ifdef CONFIG_COMPAT
7363 if (ctx->compat) {
7364 struct compat_iovec __user *ciovs;
7365 struct compat_iovec ciov;
7366
7367 ciovs = (struct compat_iovec __user *) arg;
7368 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7369 return -EFAULT;
7370
7371 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7372 dst->iov_len = ciov.iov_len;
7373 return 0;
7374 }
7375 #endif
7376 src = (struct iovec __user *) arg;
7377 if (copy_from_user(dst, &src[index], sizeof(*dst)))
7378 return -EFAULT;
7379 return 0;
7380 }
7381
7382 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
7383 unsigned nr_args)
7384 {
7385 struct vm_area_struct **vmas = NULL;
7386 struct page **pages = NULL;
7387 int i, j, got_pages = 0;
7388 int ret = -EINVAL;
7389
7390 if (ctx->user_bufs)
7391 return -EBUSY;
7392 if (!nr_args || nr_args > UIO_MAXIOV)
7393 return -EINVAL;
7394
7395 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
7396 GFP_KERNEL);
7397 if (!ctx->user_bufs)
7398 return -ENOMEM;
7399
7400 for (i = 0; i < nr_args; i++) {
7401 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7402 unsigned long off, start, end, ubuf;
7403 int pret, nr_pages;
7404 struct iovec iov;
7405 size_t size;
7406
7407 ret = io_copy_iov(ctx, &iov, arg, i);
7408 if (ret)
7409 goto err;
7410
7411 /*
7412 * Don't impose further limits on the size and buffer
7413 		 * constraints here; we'll return -EINVAL later, when IO is
7414 		 * submitted, if they are wrong.
7415 */
7416 ret = -EFAULT;
7417 if (!iov.iov_base || !iov.iov_len)
7418 goto err;
7419
7420 /* arbitrary limit, but we need something */
7421 if (iov.iov_len > SZ_1G)
7422 goto err;
7423
7424 ubuf = (unsigned long) iov.iov_base;
7425 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
7426 start = ubuf >> PAGE_SHIFT;
7427 nr_pages = end - start;
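/*
 * Worked example (illustrative, 4 KiB pages): for iov_base == 0x10000ff0
 * and iov_len == 0x20, the buffer straddles a page boundary, so
 * start == 0x10000, end == 0x10002 and nr_pages == 2.
 */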
7428
7429 ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
7430 if (ret)
7431 goto err;
7432
7433 ret = 0;
7434 if (!pages || nr_pages > got_pages) {
7435 kvfree(vmas);
7436 kvfree(pages);
7437 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
7438 GFP_KERNEL);
7439 vmas = kvmalloc_array(nr_pages,
7440 sizeof(struct vm_area_struct *),
7441 GFP_KERNEL);
7442 if (!pages || !vmas) {
7443 ret = -ENOMEM;
7444 io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
7445 goto err;
7446 }
7447 got_pages = nr_pages;
7448 }
7449
7450 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
7451 GFP_KERNEL);
7452 ret = -ENOMEM;
7453 if (!imu->bvec) {
7454 io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
7455 goto err;
7456 }
7457
7458 ret = 0;
7459 mmap_read_lock(current->mm);
7460 pret = pin_user_pages(ubuf, nr_pages,
7461 FOLL_WRITE | FOLL_LONGTERM,
7462 pages, vmas);
7463 if (pret == nr_pages) {
7464 /* don't support file backed memory */
7465 for (j = 0; j < nr_pages; j++) {
7466 struct vm_area_struct *vma = vmas[j];
7467
7468 if (vma->vm_file &&
7469 !is_file_hugepages(vma->vm_file)) {
7470 ret = -EOPNOTSUPP;
7471 break;
7472 }
7473 }
7474 } else {
7475 ret = pret < 0 ? pret : -EFAULT;
7476 }
7477 mmap_read_unlock(current->mm);
7478 if (ret) {
7479 /*
7480 			 * If we did a partial map, or found file-backed vmas,
7481 			 * release any pages we did get.
7482 */
7483 if (pret > 0)
7484 unpin_user_pages(pages, pret);
7485 io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
7486 kvfree(imu->bvec);
7487 goto err;
7488 }
7489
7490 off = ubuf & ~PAGE_MASK;
7491 size = iov.iov_len;
7492 for (j = 0; j < nr_pages; j++) {
7493 size_t vec_len;
7494
7495 vec_len = min_t(size_t, size, PAGE_SIZE - off);
7496 imu->bvec[j].bv_page = pages[j];
7497 imu->bvec[j].bv_len = vec_len;
7498 imu->bvec[j].bv_offset = off;
7499 off = 0;
7500 size -= vec_len;
7501 }
7502 /* store original address for later verification */
7503 imu->ubuf = ubuf;
7504 imu->len = iov.iov_len;
7505 imu->nr_bvecs = nr_pages;
7506
7507 ctx->nr_user_bufs++;
7508 }
7509 kvfree(pages);
7510 kvfree(vmas);
7511 return 0;
7512 err:
7513 kvfree(pages);
7514 kvfree(vmas);
7515 io_sqe_buffer_unregister(ctx);
7516 return ret;
7517 }
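/*
 * Hedged userspace sketch: registering one fixed buffer, which
 * IORING_OP_READ_FIXED/WRITE_FIXED requests can then reference via
 * sqe->buf_index == 0. ring_fd, buf and buf_len are assumed to exist.
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 */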
7518
7519 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7520 {
7521 __s32 __user *fds = arg;
7522 int fd;
7523
7524 if (ctx->cq_ev_fd)
7525 return -EBUSY;
7526
7527 if (copy_from_user(&fd, fds, sizeof(*fds)))
7528 return -EFAULT;
7529
7530 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7531 if (IS_ERR(ctx->cq_ev_fd)) {
7532 int ret = PTR_ERR(ctx->cq_ev_fd);
7533 ctx->cq_ev_fd = NULL;
7534 return ret;
7535 }
7536
7537 return 0;
7538 }
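/*
 * Hedged userspace sketch: registering an eventfd that gets signalled as
 * completions are posted. ring_fd is assumed to be an io_uring fd.
 *
 *	int efd = eventfd(0, 0);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 */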
7539
7540 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7541 {
7542 if (ctx->cq_ev_fd) {
7543 eventfd_ctx_put(ctx->cq_ev_fd);
7544 ctx->cq_ev_fd = NULL;
7545 return 0;
7546 }
7547
7548 return -ENXIO;
7549 }
7550
7551 static int __io_destroy_buffers(int id, void *p, void *data)
7552 {
7553 struct io_ring_ctx *ctx = data;
7554 struct io_buffer *buf = p;
7555
7556 __io_remove_buffers(ctx, buf, id, -1U);
7557 return 0;
7558 }
7559
7560 static void io_destroy_buffers(struct io_ring_ctx *ctx)
7561 {
7562 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7563 idr_destroy(&ctx->io_buffer_idr);
7564 }
7565
7566 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7567 {
7568 io_finish_async(ctx);
7569 if (ctx->sqo_mm) {
7570 mmdrop(ctx->sqo_mm);
7571 ctx->sqo_mm = NULL;
7572 }
7573
7574 io_iopoll_reap_events(ctx);
7575 io_sqe_buffer_unregister(ctx);
7576 io_sqe_files_unregister(ctx);
7577 io_eventfd_unregister(ctx);
7578 io_destroy_buffers(ctx);
7579 idr_destroy(&ctx->personality_idr);
7580
7581 #if defined(CONFIG_UNIX)
7582 if (ctx->ring_sock) {
7583 ctx->ring_sock->file = NULL; /* so that iput() is called */
7584 sock_release(ctx->ring_sock);
7585 }
7586 #endif
7587
7588 io_mem_free(ctx->rings);
7589 io_mem_free(ctx->sq_sqes);
7590
7591 percpu_ref_exit(&ctx->refs);
7592 io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
7593 ACCT_LOCKED);
7594 free_uid(ctx->user);
7595 put_cred(ctx->creds);
7596 kfree(ctx->cancel_hash);
7597 kmem_cache_free(req_cachep, ctx->fallback_req);
7598 kfree(ctx);
7599 }
7600
7601 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7602 {
7603 struct io_ring_ctx *ctx = file->private_data;
7604 __poll_t mask = 0;
7605
7606 poll_wait(file, &ctx->cq_wait, wait);
7607 /*
7608 * synchronizes with barrier from wq_has_sleeper call in
7609 * io_commit_cqring
7610 */
7611 smp_rmb();
7612 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7613 ctx->rings->sq_ring_entries)
7614 mask |= EPOLLOUT | EPOLLWRNORM;
7615 if (io_cqring_events(ctx, false))
7616 mask |= EPOLLIN | EPOLLRDNORM;
7617
7618 return mask;
7619 }
7620
7621 static int io_uring_fasync(int fd, struct file *file, int on)
7622 {
7623 struct io_ring_ctx *ctx = file->private_data;
7624
7625 return fasync_helper(fd, file, on, &ctx->cq_fasync);
7626 }
7627
7628 static int io_remove_personalities(int id, void *p, void *data)
7629 {
7630 struct io_ring_ctx *ctx = data;
7631 const struct cred *cred;
7632
7633 cred = idr_remove(&ctx->personality_idr, id);
7634 if (cred)
7635 put_cred(cred);
7636 return 0;
7637 }
7638
7639 static void io_ring_exit_work(struct work_struct *work)
7640 {
7641 struct io_ring_ctx *ctx;
7642
7643 ctx = container_of(work, struct io_ring_ctx, exit_work);
7644 if (ctx->rings)
7645 io_cqring_overflow_flush(ctx, true);
7646
7647 /*
7648 * If we're doing polled IO and end up having requests being
7649 * submitted async (out-of-line), then completions can come in while
7650 * we're waiting for refs to drop. We need to reap these manually,
7651 * as nobody else will be looking for them.
7652 */
7653 while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
7654 io_iopoll_reap_events(ctx);
7655 if (ctx->rings)
7656 io_cqring_overflow_flush(ctx, true);
7657 }
7658 io_ring_ctx_free(ctx);
7659 }
7660
7661 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7662 {
7663 mutex_lock(&ctx->uring_lock);
7664 percpu_ref_kill(&ctx->refs);
7665 mutex_unlock(&ctx->uring_lock);
7666
7667 io_kill_timeouts(ctx);
7668 io_poll_remove_all(ctx);
7669
7670 if (ctx->io_wq)
7671 io_wq_cancel_all(ctx->io_wq);
7672
7673 io_iopoll_reap_events(ctx);
7674 /* if we failed setting up the ctx, we might not have any rings */
7675 if (ctx->rings)
7676 io_cqring_overflow_flush(ctx, true);
7677 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
7678 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7679 queue_work(system_wq, &ctx->exit_work);
7680 }
7681
7682 static int io_uring_release(struct inode *inode, struct file *file)
7683 {
7684 struct io_ring_ctx *ctx = file->private_data;
7685
7686 file->private_data = NULL;
7687 io_ring_ctx_wait_and_kill(ctx);
7688 return 0;
7689 }
7690
7691 static bool io_wq_files_match(struct io_wq_work *work, void *data)
7692 {
7693 struct files_struct *files = data;
7694
7695 return work->files == files;
7696 }
7697
7698 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7699 struct files_struct *files)
7700 {
7701 if (list_empty_careful(&ctx->inflight_list))
7702 return;
7703
7704 	/* cancel all at once, should be faster than doing it one by one */
7705 io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
7706
7707 while (!list_empty_careful(&ctx->inflight_list)) {
7708 struct io_kiocb *cancel_req = NULL, *req;
7709 DEFINE_WAIT(wait);
7710
7711 spin_lock_irq(&ctx->inflight_lock);
7712 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
7713 if (req->work.files != files)
7714 continue;
7715 /* req is being completed, ignore */
7716 if (!refcount_inc_not_zero(&req->refs))
7717 continue;
7718 cancel_req = req;
7719 break;
7720 }
7721 if (cancel_req)
7722 prepare_to_wait(&ctx->inflight_wait, &wait,
7723 TASK_UNINTERRUPTIBLE);
7724 spin_unlock_irq(&ctx->inflight_lock);
7725
7726 /* We need to keep going until we don't find a matching req */
7727 if (!cancel_req)
7728 break;
7729
7730 if (cancel_req->flags & REQ_F_OVERFLOW) {
7731 spin_lock_irq(&ctx->completion_lock);
7732 list_del(&cancel_req->list);
7733 cancel_req->flags &= ~REQ_F_OVERFLOW;
7734 if (list_empty(&ctx->cq_overflow_list)) {
7735 clear_bit(0, &ctx->sq_check_overflow);
7736 clear_bit(0, &ctx->cq_check_overflow);
7737 }
7738 spin_unlock_irq(&ctx->completion_lock);
7739
7740 WRITE_ONCE(ctx->rings->cq_overflow,
7741 atomic_inc_return(&ctx->cached_cq_overflow));
7742
7743 /*
7744 * Put inflight ref and overflow ref. If that's
7745 * all we had, then we're done with this request.
7746 */
7747 if (refcount_sub_and_test(2, &cancel_req->refs)) {
7748 io_free_req(cancel_req);
7749 finish_wait(&ctx->inflight_wait, &wait);
7750 continue;
7751 }
7752 } else {
7753 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7754 io_put_req(cancel_req);
7755 }
7756
7757 schedule();
7758 finish_wait(&ctx->inflight_wait, &wait);
7759 }
7760 }
7761
7762 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
7763 {
7764 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7765 struct task_struct *task = data;
7766
7767 return req->task == task;
7768 }
7769
7770 static int io_uring_flush(struct file *file, void *data)
7771 {
7772 struct io_ring_ctx *ctx = file->private_data;
7773
7774 io_uring_cancel_files(ctx, data);
7775
7776 /*
7777 * If the task is going away, cancel work it may have pending
7778 */
7779 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7780 io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
7781
7782 return 0;
7783 }
7784
7785 static void *io_uring_validate_mmap_request(struct file *file,
7786 loff_t pgoff, size_t sz)
7787 {
7788 struct io_ring_ctx *ctx = file->private_data;
7789 loff_t offset = pgoff << PAGE_SHIFT;
7790 struct page *page;
7791 void *ptr;
7792
7793 switch (offset) {
7794 case IORING_OFF_SQ_RING:
7795 case IORING_OFF_CQ_RING:
7796 ptr = ctx->rings;
7797 break;
7798 case IORING_OFF_SQES:
7799 ptr = ctx->sq_sqes;
7800 break;
7801 default:
7802 return ERR_PTR(-EINVAL);
7803 }
7804
7805 page = virt_to_head_page(ptr);
7806 if (sz > page_size(page))
7807 return ERR_PTR(-EINVAL);
7808
7809 return ptr;
7810 }
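/*
 * Hedged userspace sketch: the offsets accepted above are the mmap
 * offsets from <linux/io_uring.h>. A typical SQ ring mapping looks
 * roughly like this (ring_fd and p from io_uring_setup() assumed):
 *
 *	void *sq_ptr = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *			    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			    ring_fd, IORING_OFF_SQ_RING);
 *
 * IORING_OFF_CQ_RING and IORING_OFF_SQES map the CQ ring and the SQE
 * array the same way.
 */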
7811
7812 #ifdef CONFIG_MMU
7813
7814 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7815 {
7816 size_t sz = vma->vm_end - vma->vm_start;
7817 unsigned long pfn;
7818 void *ptr;
7819
7820 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7821 if (IS_ERR(ptr))
7822 return PTR_ERR(ptr);
7823
7824 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7825 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7826 }
7827
7828 #else /* !CONFIG_MMU */
7829
7830 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7831 {
7832 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7833 }
7834
7835 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7836 {
7837 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7838 }
7839
7840 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
7841 unsigned long addr, unsigned long len,
7842 unsigned long pgoff, unsigned long flags)
7843 {
7844 void *ptr;
7845
7846 ptr = io_uring_validate_mmap_request(file, pgoff, len);
7847 if (IS_ERR(ptr))
7848 return PTR_ERR(ptr);
7849
7850 return (unsigned long) ptr;
7851 }
7852
7853 #endif /* !CONFIG_MMU */
7854
7855 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7856 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7857 size_t, sigsz)
7858 {
7859 struct io_ring_ctx *ctx;
7860 long ret = -EBADF;
7861 int submitted = 0;
7862 struct fd f;
7863
7864 if (current->task_works)
7865 task_work_run();
7866
7867 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
7868 return -EINVAL;
7869
7870 f = fdget(fd);
7871 if (!f.file)
7872 return -EBADF;
7873
7874 ret = -EOPNOTSUPP;
7875 if (f.file->f_op != &io_uring_fops)
7876 goto out_fput;
7877
7878 ret = -ENXIO;
7879 ctx = f.file->private_data;
7880 if (!percpu_ref_tryget(&ctx->refs))
7881 goto out_fput;
7882
7883 /*
7884 * For SQ polling, the thread will do all submissions and completions.
7885 * Just return the requested submit count, and wake the thread if
7886 * we were asked to.
7887 */
7888 ret = 0;
7889 if (ctx->flags & IORING_SETUP_SQPOLL) {
7890 if (!list_empty_careful(&ctx->cq_overflow_list))
7891 io_cqring_overflow_flush(ctx, false);
7892 if (flags & IORING_ENTER_SQ_WAKEUP)
7893 wake_up(&ctx->sqo_wait);
7894 submitted = to_submit;
7895 } else if (to_submit) {
7896 mutex_lock(&ctx->uring_lock);
7897 submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
7898 mutex_unlock(&ctx->uring_lock);
7899
7900 if (submitted != to_submit)
7901 goto out;
7902 }
7903 if (flags & IORING_ENTER_GETEVENTS) {
7904 unsigned nr_events = 0;
7905
7906 min_complete = min(min_complete, ctx->cq_entries);
7907
7908 /*
7909 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
7910 		 * space applications don't need to poll for completion events
7911 		 * again; they can rely on io_sq_thread to do the polling
7912 		 * work, which reduces CPU usage and uring_lock contention.
7913 */
7914 if (ctx->flags & IORING_SETUP_IOPOLL &&
7915 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7916 ret = io_iopoll_check(ctx, &nr_events, min_complete);
7917 } else {
7918 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7919 }
7920 }
7921
7922 out:
7923 percpu_ref_put(&ctx->refs);
7924 out_fput:
7925 fdput(f);
7926 return submitted ? submitted : ret;
7927 }
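/*
 * Hedged userspace sketch: submitting one SQE and waiting for one
 * completion in a single call (non-SQPOLL case); ring_fd is assumed to
 * come from io_uring_setup().
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 1, 1,
 *		IORING_ENTER_GETEVENTS, NULL, 0);
 */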
7928
7929 #ifdef CONFIG_PROC_FS
7930 static int io_uring_show_cred(int id, void *p, void *data)
7931 {
7932 const struct cred *cred = p;
7933 struct seq_file *m = data;
7934 struct user_namespace *uns = seq_user_ns(m);
7935 struct group_info *gi;
7936 kernel_cap_t cap;
7937 unsigned __capi;
7938 int g;
7939
7940 seq_printf(m, "%5d\n", id);
7941 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7942 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7943 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7944 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7945 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7946 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7947 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7948 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7949 seq_puts(m, "\n\tGroups:\t");
7950 gi = cred->group_info;
7951 for (g = 0; g < gi->ngroups; g++) {
7952 seq_put_decimal_ull(m, g ? " " : "",
7953 from_kgid_munged(uns, gi->gid[g]));
7954 }
7955 seq_puts(m, "\n\tCapEff:\t");
7956 cap = cred->cap_effective;
7957 CAP_FOR_EACH_U32(__capi)
7958 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7959 seq_putc(m, '\n');
7960 return 0;
7961 }
7962
7963 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7964 {
7965 int i;
7966
7967 mutex_lock(&ctx->uring_lock);
7968 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7969 for (i = 0; i < ctx->nr_user_files; i++) {
7970 struct fixed_file_table *table;
7971 struct file *f;
7972
7973 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7974 f = table->files[i & IORING_FILE_TABLE_MASK];
7975 if (f)
7976 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7977 else
7978 seq_printf(m, "%5u: <none>\n", i);
7979 }
7980 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7981 for (i = 0; i < ctx->nr_user_bufs; i++) {
7982 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7983
7984 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7985 (unsigned int) buf->len);
7986 }
7987 if (!idr_is_empty(&ctx->personality_idr)) {
7988 seq_printf(m, "Personalities:\n");
7989 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7990 }
7991 seq_printf(m, "PollList:\n");
7992 spin_lock_irq(&ctx->completion_lock);
7993 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7994 struct hlist_head *list = &ctx->cancel_hash[i];
7995 struct io_kiocb *req;
7996
7997 hlist_for_each_entry(req, list, hash_node)
7998 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7999 req->task->task_works != NULL);
8000 }
8001 spin_unlock_irq(&ctx->completion_lock);
8002 mutex_unlock(&ctx->uring_lock);
8003 }
8004
8005 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
8006 {
8007 struct io_ring_ctx *ctx = f->private_data;
8008
8009 if (percpu_ref_tryget(&ctx->refs)) {
8010 __io_uring_show_fdinfo(ctx, m);
8011 percpu_ref_put(&ctx->refs);
8012 }
8013 }
8014 #endif
8015
8016 static const struct file_operations io_uring_fops = {
8017 .release = io_uring_release,
8018 .flush = io_uring_flush,
8019 .mmap = io_uring_mmap,
8020 #ifndef CONFIG_MMU
8021 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
8022 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
8023 #endif
8024 .poll = io_uring_poll,
8025 .fasync = io_uring_fasync,
8026 #ifdef CONFIG_PROC_FS
8027 .show_fdinfo = io_uring_show_fdinfo,
8028 #endif
8029 };
8030
8031 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
8032 struct io_uring_params *p)
8033 {
8034 struct io_rings *rings;
8035 size_t size, sq_array_offset;
8036
8037 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
8038 if (size == SIZE_MAX)
8039 return -EOVERFLOW;
8040
8041 rings = io_mem_alloc(size);
8042 if (!rings)
8043 return -ENOMEM;
8044
8045 ctx->rings = rings;
8046 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
8047 rings->sq_ring_mask = p->sq_entries - 1;
8048 rings->cq_ring_mask = p->cq_entries - 1;
8049 rings->sq_ring_entries = p->sq_entries;
8050 rings->cq_ring_entries = p->cq_entries;
8051 ctx->sq_mask = rings->sq_ring_mask;
8052 ctx->cq_mask = rings->cq_ring_mask;
8053 ctx->sq_entries = rings->sq_ring_entries;
8054 ctx->cq_entries = rings->cq_ring_entries;
8055
8056 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
8057 if (size == SIZE_MAX) {
8058 io_mem_free(ctx->rings);
8059 ctx->rings = NULL;
8060 return -EOVERFLOW;
8061 }
8062
8063 ctx->sq_sqes = io_mem_alloc(size);
8064 if (!ctx->sq_sqes) {
8065 io_mem_free(ctx->rings);
8066 ctx->rings = NULL;
8067 return -ENOMEM;
8068 }
8069
8070 return 0;
8071 }
8072
8073 /*
8074  * Allocate an anonymous fd; this is what constitutes the application-
8075  * visible backing of an io_uring instance. The application mmaps this
8076 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
8077 * we have to tie this fd to a socket for file garbage collection purposes.
8078 */
8079 static int io_uring_get_fd(struct io_ring_ctx *ctx)
8080 {
8081 struct file *file;
8082 int ret;
8083
8084 #if defined(CONFIG_UNIX)
8085 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
8086 &ctx->ring_sock);
8087 if (ret)
8088 return ret;
8089 #endif
8090
8091 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
8092 if (ret < 0)
8093 goto err;
8094
8095 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
8096 O_RDWR | O_CLOEXEC);
8097 if (IS_ERR(file)) {
8098 put_unused_fd(ret);
8099 ret = PTR_ERR(file);
8100 goto err;
8101 }
8102
8103 #if defined(CONFIG_UNIX)
8104 ctx->ring_sock->file = file;
8105 #endif
8106 fd_install(ret, file);
8107 return ret;
8108 err:
8109 #if defined(CONFIG_UNIX)
8110 sock_release(ctx->ring_sock);
8111 ctx->ring_sock = NULL;
8112 #endif
8113 return ret;
8114 }
8115
8116 static int io_uring_create(unsigned entries, struct io_uring_params *p,
8117 struct io_uring_params __user *params)
8118 {
8119 struct user_struct *user = NULL;
8120 struct io_ring_ctx *ctx;
8121 bool limit_mem;
8122 int ret;
8123
8124 if (!entries)
8125 return -EINVAL;
8126 if (entries > IORING_MAX_ENTRIES) {
8127 if (!(p->flags & IORING_SETUP_CLAMP))
8128 return -EINVAL;
8129 entries = IORING_MAX_ENTRIES;
8130 }
8131
8132 /*
8133 * Use twice as many entries for the CQ ring. It's possible for the
8134 * application to drive a higher depth than the size of the SQ ring,
8135 * since the sqes are only used at submission time. This allows for
8136 * some flexibility in overcommitting a bit. If the application has
8137 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
8138 * of CQ ring entries manually.
8139 */
8140 p->sq_entries = roundup_pow_of_two(entries);
8141 if (p->flags & IORING_SETUP_CQSIZE) {
8142 /*
8143 * If IORING_SETUP_CQSIZE is set, we do the same roundup
8144 * to a power-of-two, if it isn't already. We do NOT impose
8145 * any cq vs sq ring sizing.
8146 */
8147 if (p->cq_entries < p->sq_entries)
8148 return -EINVAL;
8149 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
8150 if (!(p->flags & IORING_SETUP_CLAMP))
8151 return -EINVAL;
8152 p->cq_entries = IORING_MAX_CQ_ENTRIES;
8153 }
8154 p->cq_entries = roundup_pow_of_two(p->cq_entries);
8155 } else {
8156 p->cq_entries = 2 * p->sq_entries;
8157 }
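/*
 * Worked example (illustrative): for entries == 100 without
 * IORING_SETUP_CQSIZE, sq_entries becomes roundup_pow_of_two(100) == 128
 * and cq_entries becomes 2 * 128 == 256.
 */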
8158
8159 user = get_uid(current_user());
8160 limit_mem = !capable(CAP_IPC_LOCK);
8161
8162 if (limit_mem) {
8163 ret = __io_account_mem(user,
8164 ring_pages(p->sq_entries, p->cq_entries));
8165 if (ret) {
8166 free_uid(user);
8167 return ret;
8168 }
8169 }
8170
8171 ctx = io_ring_ctx_alloc(p);
8172 if (!ctx) {
8173 if (limit_mem)
8174 __io_unaccount_mem(user, ring_pages(p->sq_entries,
8175 p->cq_entries));
8176 free_uid(user);
8177 return -ENOMEM;
8178 }
8179 ctx->compat = in_compat_syscall();
8180 ctx->user = user;
8181 ctx->creds = get_current_cred();
8182
8183 ret = io_allocate_scq_urings(ctx, p);
8184 if (ret)
8185 goto err;
8186
8187 ret = io_sq_offload_start(ctx, p);
8188 if (ret)
8189 goto err;
8190
8191 memset(&p->sq_off, 0, sizeof(p->sq_off));
8192 p->sq_off.head = offsetof(struct io_rings, sq.head);
8193 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
8194 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
8195 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
8196 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
8197 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
8198 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
8199
8200 memset(&p->cq_off, 0, sizeof(p->cq_off));
8201 p->cq_off.head = offsetof(struct io_rings, cq.head);
8202 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
8203 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
8204 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
8205 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
8206 p->cq_off.cqes = offsetof(struct io_rings, cqes);
8207 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
8208
8209 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
8210 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
8211 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
8212 IORING_FEAT_POLL_32BITS;
8213
8214 if (copy_to_user(params, p, sizeof(*p))) {
8215 ret = -EFAULT;
8216 goto err;
8217 }
8218 /*
8219 * Install ring fd as the very last thing, so we don't risk someone
8220 * having closed it before we finish setup
8221 */
8222 ret = io_uring_get_fd(ctx);
8223 if (ret < 0)
8224 goto err;
8225
8226 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
8227 io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
8228 ACCT_LOCKED);
8229 ctx->limit_mem = limit_mem;
8230 return ret;
8231 err:
8232 io_ring_ctx_wait_and_kill(ctx);
8233 return ret;
8234 }
8235
8236 /*
8237  * Sets up an io_uring context and returns the fd. The application asks for a
8238  * ring size; we return the actual sq/cq ring sizes (among other things) in the
8239 * params structure passed in.
8240 */
8241 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
8242 {
8243 struct io_uring_params p;
8244 int i;
8245
8246 if (copy_from_user(&p, params, sizeof(p)))
8247 return -EFAULT;
8248 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
8249 if (p.resv[i])
8250 return -EINVAL;
8251 }
8252
8253 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8254 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
8255 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
8256 return -EINVAL;
8257
8258 return io_uring_create(entries, &p, params);
8259 }
8260
8261 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
8262 struct io_uring_params __user *, params)
8263 {
8264 return io_uring_setup(entries, params);
8265 }
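/*
 * Hedged userspace sketch: creating a plain ring and reading back the
 * sizes actually chosen above.
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 256, &p);
 *	// on success, p.sq_entries and p.cq_entries hold the rounded-up sizes
 */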
8266
8267 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
8268 {
8269 struct io_uring_probe *p;
8270 size_t size;
8271 int i, ret;
8272
8273 size = struct_size(p, ops, nr_args);
8274 if (size == SIZE_MAX)
8275 return -EOVERFLOW;
8276 p = kzalloc(size, GFP_KERNEL);
8277 if (!p)
8278 return -ENOMEM;
8279
8280 ret = -EFAULT;
8281 if (copy_from_user(p, arg, size))
8282 goto out;
8283 ret = -EINVAL;
8284 if (memchr_inv(p, 0, size))
8285 goto out;
8286
8287 p->last_op = IORING_OP_LAST - 1;
8288 if (nr_args > IORING_OP_LAST)
8289 nr_args = IORING_OP_LAST;
8290
8291 for (i = 0; i < nr_args; i++) {
8292 p->ops[i].op = i;
8293 if (!io_op_defs[i].not_supported)
8294 p->ops[i].flags = IO_URING_OP_SUPPORTED;
8295 }
8296 p->ops_len = i;
8297
8298 ret = 0;
8299 if (copy_to_user(arg, p, size))
8300 ret = -EFAULT;
8301 out:
8302 kfree(p);
8303 return ret;
8304 }
8305
8306 static int io_register_personality(struct io_ring_ctx *ctx)
8307 {
8308 const struct cred *creds = get_current_cred();
8309 int id;
8310
8311 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
8312 USHRT_MAX, GFP_KERNEL);
8313 if (id < 0)
8314 put_cred(creds);
8315 return id;
8316 }
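/*
 * Hedged userspace sketch: registering the current credentials and using
 * the returned id in a later request's sqe->personality field (looked up
 * in personality_idr by io_init_req() above).
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	// later: sqe->personality = id;
 */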
8317
8318 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8319 {
8320 const struct cred *old_creds;
8321
8322 old_creds = idr_remove(&ctx->personality_idr, id);
8323 if (old_creds) {
8324 put_cred(old_creds);
8325 return 0;
8326 }
8327
8328 return -EINVAL;
8329 }
8330
8331 static bool io_register_op_must_quiesce(int op)
8332 {
8333 switch (op) {
8334 case IORING_UNREGISTER_FILES:
8335 case IORING_REGISTER_FILES_UPDATE:
8336 case IORING_REGISTER_PROBE:
8337 case IORING_REGISTER_PERSONALITY:
8338 case IORING_UNREGISTER_PERSONALITY:
8339 return false;
8340 default:
8341 return true;
8342 }
8343 }
8344
8345 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
8346 void __user *arg, unsigned nr_args)
8347 __releases(ctx->uring_lock)
8348 __acquires(ctx->uring_lock)
8349 {
8350 int ret;
8351
8352 /*
8353 * We're inside the ring mutex; if the ref is already dying, then
8354 * someone else killed the ctx or is already going through
8355 * io_uring_register().
8356 */
8357 if (percpu_ref_is_dying(&ctx->refs))
8358 return -ENXIO;
8359
8360 if (io_register_op_must_quiesce(opcode)) {
8361 percpu_ref_kill(&ctx->refs);
8362
8363 /*
8364 * Drop uring mutex before waiting for references to exit. If
8365 * another thread is currently inside io_uring_enter() it might
8366 * need to grab the uring_lock to make progress. If we hold it
8367 * here across the drain wait, then we can deadlock. It's safe
8368 * to drop the mutex here, since no new references will come in
8369 * after we've killed the percpu ref.
8370 */
8371 mutex_unlock(&ctx->uring_lock);
8372 ret = wait_for_completion_interruptible(&ctx->ref_comp);
8373 mutex_lock(&ctx->uring_lock);
8374 if (ret) {
8375 percpu_ref_resurrect(&ctx->refs);
8376 ret = -EINTR;
8377 goto out;
8378 }
8379 }
8380
8381 switch (opcode) {
8382 case IORING_REGISTER_BUFFERS:
8383 ret = io_sqe_buffer_register(ctx, arg, nr_args);
8384 break;
8385 case IORING_UNREGISTER_BUFFERS:
8386 ret = -EINVAL;
8387 if (arg || nr_args)
8388 break;
8389 ret = io_sqe_buffer_unregister(ctx);
8390 break;
8391 case IORING_REGISTER_FILES:
8392 ret = io_sqe_files_register(ctx, arg, nr_args);
8393 break;
8394 case IORING_UNREGISTER_FILES:
8395 ret = -EINVAL;
8396 if (arg || nr_args)
8397 break;
8398 ret = io_sqe_files_unregister(ctx);
8399 break;
8400 case IORING_REGISTER_FILES_UPDATE:
8401 ret = io_sqe_files_update(ctx, arg, nr_args);
8402 break;
8403 case IORING_REGISTER_EVENTFD:
8404 case IORING_REGISTER_EVENTFD_ASYNC:
8405 ret = -EINVAL;
8406 if (nr_args != 1)
8407 break;
8408 ret = io_eventfd_register(ctx, arg);
8409 if (ret)
8410 break;
8411 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
8412 ctx->eventfd_async = 1;
8413 else
8414 ctx->eventfd_async = 0;
8415 break;
8416 case IORING_UNREGISTER_EVENTFD:
8417 ret = -EINVAL;
8418 if (arg || nr_args)
8419 break;
8420 ret = io_eventfd_unregister(ctx);
8421 break;
8422 case IORING_REGISTER_PROBE:
8423 ret = -EINVAL;
8424 if (!arg || nr_args > 256)
8425 break;
8426 ret = io_probe(ctx, arg, nr_args);
8427 break;
8428 case IORING_REGISTER_PERSONALITY:
8429 ret = -EINVAL;
8430 if (arg || nr_args)
8431 break;
8432 ret = io_register_personality(ctx);
8433 break;
8434 case IORING_UNREGISTER_PERSONALITY:
8435 ret = -EINVAL;
8436 if (arg)
8437 break;
8438 ret = io_unregister_personality(ctx, nr_args);
8439 break;
8440 default:
8441 ret = -EINVAL;
8442 break;
8443 }
8444
8445 if (io_register_op_must_quiesce(opcode)) {
8446 /* bring the ctx back to life */
8447 percpu_ref_reinit(&ctx->refs);
8448 out:
8449 reinit_completion(&ctx->ref_comp);
8450 }
8451 return ret;
8452 }
8453
8454 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8455 void __user *, arg, unsigned int, nr_args)
8456 {
8457 struct io_ring_ctx *ctx;
8458 long ret = -EBADF;
8459 struct fd f;
8460
8461 f = fdget(fd);
8462 if (!f.file)
8463 return -EBADF;
8464
8465 ret = -EOPNOTSUPP;
8466 if (f.file->f_op != &io_uring_fops)
8467 goto out_fput;
8468
8469 ctx = f.file->private_data;
8470
8471 mutex_lock(&ctx->uring_lock);
8472 ret = __io_uring_register(ctx, opcode, arg, nr_args);
8473 mutex_unlock(&ctx->uring_lock);
8474 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8475 ctx->cq_ev_fd != NULL, ret);
8476 out_fput:
8477 fdput(f);
8478 return ret;
8479 }
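
/*
 * Illustrative only, compiled out: a minimal userspace sketch (raw syscalls,
 * no liburing) of the registration syscall itself, here pinning one fixed
 * buffer that READ_FIXED/WRITE_FIXED requests could reference by index. The
 * helper name register_one_buffer() is made up for this example; it assumes
 * __NR_io_uring_register and <linux/io_uring.h> from matching headers.
 */
#if 0
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <stdlib.h>
#include <unistd.h>

static int register_one_buffer(int ring_fd, size_t len)
{
	struct iovec iov;
	int ret;

	iov.iov_base = malloc(len);
	if (!iov.iov_base)
		return -1;
	iov.iov_len = len;

	/* One iovec per fixed buffer; nr_args is the number of iovecs */
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_BUFFERS, &iov, 1);
	if (ret < 0)
		free(iov.iov_base);
	return ret;
}
#endif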
8480
8481 static int __init io_uring_init(void)
8482 {
8483 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8484 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8485 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8486 } while (0)
8487
8488 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8489 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8490 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8491 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
8492 BUILD_BUG_SQE_ELEM(1, __u8, flags);
8493 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
8494 BUILD_BUG_SQE_ELEM(4, __s32, fd);
8495 BUILD_BUG_SQE_ELEM(8, __u64, off);
8496 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
8497 BUILD_BUG_SQE_ELEM(16, __u64, addr);
8498 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
8499 BUILD_BUG_SQE_ELEM(24, __u32, len);
8500 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
8501 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
8502 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8503 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
8504 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
8505 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
8506 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
8507 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
8508 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
8509 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
8510 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
8511 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
8512 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
8513 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
8514 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
8515 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
8516 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
8517 BUILD_BUG_SQE_ELEM(42, __u16, personality);
8518 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
8519
8520 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8521 BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8522 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8523 return 0;
8524 }
8525 __initcall(io_uring_init);