fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
  35  * data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/namei.h>
  75 #include <linux/fsnotify.h>
  76 #include <linux/fadvise.h>
  77 #include <linux/eventpoll.h>
  78 #include <linux/fs_struct.h>
  79 #include <linux/splice.h>
  80 #include <linux/task_work.h>
  81 #include <linux/pagemap.h>
  82 #include <linux/io_uring.h>
  83 #include <linux/blk-cgroup.h>
  84 #include <linux/audit.h>
  85
  86 #define CREATE_TRACE_POINTS
  87 #include <trace/events/io_uring.h>
  88
  89 #include <uapi/linux/io_uring.h>
  90
  91 #include "internal.h"
  92 #include "io-wq.h"
  93
  94 #define IORING_MAX_ENTRIES      32768
     /* Derived from the limit above: exactly twice as large. */
  95 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  96
  97 /*
  98  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  99  */
 100 #define IORING_FILE_TABLE_SHIFT 9
     /* Entries per table: 1 << IORING_FILE_TABLE_SHIFT, i.e. 512. */
 101 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
     /*
      * Low-bit mask (entries per table - 1).
      * NOTE(review): presumably selects the slot within one table; the
      * users are outside this chunk.
      */
 102 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
     /* 64 tables' worth of entries: 64 * 512 = 32768. */
 103 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
     /* Sum of the three *_LAST counts named in the expression. */
 104 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 105                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 106
     /*
      * OR of the six IOSQE_* flag bits listed in the expression. Where the
      * mask is checked is outside this chunk.
      */
 107 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 108                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 109                                 IOSQE_BUFFER_SELECT)
 110
 111 struct io_uring {
             /*
              * One head/tail index pair. Both members carry
              * ____cacheline_aligned_in_smp, so on SMP builds each index is
              * aligned to its own cache line. struct io_rings embeds two of
              * these, named "sq" and "cq".
              */
 112         u32 head ____cacheline_aligned_in_smp;
 113         u32 tail ____cacheline_aligned_in_smp;
 114 };
 115
 116 /*
 117  * This data is shared with the application through the mmap at offsets
 118  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 119  *
 120  * The offsets to the member fields are published through struct
 121  * io_sqring_offsets when calling io_uring_setup.
 122  */
 123 struct io_rings {
             /*
              * Every member lives in memory shared with the application, so
              * accesses fall under the READ_ONCE()/WRITE_ONCE() rule stated in
              * the comment at the top of this file.
              */
 124         /*
 125          * Head and tail offsets into the ring; the offsets need to be
 126          * masked to get valid indices.
 127          *
 128          * The kernel controls head of the sq ring and the tail of the cq ring,
 129          * and the application controls tail of the sq ring and the head of the
 130          * cq ring.
 131          */
 132         struct io_uring         sq, cq;
 133         /*
 134          * Bitmasks to apply to head and tail offsets (constant, equals
 135          * ring_entries - 1)
 136          */
 137         u32                     sq_ring_mask, cq_ring_mask;
 138         /* Ring sizes (constant, power of 2) */
 139         u32                     sq_ring_entries, cq_ring_entries;
 140         /*
 141          * Number of invalid entries dropped by the kernel due to
 142          * invalid index stored in array
 143          *
 144          * Written by the kernel, shouldn't be modified by the
 145          * application (i.e. get number of "new events" by comparing to
 146          * cached value).
 147          *
 148          * After a new SQ head value was read by the application this
 149          * counter includes all submissions that were dropped reaching
 150          * the new SQ head (and possibly more).
 151          */
 152         u32                     sq_dropped;
 153         /*
 154          * Runtime SQ flags
 155          *
 156          * Written by the kernel, shouldn't be modified by the
 157          * application.
 158          *
 159          * The application needs a full memory barrier before checking
 160          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 161          */
 162         u32                     sq_flags;
 163         /*
 164          * Runtime CQ flags
 165          *
 166          * Written by the application, shouldn't be modified by the
 167          * kernel.
 168          */
 169         u32                     cq_flags;
 170         /*
 171          * Number of completion events lost because the queue was full;
 172          * this should be avoided by the application by making sure
 173          * there are not more requests pending than there is space in
 174          * the completion queue.
 175          *
 176          * Written by the kernel, shouldn't be modified by the
 177          * application (i.e. get number of "new events" by comparing to
 178          * cached value).
 179          *
 180          * As completion events come in out of order this counter is not
 181          * ordered with any other data.
 182          */
 183         u32                     cq_overflow;
 184         /*
 185          * Ring buffer of completion events.
 186          *
 187          * The kernel writes completion events fresh every time they are
 188          * produced, so the application is allowed to modify pending
 189          * entries.
 190          */
             /*
              * Flexible array member: it contributes no size of its own and
              * has to remain the last member of the struct.
              * NOTE(review): the element count is presumably cq_ring_entries;
              * confirm at the allocation site, which is outside this chunk.
              */
 191         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 192 };
 193
 194 enum io_uring_cmd_flags {
             /*
              * The two values are distinct single bits (1 and 2), so they can
              * be OR-ed together in one flags word.
              * NOTE(review): going by the names, NONBLOCK asks for a
              * non-blocking attempt and COMPLETE_DEFER asks for deferred
              * completion; confirm against the consumers, which are outside
              * this chunk.
              */
 195         IO_URING_F_NONBLOCK             = 1,
 196         IO_URING_F_COMPLETE_DEFER       = 2,
 197 };
 198
 199 struct io_mapped_ubuf {
             /*
              * Element type of io_ring_ctx.user_bufs, which that struct
              * describes as "fixed mapped user buffers".
              * NOTE(review): ubuf/len look like the user address and byte
              * length of the buffer, and acct_pages a page count kept for
              * memory accounting; confirm at the registration code.
              */
 200         u64             ubuf;
 201         size_t          len;
             /* bio_vec array and its element count. */
 202         struct          bio_vec *bvec;
 203         unsigned int    nr_bvecs;
 204         unsigned long   acct_pages;
 205 };
 206
 207 struct io_ring_ctx; /* forward declaration; the definition follows the helper structs that point to it */
 208
 209 struct io_rsrc_put {
             /*
              * List entry carrying one resource pointer. Instances are handed
              * to the rsrc_put callback declared in struct fixed_rsrc_ref_node.
              * NOTE(review): "list" presumably links into that node's
              * rsrc_list; confirm at the list_add site.
              */
 210         struct list_head list;
             /* Same pointer, viewed either untyped or as a struct file. */
 211         union {
 212                 void *rsrc;
 213                 struct file *file;
 214         };
 215 };
 216
 217 struct fixed_rsrc_table {
             /*
              * Pointer to an array of struct file pointers.
              * NOTE(review): the array length is presumably
              * IORING_MAX_FILES_TABLE; confirm at the allocation site.
              */
 218         struct file             **files;
 219 };
 220
 221 struct fixed_rsrc_ref_node {
             /*
              * Reference-counted node tied to a fixed_rsrc_data. It can sit on
              * a regular list (node) and on a lock-less list (llist), and owns
              * a list head of its own (rsrc_list).
              * NOTE(review): node/llist presumably correspond to
              * io_ring_ctx.rsrc_ref_list and io_ring_ctx.rsrc_put_llist;
              * confirm where the node is queued.
              */
 222         struct percpu_ref               refs;
 223         struct list_head                node;
 224         struct list_head                rsrc_list;
             /* Back pointer to the resource data this node belongs to. */
 225         struct fixed_rsrc_data          *rsrc_data;
             /* Callback invoked with the ring context and one io_rsrc_put. */
 226         void                            (*rsrc_put)(struct io_ring_ctx *ctx,
 227                                                     struct io_rsrc_put *prsrc);
 228         struct llist_node               llist;
 229         bool                            done;
 230 };
 231
 232 struct fixed_rsrc_data {
             /*
              * Top-level state for one set of fixed resources;
              * io_ring_ctx.file_data points at an instance of this for the
              * fixed file set.
              */
             /* NOTE(review): presumably an array of tables; confirm at the allocation site. */
 233         struct fixed_rsrc_table         *table;
             /* Owning ring context. */
 234         struct io_ring_ctx              *ctx;
 235
             /* A ref node associated with this data (see fixed_rsrc_ref_node). */
 236         struct fixed_rsrc_ref_node      *node;
             /*
              * Reference count plus a completion.
              * NOTE(review): "done" is presumably completed once "refs" has
              * dropped to zero; confirm at the percpu_ref release callback.
              */
 237         struct percpu_ref               refs;
 238         struct completion               done;
 239 };
 240
 241 struct io_buffer {
             /*
              * One buffer descriptor: list linkage, a 64-bit address, a signed
              * 32-bit length and a 16-bit id. io_sr_msg.kbuf points at one of
              * these.
              * NOTE(review): "bid" presumably means buffer id, matching
              * io_provide_buf.bid.
              */
 242         struct list_head list;
 243         __u64 addr;
 244         __s32 len;
 245         __u16 bid;
 246 };
 247
 248 struct io_restriction {
             /*
              * Restriction state embedded in io_ring_ctx as "restrictions".
              * The two bitmaps hold one bit per register opcode
              * (IORING_REGISTER_LAST bits) and one per SQE opcode
              * (IORING_OP_LAST bits).
              */
 249         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 250         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
             /* SQE flag masks; u8 is wide enough only for 8 flag bits. */
 251         u8 sqe_flags_allowed;
 252         u8 sqe_flags_required;
             /* NOTE(review): presumably set once restrictions have been registered. */
 253         bool registered;
 254 };
 255
 256 struct io_sq_data {
             /*
              * Refcounted state that io_ring_ctx.sq_data points to "if using
              * sq thread polling". The ctx lists below hold the contexts that
              * use this instance.
              */
 257         refcount_t              refs;
 258         struct mutex            lock;
 259
 260         /* ctx's that are using this sqd */
 261         struct list_head        ctx_list;
 262         struct list_head        ctx_new_list;
             /* NOTE(review): presumably guards the two ctx lists above. */
 263         struct mutex            ctx_lock;
 264
             /* The task attached to this instance and a wait queue head. */
 265         struct task_struct      *thread;
 266         struct wait_queue_head  wait;
 267
 268         unsigned                sq_thread_idle;
 269 };
 270
 271 #define IO_IOPOLL_BATCH                 8
     /* Capacity of io_comp_state.reqs[]. */
 272 #define IO_COMPL_BATCH                  32
     /* Capacity of io_submit_state.reqs[]. */
 273 #define IO_REQ_CACHE_SIZE               32
     /* NOTE(review): users of IO_IOPOLL_BATCH and IO_REQ_ALLOC_BATCH are outside this chunk. */
 274 #define IO_REQ_ALLOC_BATCH              8
 275
 276 struct io_comp_state {
             /*
              * Completion batching state, embedded in io_submit_state as
              * "comp": a fixed array of up to IO_COMPL_BATCH request pointers
              * with a fill count, plus two free lists whose locking rules are
              * given on each member.
              */
 277         struct io_kiocb         *reqs[IO_COMPL_BATCH];
             /* NOTE(review): presumably the number of valid entries in reqs[]. */
 278         unsigned int            nr;
             /* NOTE(review): presumably the length of locked_free_list. */
 279         unsigned int            locked_free_nr;
 280         /* inline/task_work completion list, under ->uring_lock */
 281         struct list_head        free_list;
 282         /* IRQ completion list, under ->completion_lock */
 283         struct list_head        locked_free_list;
 284 };
 285
 286 struct io_submit_link {
             /*
              * Two request pointers, embedded in io_submit_state as "link".
              * NOTE(review): presumably the first and most recently added
              * request of the link chain being assembled; confirm in the
              * submission path.
              */
 287         struct io_kiocb         *head;
 288         struct io_kiocb         *last;
 289 };
 290
 291 struct io_submit_state {
             /*
              * Submission-time scratch state, embedded in io_ring_ctx as
              * "submit_state". It bundles a block plug, the link being built,
              * a request allocation cache, completion batching and a file
              * reference cache.
              */
 292         struct blk_plug         plug;
 293         struct io_submit_link   link;
 294
 295         /*
 296          * io_kiocb alloc cache
 297          */
 298         void                    *reqs[IO_REQ_CACHE_SIZE];
             /* NOTE(review): presumably the number of cached entries in reqs[]. */
 299         unsigned int            free_reqs;
 300
             /* NOTE(review): presumably records whether "plug" has been started. */
 301         bool                    plug_started;
 302
 303         /*
 304          * Batch completion logic
 305          */
 306         struct io_comp_state    comp;
 307
 308         /*
 309          * File reference cache
 310          */
 311         struct file             *file;
 312         unsigned int            fd;
 313         unsigned int            file_refs;
 314         unsigned int            ios_left;
 315 };
 316
 317 struct io_ring_ctx {
             /*
              * Kernel-side context for one ring. Several members are grouped
              * into anonymous structs that each carry
              * ____cacheline_aligned_in_smp, so on SMP builds every such group
              * starts on its own cache line.
              */
 318         struct {
 319                 struct percpu_ref       refs;
 320         } ____cacheline_aligned_in_smp;
 321
             /* Submission-side group: flags, one-bit state bits, SQ bookkeeping. */
 322         struct {
 323                 unsigned int            flags;
 324                 unsigned int            compat: 1;
 325                 unsigned int            limit_mem: 1;
 326                 unsigned int            cq_overflow_flushed: 1;
 327                 unsigned int            drain_next: 1;
 328                 unsigned int            eventfd_async: 1;
 329                 unsigned int            restricted: 1;
 330                 unsigned int            sqo_dead: 1;
 331
 332                 /*
 333                  * Ring buffer of indices into array of io_uring_sqe, which is
 334                  * mmapped by the application using the IORING_OFF_SQES offset.
 335                  *
 336                  * This indirection could e.g. be used to assign fixed
 337                  * io_uring_sqe entries to operations and only submit them to
 338                  * the queue when needed.
 339                  *
 340                  * The kernel modifies neither the indices array nor the entries
 341                  * array.
 342                  */
 343                 u32                     *sq_array;
 344                 unsigned                cached_sq_head;
 345                 unsigned                sq_entries;
 346                 unsigned                sq_mask;
 347                 unsigned                sq_thread_idle;
 348                 unsigned                cached_sq_dropped;
 349                 unsigned                cached_cq_overflow;
 350                 unsigned long           sq_check_overflow;
 351
 352                 struct list_head        defer_list;
 353                 struct list_head        timeout_list;
 354                 struct list_head        cq_overflow_list;
 355
 356                 struct io_uring_sqe     *sq_sqes;
 357         } ____cacheline_aligned_in_smp;
 358
             /* The ring mutex and a wait queue, on a cache line of their own. */
 359         struct {
 360                 struct mutex            uring_lock;
 361                 wait_queue_head_t       wait;
 362         } ____cacheline_aligned_in_smp;
 363
 364         struct io_submit_state          submit_state;
 365
             /* The ring data shared with the application (see struct io_rings). */
 366         struct io_rings *rings;
 367
 368         /* IO offload */
 369         struct io_wq            *io_wq;
 370
 371         /*
 372          * For SQPOLL usage - we hold a reference to the parent task, so we
 373          * have access to the ->files
 374          */
 375         struct task_struct      *sqo_task;
 376
 377         /* Only used for accounting purposes */
 378         struct mm_struct        *mm_account;
 379
 380 #ifdef CONFIG_BLK_CGROUP
 381         struct cgroup_subsys_state      *sqo_blkcg_css;
 382 #endif
 383
 384         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 385
 386         struct wait_queue_head  sqo_sq_wait;
 387         struct list_head        sqd_list;
 388
 389         /*
 390          * If used, fixed file set. Writers must ensure that ->refs is dead,
 391          * readers must ensure that ->refs is alive as long as the file* is
 392          * used. Only updated through io_uring_register(2).
 393          */
 394         struct fixed_rsrc_data  *file_data;
 395         unsigned                nr_user_files;
 396
 397         /* if used, fixed mapped user buffers */
 398         unsigned                nr_user_bufs;
 399         struct io_mapped_ubuf   *user_bufs;
 400
 401         struct user_struct      *user;
 402
 403         const struct cred       *creds;
 404
 405 #ifdef CONFIG_AUDIT
 406         kuid_t                  loginuid;
 407         unsigned int            sessionid;
 408 #endif
 409
 410         struct completion       ref_comp;
 411         struct completion       sq_thread_comp;
 412
 413 #if defined(CONFIG_UNIX)
 414         struct socket           *ring_sock;
 415 #endif
 416
 417         struct idr              io_buffer_idr;
 418
 419         struct idr              personality_idr;
 420
             /* Completion-side group: CQ bookkeeping and notification hooks. */
 421         struct {
 422                 unsigned                cached_cq_tail;
 423                 unsigned                cq_entries;
 424                 unsigned                cq_mask;
 425                 atomic_t                cq_timeouts;
 426                 unsigned                cq_last_tm_flush;
 427                 unsigned long           cq_check_overflow;
 428                 struct wait_queue_head  cq_wait;
 429                 struct fasync_struct    *cq_fasync;
 430                 struct eventfd_ctx      *cq_ev_fd;
 431         } ____cacheline_aligned_in_smp;
 432
             /* Completion lock plus the iopoll, cancel-hash and inflight tracking. */
 433         struct {
 434                 spinlock_t              completion_lock;
 435
 436                 /*
 437                  * ->iopoll_list is protected by the ctx->uring_lock for
 438                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 439                  * For SQPOLL, only the single threaded io_sq_thread() will
 440                  * manipulate the list, hence no extra locking is needed there.
 441                  */
 442                 struct list_head        iopoll_list;
 443                 struct hlist_head       *cancel_hash;
 444                 unsigned                cancel_hash_bits;
 445                 bool                    poll_multi_file;
 446
 447                 spinlock_t              inflight_lock;
 448                 struct list_head        inflight_list;
 449         } ____cacheline_aligned_in_smp;
 450
             /*
              * Deferred resource release state.
              * NOTE(review): rsrc_ref_lock presumably guards rsrc_ref_list;
              * confirm at the lock sites.
              */
 451         struct delayed_work             rsrc_put_work;
 452         struct llist_head               rsrc_put_llist;
 453         struct list_head                rsrc_ref_list;
 454         spinlock_t                      rsrc_ref_lock;
 455
 456         struct io_restriction           restrictions;
 457
 458         /* Keep this last, we don't need it for the fast path */
 459         struct work_struct              exit_work;
 460 };
 461
 462 /*
 463  * First field must be the file pointer in all the
 464  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 465  */
 466 struct io_poll_iocb {
             /*
              * Poll state. Used as io_kiocb.poll and also embedded in struct
              * async_poll. "file" stays first because every io_kiocb union
              * member aliases the same leading file pointer.
              */
 467         struct file                     *file;
             /* Wait queue head pointer paired with the "wait" entry below. */
 468         struct wait_queue_head          *head;
 469         __poll_t                        events;
 470         bool                            done;
 471         bool                            canceled;
 472         struct wait_queue_entry         wait;
 473 };
 474
 475 struct io_poll_remove {
             /*
              * Request data reachable as io_kiocb.poll_remove. "file" stays
              * first because all io_kiocb union members share that slot.
              * NOTE(review): "addr" presumably identifies the poll request
              * to remove; confirm in the prep handler.
              */
 476         struct file                     *file;
 477         u64                             addr;
 478 };
 479
 480 struct io_close {
             /*
              * Request data reachable as io_kiocb.close: the shared leading
              * file pointer plus an integer file descriptor.
              */
 481         struct file                     *file;
 482         int                             fd;
 483 };
 484
 485 struct io_timeout_data {
             /*
              * Timer state for a timeout: a back pointer to the request, the
              * hrtimer itself, and the timespec and hrtimer mode kept with it.
              * This struct is not a member of the io_kiocb union.
              * NOTE(review): presumably stored in io_kiocb.async_data;
              * confirm at the allocation site.
              */
 486         struct io_kiocb                 *req;
 487         struct hrtimer                  timer;
 488         struct timespec64               ts;
 489         enum hrtimer_mode               mode;
 490 };
 491
 492 struct io_accept {
             /*
              * Request data reachable as io_kiocb.accept. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 493         struct file                     *file;
             /* Both pointers are __user, i.e. they refer to userspace memory. */
 494         struct sockaddr __user          *addr;
 495         int __user                      *addr_len;
 496         int                             flags;
             /* NOTE(review): presumably an open-file limit captured at prep time; confirm. */
 497         unsigned long                   nofile;
 498 };
 499
 500 struct io_sync {
             /*
              * Request data reachable as io_kiocb.sync: the shared leading
              * file pointer, a length/offset pair (loff_t) and two int
              * parameters.
              */
 501         struct file                     *file;
 502         loff_t                          len;
 503         loff_t                          off;
 504         int                             flags;
 505         int                             mode;
 506 };
 507
 508 struct io_cancel {
             /*
              * Request data reachable as io_kiocb.cancel. "file" stays first
              * because all io_kiocb union members share that slot.
              * NOTE(review): "addr" presumably identifies the request to
              * cancel; confirm in the prep handler.
              */
 509         struct file                     *file;
 510         u64                             addr;
 511 };
 512
 513 struct io_timeout {
             /*
              * Request data reachable as io_kiocb.timeout. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 514         struct file                     *file;
 515         u32                             off;
 516         u32                             target_seq;
             /* NOTE(review): presumably links into io_ring_ctx.timeout_list; confirm. */
 517         struct list_head                list;
 518         /* head of the link, used by linked timeouts only */
 519         struct io_kiocb                 *head;
 520 };
 521
 522 struct io_timeout_rem {
             /*
              * Request data reachable as io_kiocb.timeout_rem. "file" stays
              * first because all io_kiocb union members share that slot.
              * NOTE(review): "addr" presumably identifies the target timeout
              * request; confirm in the prep handler.
              */
 523         struct file                     *file;
 524         u64                             addr;
 525
 526         /* timeout update */
 527         struct timespec64               ts;
 528         u32                             flags;
 529 };
 530
 531 struct io_rw {
             /*
              * Read/write request data reachable as io_kiocb.rw. The embedded
              * kiocb sits first, and its own first member is the file pointer,
              * so the io_kiocb union's leading-file rule holds without a
              * separate "file" member here.
              */
 532         /* NOTE: kiocb has the file as the first member, so don't do it here */
 533         struct kiocb                    kiocb;
 534         u64                             addr;
 535         u64                             len;
 536 };
 537
 538 struct io_connect {
             /*
              * Request data reachable as io_kiocb.connect. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 539         struct file                     *file;
             /* __user pointer to the socket address, with its length by value. */
 540         struct sockaddr __user          *addr;
 541         int                             addr_len;
 542 };
 543
 544 struct io_sr_msg {
             /*
              * Request data reachable as io_kiocb.sr_msg. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 545         struct file                     *file;
             /* One __user pointer, viewed either as a user_msghdr or a plain buffer. */
 546         union {
 547                 struct user_msghdr __user *umsg;
 548                 void __user             *buf;
 549         };
 550         int                             msg_flags;
             /* NOTE(review): presumably the buffer group id used to pick "kbuf"; confirm. */
 551         int                             bgid;
 552         size_t                          len;
 553         struct io_buffer                *kbuf;
 554 };
 555
 556 struct io_open {
             /*
              * Request data reachable as io_kiocb.open. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 557         struct file                     *file;
 558         int                             dfd;
             /* Kernel-side struct filename, not a raw user pointer. */
 559         struct filename                 *filename;
 560         struct open_how                 how;
             /* NOTE(review): presumably an open-file limit captured at prep time; confirm. */
 561         unsigned long                   nofile;
 562 };
 563
 564 struct io_rsrc_update {
             /*
              * Request data reachable as io_kiocb.rsrc_update. "file" stays
              * first because all io_kiocb union members share that slot.
              * NOTE(review): arg/nr_args/offset presumably describe a user
              * array, its element count and the first slot to update; confirm
              * in the prep handler.
              */
 565         struct file                     *file;
 566         u64                             arg;
 567         u32                             nr_args;
 568         u32                             offset;
 569 };
 570
 571 struct io_fadvise {
             /*
              * Request data reachable as io_kiocb.fadvise: the shared leading
              * file pointer, a 64-bit offset, a 32-bit length and the advice
              * value.
              */
 572         struct file                     *file;
 573         u64                             offset;
 574         u32                             len;
 575         u32                             advice;
 576 };
 577
 578 struct io_madvise {
             /*
              * Request data reachable as io_kiocb.madvise: the shared leading
              * file pointer, a 64-bit address, a 32-bit length and the advice
              * value.
              */
 579         struct file                     *file;
 580         u64                             addr;
 581         u32                             len;
 582         u32                             advice;
 583 };
 584
 585 struct io_epoll {
             /*
              * Request data reachable as io_kiocb.epoll. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 586         struct file                     *file;
 587         int                             epfd;
 588         int                             op;
 589         int                             fd;
             /* Held by value in the request, not as a pointer. */
 590         struct epoll_event              event;
 591 };
 592
 593 struct io_splice {
             /*
              * Request data reachable as io_kiocb.splice. Here the leading
              * pointer slot shared by all io_kiocb union members is
              * "file_out"; "file_in" is an additional file pointer.
              */
 594         struct file                     *file_out;
 595         struct file                     *file_in;
 596         loff_t                          off_out;
 597         loff_t                          off_in;
 598         u64                             len;
 599         unsigned int                    flags;
 600 };
 601
 602 struct io_provide_buf {
             /*
              * Request data reachable as io_kiocb.pbuf. The addr/len/bid types
              * match the same-named members of struct io_buffer.
              */
 603         struct file                     *file;
 604         __u64                           addr;
 605         __s32                           len;
             /* NOTE(review): presumably buffer group id, buffer count and first buffer id. */
 606         __u32                           bgid;
 607         __u16                           nbufs;
 608         __u16                           bid;
 609 };
 610
 611 struct io_statx {
             /*
              * Request data reachable as io_kiocb.statx. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 612         struct file                     *file;
 613         int                             dfd;
 614         unsigned int                    mask;
 615         unsigned int                    flags;
             /* Both are __user pointers: the path string and the result buffer. */
 616         const char __user               *filename;
 617         struct statx __user             *buffer;
 618 };
 619
 620 struct io_shutdown {
             /*
              * Request data reachable as io_kiocb.shutdown: the shared leading
              * file pointer plus the integer "how" argument.
              */
 621         struct file                     *file;
 622         int                             how;
 623 };
 624
 625 struct io_rename {
             /*
              * Request data reachable as io_kiocb.rename. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 626         struct file                     *file;
 627         int                             old_dfd;
 628         int                             new_dfd;
             /* Kernel-side struct filename objects, not raw user pointers. */
 629         struct filename                 *oldpath;
 630         struct filename                 *newpath;
 631         int                             flags;
 632 };
 633
 634 struct io_unlink {
             /*
              * Request data reachable as io_kiocb.unlink. "file" stays first
              * because all io_kiocb union members share that slot.
              */
 635         struct file                     *file;
 636         int                             dfd;
 637         int                             flags;
             /* Kernel-side struct filename, not a raw user pointer. */
 638         struct filename                 *filename;
 639 };
 640
 641 struct io_completion {
             /*
              * Completion data reachable as io_kiocb.compl. It overlays the
              * per-opcode data in the same union, and io_kiocb notes that it
              * may be used "only after cleaning per-op data".
              */
 642         struct file                     *file;
 643         struct list_head                list;
 644         int                             cflags;
 645 };
 646
 647 struct io_async_connect {
             /*
              * Holds one sockaddr_storage by value.
              * NOTE(review): presumably the async_data payload for connect
              * requests, holding a kernel copy of the user address; confirm
              * against io_op_defs and the connect prep code.
              */
 648         struct sockaddr_storage         address;
 649 };
 650
 651 struct io_async_msghdr {
             /*
              * Message header state: an inline iovec array of UIO_FASTIOV
              * entries, an optional allocated iovec, the user address pointer,
              * the msghdr and storage for an address.
              * NOTE(review): presumably the async_data payload for
              * sendmsg/recvmsg requests; confirm against io_op_defs.
              */
 652         struct iovec                    fast_iov[UIO_FASTIOV];
 653         /* points to an allocated iov, if NULL we use fast_iov instead */
 654         struct iovec                    *free_iov;
 655         struct sockaddr __user          *uaddr;
 656         struct msghdr                   msg;
 657         struct sockaddr_storage         addr;
 658 };
 659
 660 struct io_async_rw {
             /*
              * Async read/write state. io_op_defs[] uses
              * sizeof(struct io_async_rw) as .async_size for the READV and
              * WRITEV entries.
              */
 661         struct iovec                    fast_iov[UIO_FASTIOV];
             /* NOTE(review): presumably an allocated iovec to free later, NULL when fast_iov is used. */
 662         const struct iovec              *free_iovec;
 663         struct iov_iter                 iter;
 664         size_t                          bytes_done;
 665         struct wait_page_queue          wpq;
 666 };
 667
 668 enum {
             /*
              * Bit positions for the REQ_F_* masks defined in the next enum.
              * The first six are set equal to the matching IOSQE_*_BIT values.
              */
 669         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 670         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 671         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 672         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 673         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 674         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 675
             /*
              * No initializers from here on, so each enumerator is the
              * previous one plus one, starting at REQ_F_BUFFER_SELECT_BIT + 1.
              */
 676         REQ_F_FAIL_LINK_BIT,
 677         REQ_F_INFLIGHT_BIT,
 678         REQ_F_CUR_POS_BIT,
 679         REQ_F_NOWAIT_BIT,
 680         REQ_F_LINK_TIMEOUT_BIT,
 681         REQ_F_ISREG_BIT,
 682         REQ_F_NEED_CLEANUP_BIT,
 683         REQ_F_POLLED_BIT,
 684         REQ_F_BUFFER_SELECTED_BIT,
 685         REQ_F_NO_FILE_TABLE_BIT,
 686         REQ_F_WORK_INITIALIZED_BIT,
 687         REQ_F_LTIMEOUT_ACTIVE_BIT,
 688         REQ_F_COMPLETE_INLINE_BIT,
 689
 690         /* not a real bit, just to check we're not overflowing the space */
 691         __REQ_F_LAST_BIT,
 692 };
 693
 694 enum {
             /*
              * Flag masks, each built with BIT() from the same-named *_BIT
              * position in the preceding enum.
              * NOTE(review): presumably stored in io_kiocb.flags; confirm at
              * the use sites.
              */
 695         /* ctx owns file */
 696         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 697         /* drain existing IO first */
 698         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 699         /* linked sqes */
 700         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 701         /* doesn't sever on completion < 0 */
 702         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 703         /* IOSQE_ASYNC */
 704         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 705         /* IOSQE_BUFFER_SELECT */
 706         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 707
 708         /* fail rest of links */
 709         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 710         /* on inflight list */
 711         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 712         /* read/write uses file position */
 713         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 714         /* must not punt to workers */
 715         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 716         /* has or had linked timeout */
 717         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 718         /* regular file */
 719         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 720         /* needs cleanup */
 721         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 722         /* already went through poll handler */
 723         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 724         /* buffer already selected */
 725         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 726         /* doesn't need file table for this request */
 727         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 728         /* io_wq_work is initialized */
 729         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 730         /* linked timeout is active, i.e. prepared by link's head */
 731         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 732         /* completion is deferred through io_comp_state */
 733         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 734 };
 735
 736 struct async_poll {
             /*
              * Poll state that io_kiocb.apoll points to: one io_poll_iocb held
              * by value and a pointer to a second one.
              * NOTE(review): double_poll is presumably NULL unless a second
              * wait queue is in use; confirm at the allocation site.
              */
 737         struct io_poll_iocb     poll;
 738         struct io_poll_iocb     *double_poll;
 739 };
 740
 741 struct io_task_work {
             /*
              * A work-list node plus the function to run. io_kiocb holds one of
              * these in a union with a struct callback_head, so the two share
              * storage.
              */
 742         struct io_wq_work_node  node;
 743         task_work_func_t        func;
 744 };
 745
 746 /*
 747  * NOTE! Each of the iocb union members has the file pointer
 748  * as the first entry in their struct definition. So you can
 749  * access the file pointer through any of the sub-structs,
 750  * or directly as just 'ki_filp' in this struct.
 751  */
 752 struct io_kiocb {
             /*
              * Per-opcode data. All members overlay each other, so only the
              * one matching the request's current use holds valid data.
              */
 753         union {
 754                 struct file             *file;
 755                 struct io_rw            rw;
 756                 struct io_poll_iocb     poll;
 757                 struct io_poll_remove   poll_remove;
 758                 struct io_accept        accept;
 759                 struct io_sync          sync;
 760                 struct io_cancel        cancel;
 761                 struct io_timeout       timeout;
 762                 struct io_timeout_rem   timeout_rem;
 763                 struct io_connect       connect;
 764                 struct io_sr_msg        sr_msg;
 765                 struct io_open          open;
 766                 struct io_close         close;
 767                 struct io_rsrc_update   rsrc_update;
 768                 struct io_fadvise       fadvise;
 769                 struct io_madvise       madvise;
 770                 struct io_epoll         epoll;
 771                 struct io_splice        splice;
 772                 struct io_provide_buf   pbuf;
 773                 struct io_statx         statx;
 774                 struct io_shutdown      shutdown;
 775                 struct io_rename        rename;
 776                 struct io_unlink        unlink;
 777                 /* use only after cleaning per-op data, see io_clean_op() */
 778                 struct io_completion    compl;
 779         };
 780
 781         /* opcode allocated if it needs to store data for async defer */
 782         void                            *async_data;
 783         u8                              opcode;
 784         /* polled IO has completed */
 785         u8                              iopoll_completed;
 786
 787         u16                             buf_index;
 788         u32                             result;
 789
             /* Ring context and task associated with this request. */
 790         struct io_ring_ctx              *ctx;
             /* NOTE(review): presumably holds the REQ_F_* masks; confirm at the use sites. */
 791         unsigned int                    flags;
 792         refcount_t                      refs;
 793         struct task_struct              *task;
 794         u64                             user_data;
 795
             /* NOTE(review): presumably the next request in a link chain; confirm. */
 796         struct io_kiocb                 *link;
 797         struct percpu_ref               *fixed_rsrc_refs;
 798
 799         /*
 800          * 1. used with ctx->iopoll_list with reads/writes
 801          * 2. to track reqs with ->files (see io_op_def::file_table)
 802          */
 803         struct list_head                inflight_entry;
             /* Two task-work representations sharing the same storage. */
 804         union {
 805                 struct io_task_work     io_task_work;
 806                 struct callback_head    task_work;
 807         };
 808         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 809         struct hlist_node               hash_node;
 810         struct async_poll               *apoll;
 811         struct io_wq_work               work;
 812 };
 813
 814 struct io_defer_entry {
             /*
              * List entry pairing a request pointer with a 32-bit sequence
              * number.
              * NOTE(review): presumably queued on io_ring_ctx.defer_list;
              * confirm at the list_add site.
              */
 815         struct list_head        list;
 816         struct io_kiocb         *req;
 817         u32                     seq;
 818 };
 819
 820 struct io_op_def {
             /*
              * Static properties of one opcode. The io_op_defs[] table that
              * follows holds one of these per IORING_OP_* index. Members left
              * out of a designated initializer there default to zero.
              */
 821         /* needs req->file assigned */
 822         unsigned                needs_file : 1;
 823         /* hash wq insertion if file is a regular file */
 824         unsigned                hash_reg_file : 1;
 825         /* unbound wq insertion if file is a non-regular file */
 826         unsigned                unbound_nonreg_file : 1;
 827         /* opcode is not supported by this kernel */
 828         unsigned                not_supported : 1;
 829         /* set if opcode supports polled "wait" */
 830         unsigned                pollin : 1;
 831         unsigned                pollout : 1;
 832         /* op supports buffer selection */
 833         unsigned                buffer_select : 1;
 834         /* must always have async data allocated */
 835         unsigned                needs_async_data : 1;
 836         /* should block plug */
 837         unsigned                plug : 1;
 838         /* size of async data needed, if any */
 839         unsigned short          async_size;
             /* OR of IO_WQ_WORK_* bits, as set by the io_op_defs[] initializers. */
 840         unsigned                work_flags;
 841 };
 842
 843 static const struct io_op_def io_op_defs[] = {
 844         [IORING_OP_NOP] = {},
 845         [IORING_OP_READV] = {
 846                 .needs_file             = 1,
 847                 .unbound_nonreg_file    = 1,
 848                 .pollin                 = 1,
 849                 .buffer_select          = 1,
 850                 .needs_async_data       = 1,
 851                 .plug                   = 1,
 852                 .async_size             = sizeof(struct io_async_rw),
 853                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 854         },
 855         [IORING_OP_WRITEV] = {
 856                 .needs_file             = 1,
 857                 .hash_reg_file          = 1,
 858                 .unbound_nonreg_file    = 1,
 859                 .pollout                = 1,
 860                 .needs_async_data       = 1,
 861                 .plug                   = 1,
 862                 .async_size             = sizeof(struct io_async_rw),
 863                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 864                                                 IO_WQ_WORK_FSIZE,
 865         },
 866         [IORING_OP_FSYNC] = {
 867                 .needs_file             = 1,
 868                 .work_flags             = IO_WQ_WORK_BLKCG,
 869         },
 870         [IORING_OP_READ_FIXED] = {
 871                 .needs_file             = 1,
 872                 .unbound_nonreg_file    = 1,
 873                 .pollin                 = 1,
 874                 .plug                   = 1,
 875                 .async_size             = sizeof(struct io_async_rw),
 876                 .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
 877         },
 878         [IORING_OP_WRITE_FIXED] = {
 879                 .needs_file             = 1,
 880                 .hash_reg_file          = 1,
 881                 .unbound_nonreg_file    = 1,
 882                 .pollout                = 1,
 883                 .plug                   = 1,
 884                 .async_size             = sizeof(struct io_async_rw),
 885                 .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
 886                                                 IO_WQ_WORK_MM,
 887         },
 888         [IORING_OP_POLL_ADD] = {
 889                 .needs_file             = 1,
 890                 .unbound_nonreg_file    = 1,
 891         },
 892         [IORING_OP_POLL_REMOVE] = {},
 893         [IORING_OP_SYNC_FILE_RANGE] = {
 894                 .needs_file             = 1,
 895                 .work_flags             = IO_WQ_WORK_BLKCG,
 896         },
 897         [IORING_OP_SENDMSG] = {
 898                 .needs_file             = 1,
 899                 .unbound_nonreg_file    = 1,
 900                 .pollout                = 1,
 901                 .needs_async_data       = 1,
 902                 .async_size             = sizeof(struct io_async_msghdr),
 903                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 904         },
 905         [IORING_OP_RECVMSG] = {
 906                 .needs_file             = 1,
 907                 .unbound_nonreg_file    = 1,
 908                 .pollin                 = 1,
 909                 .buffer_select          = 1,
 910                 .needs_async_data       = 1,
 911                 .async_size             = sizeof(struct io_async_msghdr),
 912                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 913         },
 914         [IORING_OP_TIMEOUT] = {
 915                 .needs_async_data       = 1,
 916                 .async_size             = sizeof(struct io_timeout_data),
 917                 .work_flags             = IO_WQ_WORK_MM,
 918         },
 919         [IORING_OP_TIMEOUT_REMOVE] = {
 920                 /* used by timeout updates' prep() */
 921                 .work_flags             = IO_WQ_WORK_MM,
 922         },
 923         [IORING_OP_ACCEPT] = {
 924                 .needs_file             = 1,
 925                 .unbound_nonreg_file    = 1,
 926                 .pollin                 = 1,
 927                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
 928         },
 929         [IORING_OP_ASYNC_CANCEL] = {},
 930         [IORING_OP_LINK_TIMEOUT] = {
 931                 .needs_async_data       = 1,
 932                 .async_size             = sizeof(struct io_timeout_data),
 933                 .work_flags             = IO_WQ_WORK_MM,
 934         },
 935         [IORING_OP_CONNECT] = {
 936                 .needs_file             = 1,
 937                 .unbound_nonreg_file    = 1,
 938                 .pollout                = 1,
 939                 .needs_async_data       = 1,
 940                 .async_size             = sizeof(struct io_async_connect),
 941                 .work_flags             = IO_WQ_WORK_MM,
 942         },
 943         [IORING_OP_FALLOCATE] = {
 944                 .needs_file             = 1,
 945                 .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
 946         },
 947         [IORING_OP_OPENAT] = {
 948                 .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
 949                                                 IO_WQ_WORK_FS | IO_WQ_WORK_MM,
 950         },
 951         [IORING_OP_CLOSE] = {
 952                 .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
 953         },
 954         [IORING_OP_FILES_UPDATE] = {
 955                 .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
 956         },
 957         [IORING_OP_STATX] = {
 958                 .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
 959                                                 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 960         },
 961         [IORING_OP_READ] = {
 962                 .needs_file             = 1,
 963                 .unbound_nonreg_file    = 1,
 964                 .pollin                 = 1,
 965                 .buffer_select          = 1,
 966                 .plug                   = 1,
 967                 .async_size             = sizeof(struct io_async_rw),
 968                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 969         },
 970         [IORING_OP_WRITE] = {
 971                 .needs_file             = 1,
 972                 .unbound_nonreg_file    = 1,
 973                 .pollout                = 1,
 974                 .plug                   = 1,
 975                 .async_size             = sizeof(struct io_async_rw),
 976                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 977                                                 IO_WQ_WORK_FSIZE,
 978         },
 979         [IORING_OP_FADVISE] = {
 980                 .needs_file             = 1,
 981                 .work_flags             = IO_WQ_WORK_BLKCG,
 982         },
 983         [IORING_OP_MADVISE] = {
 984                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 985         },
 986         [IORING_OP_SEND] = {
 987                 .needs_file             = 1,
 988                 .unbound_nonreg_file    = 1,
 989                 .pollout                = 1,
 990                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 991         },
 992         [IORING_OP_RECV] = {
 993                 .needs_file             = 1,
 994                 .unbound_nonreg_file    = 1,
 995                 .pollin                 = 1,
 996                 .buffer_select          = 1,
 997                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 998         },
 999         [IORING_OP_OPENAT2] = {
1000                 .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
1001                                                 IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
1002         },
1003         [IORING_OP_EPOLL_CTL] = {
1004                 .unbound_nonreg_file    = 1,
1005                 .work_flags             = IO_WQ_WORK_FILES,
1006         },
1007         [IORING_OP_SPLICE] = {
1008                 .needs_file             = 1,
1009                 .hash_reg_file          = 1,
1010                 .unbound_nonreg_file    = 1,
1011                 .work_flags             = IO_WQ_WORK_BLKCG,
1012         },
1013         [IORING_OP_PROVIDE_BUFFERS] = {},
1014         [IORING_OP_REMOVE_BUFFERS] = {},
1015         [IORING_OP_TEE] = {
1016                 .needs_file             = 1,
1017                 .hash_reg_file          = 1,
1018                 .unbound_nonreg_file    = 1,
1019         },
1020         [IORING_OP_SHUTDOWN] = {
1021                 .needs_file             = 1,
1022         },
1023         [IORING_OP_RENAMEAT] = {
1024                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
1025                                                 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
1026         },
1027         [IORING_OP_UNLINKAT] = {
1028                 .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
1029                                                 IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
1030         },
1031 };
1032
1033 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1034                                          struct task_struct *task,
1035                                          struct files_struct *files);
1036 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
1037 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
1038                         struct io_ring_ctx *ctx);
1039 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
1040                                      struct fixed_rsrc_ref_node *ref_node);
1041
1042 static bool io_rw_reissue(struct io_kiocb *req);
1043 static void io_cqring_fill_event(struct io_kiocb *req, long res);
1044 static void io_put_req(struct io_kiocb *req);
1045 static void io_put_req_deferred(struct io_kiocb *req, int nr);
1046 static void io_double_put_req(struct io_kiocb *req);
1047 static void io_dismantle_req(struct io_kiocb *req);
1048 static void io_put_task(struct task_struct *task, int nr);
1049 static void io_queue_next(struct io_kiocb *req);
1050 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
1051 static void __io_queue_linked_timeout(struct io_kiocb *req);
1052 static void io_queue_linked_timeout(struct io_kiocb *req);
1053 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
1054                                  struct io_uring_rsrc_update *ip,
1055                                  unsigned nr_args);
1056 static void __io_clean_op(struct io_kiocb *req);
1057 static struct file *io_file_get(struct io_submit_state *state,
1058                                 struct io_kiocb *req, int fd, bool fixed);
1059 static void __io_queue_sqe(struct io_kiocb *req);
1060 static void io_rsrc_put_work(struct work_struct *work);
1061
1062 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
1063                            struct iov_iter *iter, bool needs_lock);
1064 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
1065                              const struct iovec *fast_iov,
1066                              struct iov_iter *iter, bool force);
1067 static void io_req_task_queue(struct io_kiocb *req);
1068 static void io_submit_flush_completions(struct io_comp_state *cs,
1069                                         struct io_ring_ctx *ctx);
1070
1071 static struct kmem_cache *req_cachep;
1072
1073 static const struct file_operations io_uring_fops;
1074
1075 struct sock *io_uring_get_socket(struct file *file)
1076 {
1077 #if defined(CONFIG_UNIX)
1078         if (file->f_op == &io_uring_fops) {
1079                 struct io_ring_ctx *ctx = file->private_data;
1080
1081                 return ctx->ring_sock->sk;
1082         }
1083 #endif
1084         return NULL;
1085 }
1086 EXPORT_SYMBOL(io_uring_get_socket);
1087
/* Walk @pos over @head and every request chained behind it via ->link. */
#define io_for_each_link(pos, head)                             \
        for ((pos) = (head); (pos); (pos) = (pos)->link)
1090
1091 static inline void io_clean_op(struct io_kiocb *req)
1092 {
1093         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
1094                 __io_clean_op(req);
1095 }
1096
1097 static inline void io_set_resource_node(struct io_kiocb *req)
1098 {
1099         struct io_ring_ctx *ctx = req->ctx;
1100
1101         if (!req->fixed_rsrc_refs) {
1102                 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1103                 percpu_ref_get(req->fixed_rsrc_refs);
1104         }
1105 }
1106
1107 static bool io_match_task(struct io_kiocb *head,
1108                           struct task_struct *task,
1109                           struct files_struct *files)
1110 {
1111         struct io_kiocb *req;
1112
1113         if (task && head->task != task) {
1114                 /* in terms of cancelation, always match if req task is dead */
1115                 if (head->task->flags & PF_EXITING)
1116                         return true;
1117                 return false;
1118         }
1119         if (!files)
1120                 return true;
1121
1122         io_for_each_link(req, head) {
1123                 if (!(req->flags & REQ_F_WORK_INITIALIZED))
1124                         continue;
1125                 if (req->file && req->file->f_op == &io_uring_fops)
1126                         return true;
1127                 if ((req->work.flags & IO_WQ_WORK_FILES) &&
1128                     req->work.identity->files == files)
1129                         return true;
1130         }
1131         return false;
1132 }
1133
1134 static void io_sq_thread_drop_mm_files(void)
1135 {
1136         struct files_struct *files = current->files;
1137         struct mm_struct *mm = current->mm;
1138
1139         if (mm) {
1140                 kthread_unuse_mm(mm);
1141                 mmput(mm);
1142                 current->mm = NULL;
1143         }
1144         if (files) {
1145                 struct nsproxy *nsproxy = current->nsproxy;
1146
1147                 task_lock(current);
1148                 current->files = NULL;
1149                 current->nsproxy = NULL;
1150                 task_unlock(current);
1151                 put_files_struct(files);
1152                 put_nsproxy(nsproxy);
1153         }
1154 }
1155
1156 static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
1157 {
1158         if (!current->files) {
1159                 struct files_struct *files;
1160                 struct nsproxy *nsproxy;
1161
1162                 task_lock(ctx->sqo_task);
1163                 files = ctx->sqo_task->files;
1164                 if (!files) {
1165                         task_unlock(ctx->sqo_task);
1166                         return -EOWNERDEAD;
1167                 }
1168                 atomic_inc(&files->count);
1169                 get_nsproxy(ctx->sqo_task->nsproxy);
1170                 nsproxy = ctx->sqo_task->nsproxy;
1171                 task_unlock(ctx->sqo_task);
1172
1173                 task_lock(current);
1174                 current->files = files;
1175                 current->nsproxy = nsproxy;
1176                 task_unlock(current);
1177         }
1178         return 0;
1179 }
1180
1181 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
1182 {
1183         struct mm_struct *mm;
1184
1185         if (current->mm)
1186                 return 0;
1187
1188         task_lock(ctx->sqo_task);
1189         mm = ctx->sqo_task->mm;
1190         if (unlikely(!mm || !mmget_not_zero(mm)))
1191                 mm = NULL;
1192         task_unlock(ctx->sqo_task);
1193
1194         if (mm) {
1195                 kthread_use_mm(mm);
1196                 return 0;
1197         }
1198
1199         return -EFAULT;
1200 }
1201
1202 static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
1203                                            struct io_kiocb *req)
1204 {
1205         const struct io_op_def *def = &io_op_defs[req->opcode];
1206         int ret;
1207
1208         if (def->work_flags & IO_WQ_WORK_MM) {
1209                 ret = __io_sq_thread_acquire_mm(ctx);
1210                 if (unlikely(ret))
1211                         return ret;
1212         }
1213
1214         if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
1215                 ret = __io_sq_thread_acquire_files(ctx);
1216                 if (unlikely(ret))
1217                         return ret;
1218         }
1219
1220         return 0;
1221 }
1222
1223 static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
1224                                                 struct io_kiocb *req)
1225 {
1226         if (!(ctx->flags & IORING_SETUP_SQPOLL))
1227                 return 0;
1228         return __io_sq_thread_acquire_mm_files(ctx, req);
1229 }
1230
/*
 * Associate the current kthread with @ctx's blkcg css if it differs from
 * the one cached in *@cur_css, and update the cache.  Compiles to nothing
 * without CONFIG_BLK_CGROUP.
 */
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
                                         struct cgroup_subsys_state **cur_css)
{
#ifdef CONFIG_BLK_CGROUP
        struct cgroup_subsys_state *wanted = ctx->sqo_blkcg_css;

        if (*cur_css == wanted)
                return;

        /* puts the old one when swapping */
        kthread_associate_blkcg(wanted);
        *cur_css = wanted;
#endif
}
1243
/*
 * Clear the current kthread's blkcg association by associating it with
 * NULL.  Compiles to nothing without CONFIG_BLK_CGROUP.
 */
static void io_sq_thread_unassociate_blkcg(void)
{
#if defined(CONFIG_BLK_CGROUP)
        kthread_associate_blkcg(NULL);
#endif
}
1250
1251 static inline void req_set_fail_links(struct io_kiocb *req)
1252 {
1253         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1254                 req->flags |= REQ_F_FAIL_LINK;
1255 }
1256
1257 /*
1258  * None of these are dereferenced, they are simply used to check if any of
1259  * them have changed. If we're under current and check they are still the
1260  * same, we're fine to grab references to them for actual out-of-line use.
1261  */
1262 static void io_init_identity(struct io_identity *id)
1263 {
1264         id->files = current->files;
1265         id->mm = current->mm;
1266 #ifdef CONFIG_BLK_CGROUP
1267         rcu_read_lock();
1268         id->blkcg_css = blkcg_css();
1269         rcu_read_unlock();
1270 #endif
1271         id->creds = current_cred();
1272         id->nsproxy = current->nsproxy;
1273         id->fs = current->fs;
1274         id->fsize = rlimit(RLIMIT_FSIZE);
1275 #ifdef CONFIG_AUDIT
1276         id->loginuid = current->loginuid;
1277         id->sessionid = current->sessionid;
1278 #endif
1279         refcount_set(&id->count, 1);
1280 }
1281
1282 static inline void __io_req_init_async(struct io_kiocb *req)
1283 {
1284         memset(&req->work, 0, sizeof(req->work));
1285         req->flags |= REQ_F_WORK_INITIALIZED;
1286 }
1287
1288 /*
1289  * Note: must call io_req_init_async() for the first time you
1290  * touch any members of io_wq_work.
1291  */
1292 static inline void io_req_init_async(struct io_kiocb *req)
1293 {
1294         struct io_uring_task *tctx = current->io_uring;
1295
1296         if (req->flags & REQ_F_WORK_INITIALIZED)
1297                 return;
1298
1299         __io_req_init_async(req);
1300
1301         /* Grab a ref if this isn't our static identity */
1302         req->work.identity = tctx->identity;
1303         if (tctx->identity != &tctx->__identity)
1304                 refcount_inc(&req->work.identity->count);
1305 }
1306
1307 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1308 {
1309         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1310
1311         complete(&ctx->ref_comp);
1312 }
1313
1314 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1315 {
1316         return !req->timeout.off;
1317 }
1318
1319 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1320 {
1321         struct io_ring_ctx *ctx;
1322         int hash_bits;
1323
1324         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1325         if (!ctx)
1326                 return NULL;
1327
1328         /*
1329          * Use 5 bits less than the max cq entries, that should give us around
1330          * 32 entries per hash list if totally full and uniformly spread.
1331          */
1332         hash_bits = ilog2(p->cq_entries);
1333         hash_bits -= 5;
1334         if (hash_bits <= 0)
1335                 hash_bits = 1;
1336         ctx->cancel_hash_bits = hash_bits;
1337         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1338                                         GFP_KERNEL);
1339         if (!ctx->cancel_hash)
1340                 goto err;
1341         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1342
1343         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1344                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1345                 goto err;
1346
1347         ctx->flags = p->flags;
1348         init_waitqueue_head(&ctx->sqo_sq_wait);
1349         INIT_LIST_HEAD(&ctx->sqd_list);
1350         init_waitqueue_head(&ctx->cq_wait);
1351         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1352         init_completion(&ctx->ref_comp);
1353         init_completion(&ctx->sq_thread_comp);
1354         idr_init(&ctx->io_buffer_idr);
1355         idr_init(&ctx->personality_idr);
1356         mutex_init(&ctx->uring_lock);
1357         init_waitqueue_head(&ctx->wait);
1358         spin_lock_init(&ctx->completion_lock);
1359         INIT_LIST_HEAD(&ctx->iopoll_list);
1360         INIT_LIST_HEAD(&ctx->defer_list);
1361         INIT_LIST_HEAD(&ctx->timeout_list);
1362         spin_lock_init(&ctx->inflight_lock);
1363         INIT_LIST_HEAD(&ctx->inflight_list);
1364         spin_lock_init(&ctx->rsrc_ref_lock);
1365         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1366         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1367         init_llist_head(&ctx->rsrc_put_llist);
1368         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1369         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
1370         return ctx;
1371 err:
1372         kfree(ctx->cancel_hash);
1373         kfree(ctx);
1374         return NULL;
1375 }
1376
1377 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1378 {
1379         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1380                 struct io_ring_ctx *ctx = req->ctx;
1381
1382                 return seq != ctx->cached_cq_tail
1383                                 + READ_ONCE(ctx->cached_cq_overflow);
1384         }
1385
1386         return false;
1387 }
1388
1389 static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
1390 {
1391         if (req->work.identity == &tctx->__identity)
1392                 return;
1393         if (refcount_dec_and_test(&req->work.identity->count))
1394                 kfree(req->work.identity);
1395 }
1396
1397 static void io_req_clean_work(struct io_kiocb *req)
1398 {
1399         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1400                 return;
1401
1402         if (req->work.flags & IO_WQ_WORK_MM)
1403                 mmdrop(req->work.identity->mm);
1404 #ifdef CONFIG_BLK_CGROUP
1405         if (req->work.flags & IO_WQ_WORK_BLKCG)
1406                 css_put(req->work.identity->blkcg_css);
1407 #endif
1408         if (req->work.flags & IO_WQ_WORK_CREDS)
1409                 put_cred(req->work.identity->creds);
1410         if (req->work.flags & IO_WQ_WORK_FS) {
1411                 struct fs_struct *fs = req->work.identity->fs;
1412
1413                 spin_lock(&req->work.identity->fs->lock);
1414                 if (--fs->users)
1415                         fs = NULL;
1416                 spin_unlock(&req->work.identity->fs->lock);
1417                 if (fs)
1418                         free_fs_struct(fs);
1419         }
1420         if (req->work.flags & IO_WQ_WORK_FILES) {
1421                 put_files_struct(req->work.identity->files);
1422                 put_nsproxy(req->work.identity->nsproxy);
1423         }
1424         if (req->flags & REQ_F_INFLIGHT) {
1425                 struct io_ring_ctx *ctx = req->ctx;
1426                 struct io_uring_task *tctx = req->task->io_uring;
1427                 unsigned long flags;
1428
1429                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1430                 list_del(&req->inflight_entry);
1431                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1432                 req->flags &= ~REQ_F_INFLIGHT;
1433                 if (atomic_read(&tctx->in_idle))
1434                         wake_up(&tctx->wait);
1435         }
1436
1437         req->flags &= ~REQ_F_WORK_INITIALIZED;
1438         req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
1439                              IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
1440         io_put_identity(req->task->io_uring, req);
1441 }
1442
1443 /*
1444  * Create a private copy of io_identity, since some fields don't match
1445  * the current context.
1446  */
1447 static bool io_identity_cow(struct io_kiocb *req)
1448 {
1449         struct io_uring_task *tctx = current->io_uring;
1450         const struct cred *creds = NULL;
1451         struct io_identity *id;
1452
1453         if (req->work.flags & IO_WQ_WORK_CREDS)
1454                 creds = req->work.identity->creds;
1455
1456         id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
1457         if (unlikely(!id)) {
1458                 req->work.flags |= IO_WQ_WORK_CANCEL;
1459                 return false;
1460         }
1461
1462         /*
1463          * We can safely just re-init the creds we copied  Either the field
1464          * matches the current one, or we haven't grabbed it yet. The only
1465          * exception is ->creds, through registered personalities, so handle
1466          * that one separately.
1467          */
1468         io_init_identity(id);
1469         if (creds)
1470                 id->creds = creds;
1471
1472         /* add one for this request */
1473         refcount_inc(&id->count);
1474
1475         /* drop tctx and req identity references, if needed */
1476         if (tctx->identity != &tctx->__identity &&
1477             refcount_dec_and_test(&tctx->identity->count))
1478                 kfree(tctx->identity);
1479         if (req->work.identity != &tctx->__identity &&
1480             refcount_dec_and_test(&req->work.identity->count))
1481                 kfree(req->work.identity);
1482
1483         req->work.identity = id;
1484         tctx->identity = id;
1485         return true;
1486 }
1487
1488 static void io_req_track_inflight(struct io_kiocb *req)
1489 {
1490         struct io_ring_ctx *ctx = req->ctx;
1491
1492         if (!(req->flags & REQ_F_INFLIGHT)) {
1493                 io_req_init_async(req);
1494                 req->flags |= REQ_F_INFLIGHT;
1495
1496                 spin_lock_irq(&ctx->inflight_lock);
1497                 list_add(&req->inflight_entry, &ctx->inflight_list);
1498                 spin_unlock_irq(&ctx->inflight_lock);
1499         }
1500 }
1501
1502 static bool io_grab_identity(struct io_kiocb *req)
1503 {
1504         const struct io_op_def *def = &io_op_defs[req->opcode];
1505         struct io_identity *id = req->work.identity;
1506
1507         if (def->work_flags & IO_WQ_WORK_FSIZE) {
1508                 if (id->fsize != rlimit(RLIMIT_FSIZE))
1509                         return false;
1510                 req->work.flags |= IO_WQ_WORK_FSIZE;
1511         }
1512 #ifdef CONFIG_BLK_CGROUP
1513         if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
1514             (def->work_flags & IO_WQ_WORK_BLKCG)) {
1515                 rcu_read_lock();
1516                 if (id->blkcg_css != blkcg_css()) {
1517                         rcu_read_unlock();
1518                         return false;
1519                 }
1520                 /*
1521                  * This should be rare, either the cgroup is dying or the task
1522                  * is moving cgroups. Just punt to root for the handful of ios.
1523                  */
1524                 if (css_tryget_online(id->blkcg_css))
1525                         req->work.flags |= IO_WQ_WORK_BLKCG;
1526                 rcu_read_unlock();
1527         }
1528 #endif
1529         if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
1530                 if (id->creds != current_cred())
1531                         return false;
1532                 get_cred(id->creds);
1533                 req->work.flags |= IO_WQ_WORK_CREDS;
1534         }
1535 #ifdef CONFIG_AUDIT
1536         if (!uid_eq(current->loginuid, id->loginuid) ||
1537             current->sessionid != id->sessionid)
1538                 return false;
1539 #endif
1540         if (!(req->work.flags & IO_WQ_WORK_FS) &&
1541             (def->work_flags & IO_WQ_WORK_FS)) {
1542                 if (current->fs != id->fs)
1543                         return false;
1544                 spin_lock(&id->fs->lock);
1545                 if (!id->fs->in_exec) {
1546                         id->fs->users++;
1547                         req->work.flags |= IO_WQ_WORK_FS;
1548                 } else {
1549                         req->work.flags |= IO_WQ_WORK_CANCEL;
1550                 }
1551                 spin_unlock(&current->fs->lock);
1552         }
1553         if (!(req->work.flags & IO_WQ_WORK_FILES) &&
1554             (def->work_flags & IO_WQ_WORK_FILES) &&
1555             !(req->flags & REQ_F_NO_FILE_TABLE)) {
1556                 if (id->files != current->files ||
1557                     id->nsproxy != current->nsproxy)
1558                         return false;
1559                 atomic_inc(&id->files->count);
1560                 get_nsproxy(id->nsproxy);
1561                 req->work.flags |= IO_WQ_WORK_FILES;
1562                 io_req_track_inflight(req);
1563         }
1564         if (!(req->work.flags & IO_WQ_WORK_MM) &&
1565             (def->work_flags & IO_WQ_WORK_MM)) {
1566                 if (id->mm != current->mm)
1567                         return false;
1568                 mmgrab(id->mm);
1569                 req->work.flags |= IO_WQ_WORK_MM;
1570         }
1571
1572         return true;
1573 }
1574
1575 static void io_prep_async_work(struct io_kiocb *req)
1576 {
1577         const struct io_op_def *def = &io_op_defs[req->opcode];
1578         struct io_ring_ctx *ctx = req->ctx;
1579
1580         io_req_init_async(req);
1581
1582         if (req->flags & REQ_F_FORCE_ASYNC)
1583                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1584
1585         if (req->flags & REQ_F_ISREG) {
1586                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1587                         io_wq_hash_work(&req->work, file_inode(req->file));
1588         } else {
1589                 if (def->unbound_nonreg_file)
1590                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1591         }
1592
1593         /* if we fail grabbing identity, we must COW, regrab, and retry */
1594         if (io_grab_identity(req))
1595                 return;
1596
1597         if (!io_identity_cow(req))
1598                 return;
1599
1600         /* can't fail at this point */
1601         if (!io_grab_identity(req))
1602                 WARN_ON(1);
1603 }
1604
1605 static void io_prep_async_link(struct io_kiocb *req)
1606 {
1607         struct io_kiocb *cur;
1608
1609         io_for_each_link(cur, req)
1610                 io_prep_async_work(cur);
1611 }
1612
1613 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
1614 {
1615         struct io_ring_ctx *ctx = req->ctx;
1616         struct io_kiocb *link = io_prep_linked_timeout(req);
1617
1618         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1619                                         &req->work, req->flags);
1620         io_wq_enqueue(ctx->io_wq, &req->work);
1621         return link;
1622 }
1623
/*
 * Punt @req to io-wq. ->work of the whole link chain is initialised before
 * the head is queued; a linked timeout, if one was prepared, is queued
 * afterwards.
 */
static void io_queue_async_work(struct io_kiocb *req)
{
	struct io_kiocb *timeout;

	io_prep_async_link(req);

	timeout = __io_queue_async_work(req);
	if (!timeout)
		return;

	io_queue_linked_timeout(timeout);
}
1635
1636 static void io_kill_timeout(struct io_kiocb *req)
1637 {
1638         struct io_timeout_data *io = req->async_data;
1639         int ret;
1640
1641         ret = hrtimer_try_to_cancel(&io->timer);
1642         if (ret != -1) {
1643                 atomic_set(&req->ctx->cq_timeouts,
1644                         atomic_read(&req->ctx->cq_timeouts) + 1);
1645                 list_del_init(&req->timeout.list);
1646                 io_cqring_fill_event(req, 0);
1647                 io_put_req_deferred(req, 1);
1648         }
1649 }
1650
1651 /*
1652  * Returns true if we found and killed one or more timeouts
1653  */
1654 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1655                              struct files_struct *files)
1656 {
1657         struct io_kiocb *req, *tmp;
1658         int canceled = 0;
1659
1660         spin_lock_irq(&ctx->completion_lock);
1661         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1662                 if (io_match_task(req, tsk, files)) {
1663                         io_kill_timeout(req);
1664                         canceled++;
1665                 }
1666         }
1667         spin_unlock_irq(&ctx->completion_lock);
1668         return canceled != 0;
1669 }
1670
1671 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1672 {
1673         do {
1674                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1675                                                 struct io_defer_entry, list);
1676
1677                 if (req_need_defer(de->req, de->seq))
1678                         break;
1679                 list_del_init(&de->list);
1680                 io_req_task_queue(de->req);
1681                 kfree(de);
1682         } while (!list_empty(&ctx->defer_list));
1683 }
1684
1685 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1686 {
1687         u32 seq;
1688
1689         if (list_empty(&ctx->timeout_list))
1690                 return;
1691
1692         seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1693
1694         do {
1695                 u32 events_needed, events_got;
1696                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1697                                                 struct io_kiocb, timeout.list);
1698
1699                 if (io_is_timeout_noseq(req))
1700                         break;
1701
1702                 /*
1703                  * Since seq can easily wrap around over time, subtract
1704                  * the last seq at which timeouts were flushed before comparing.
1705                  * Assuming not more than 2^31-1 events have happened since,
1706                  * these subtractions won't have wrapped, so we can check if
1707                  * target is in [last_seq, current_seq] by comparing the two.
1708                  */
1709                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1710                 events_got = seq - ctx->cq_last_tm_flush;
1711                 if (events_got < events_needed)
1712                         break;
1713
1714                 list_del_init(&req->timeout.list);
1715                 io_kill_timeout(req);
1716         } while (!list_empty(&ctx->timeout_list));
1717
1718         ctx->cq_last_tm_flush = seq;
1719 }
1720
1721 static void io_commit_cqring(struct io_ring_ctx *ctx)
1722 {
1723         io_flush_timeouts(ctx);
1724
1725         /* order cqe stores with ring update */
1726         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1727
1728         if (unlikely(!list_empty(&ctx->defer_list)))
1729                 __io_queue_deferred(ctx);
1730 }
1731
1732 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1733 {
1734         struct io_rings *r = ctx->rings;
1735
1736         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1737 }
1738
1739 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1740 {
1741         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1742 }
1743
1744 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1745 {
1746         struct io_rings *rings = ctx->rings;
1747         unsigned tail;
1748
1749         /*
1750          * writes to the cq entry need to come after reading head; the
1751          * control dependency is enough as we're using WRITE_ONCE to
1752          * fill the cq entry
1753          */
1754         if (__io_cqring_events(ctx) == rings->cq_ring_entries)
1755                 return NULL;
1756
1757         tail = ctx->cached_cq_tail++;
1758         return &rings->cqes[tail & ctx->cq_mask];
1759 }
1760
1761 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1762 {
1763         if (!ctx->cq_ev_fd)
1764                 return false;
1765         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1766                 return false;
1767         if (!ctx->eventfd_async)
1768                 return true;
1769         return io_wq_current_is_worker();
1770 }
1771
1772 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1773 {
1774         /* see waitqueue_active() comment */
1775         smp_mb();
1776
1777         if (waitqueue_active(&ctx->wait))
1778                 wake_up(&ctx->wait);
1779         if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1780                 wake_up(&ctx->sq_data->wait);
1781         if (io_should_trigger_evfd(ctx))
1782                 eventfd_signal(ctx->cq_ev_fd, 1);
1783         if (waitqueue_active(&ctx->cq_wait)) {
1784                 wake_up_interruptible(&ctx->cq_wait);
1785                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1786         }
1787 }
1788
1789 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1790 {
1791         /* see waitqueue_active() comment */
1792         smp_mb();
1793
1794         if (ctx->flags & IORING_SETUP_SQPOLL) {
1795                 if (waitqueue_active(&ctx->wait))
1796                         wake_up(&ctx->wait);
1797         }
1798         if (io_should_trigger_evfd(ctx))
1799                 eventfd_signal(ctx->cq_ev_fd, 1);
1800         if (waitqueue_active(&ctx->cq_wait)) {
1801                 wake_up_interruptible(&ctx->cq_wait);
1802                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1803         }
1804 }
1805
1806 /* Returns true if there are no backlogged entries after the flush */
1807 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1808                                        struct task_struct *tsk,
1809                                        struct files_struct *files)
1810 {
1811         struct io_rings *rings = ctx->rings;
1812         struct io_kiocb *req, *tmp;
1813         struct io_uring_cqe *cqe;
1814         unsigned long flags;
1815         bool all_flushed, posted;
1816         LIST_HEAD(list);
1817
1818         if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
1819                 return false;
1820
1821         posted = false;
1822         spin_lock_irqsave(&ctx->completion_lock, flags);
1823         list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1824                 if (!io_match_task(req, tsk, files))
1825                         continue;
1826
1827                 cqe = io_get_cqring(ctx);
1828                 if (!cqe && !force)
1829                         break;
1830
1831                 list_move(&req->compl.list, &list);
1832                 if (cqe) {
1833                         WRITE_ONCE(cqe->user_data, req->user_data);
1834                         WRITE_ONCE(cqe->res, req->result);
1835                         WRITE_ONCE(cqe->flags, req->compl.cflags);
1836                 } else {
1837                         ctx->cached_cq_overflow++;
1838                         WRITE_ONCE(ctx->rings->cq_overflow,
1839                                    ctx->cached_cq_overflow);
1840                 }
1841                 posted = true;
1842         }
1843
1844         all_flushed = list_empty(&ctx->cq_overflow_list);
1845         if (all_flushed) {
1846                 clear_bit(0, &ctx->sq_check_overflow);
1847                 clear_bit(0, &ctx->cq_check_overflow);
1848                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1849         }
1850
1851         if (posted)
1852                 io_commit_cqring(ctx);
1853         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1854         if (posted)
1855                 io_cqring_ev_posted(ctx);
1856
1857         while (!list_empty(&list)) {
1858                 req = list_first_entry(&list, struct io_kiocb, compl.list);
1859                 list_del(&req->compl.list);
1860                 io_put_req(req);
1861         }
1862
1863         return all_flushed;
1864 }
1865
1866 static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1867                                      struct task_struct *tsk,
1868                                      struct files_struct *files)
1869 {
1870         if (test_bit(0, &ctx->cq_check_overflow)) {
1871                 /* iopoll syncs against uring_lock, not completion_lock */
1872                 if (ctx->flags & IORING_SETUP_IOPOLL)
1873                         mutex_lock(&ctx->uring_lock);
1874                 __io_cqring_overflow_flush(ctx, force, tsk, files);
1875                 if (ctx->flags & IORING_SETUP_IOPOLL)
1876                         mutex_unlock(&ctx->uring_lock);
1877         }
1878 }
1879
1880 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1881 {
1882         struct io_ring_ctx *ctx = req->ctx;
1883         struct io_uring_cqe *cqe;
1884
1885         trace_io_uring_complete(ctx, req->user_data, res);
1886
1887         /*
1888          * If we can't get a cq entry, userspace overflowed the
1889          * submission (by quite a lot). Increment the overflow count in
1890          * the ring.
1891          */
1892         cqe = io_get_cqring(ctx);
1893         if (likely(cqe)) {
1894                 WRITE_ONCE(cqe->user_data, req->user_data);
1895                 WRITE_ONCE(cqe->res, res);
1896                 WRITE_ONCE(cqe->flags, cflags);
1897         } else if (ctx->cq_overflow_flushed ||
1898                    atomic_read(&req->task->io_uring->in_idle)) {
1899                 /*
1900                  * If we're in ring overflow flush mode, or in task cancel mode,
1901                  * then we cannot store the request for later flushing, we need
1902                  * to drop it on the floor.
1903                  */
1904                 ctx->cached_cq_overflow++;
1905                 WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1906         } else {
1907                 if (list_empty(&ctx->cq_overflow_list)) {
1908                         set_bit(0, &ctx->sq_check_overflow);
1909                         set_bit(0, &ctx->cq_check_overflow);
1910                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1911                 }
1912                 io_clean_op(req);
1913                 req->result = res;
1914                 req->compl.cflags = cflags;
1915                 refcount_inc(&req->refs);
1916                 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1917         }
1918 }
1919
/* Post a completion for @req with result @res and no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	const long no_cflags = 0;

	__io_cqring_fill_event(req, res, no_cflags);
}
1924
1925 static inline void io_req_complete_post(struct io_kiocb *req, long res,
1926                                         unsigned int cflags)
1927 {
1928         struct io_ring_ctx *ctx = req->ctx;
1929         unsigned long flags;
1930
1931         spin_lock_irqsave(&ctx->completion_lock, flags);
1932         __io_cqring_fill_event(req, res, cflags);
1933         io_commit_cqring(ctx);
1934         /*
1935          * If we're the last reference to this request, add to our locked
1936          * free_list cache.
1937          */
1938         if (refcount_dec_and_test(&req->refs)) {
1939                 struct io_comp_state *cs = &ctx->submit_state.comp;
1940
1941                 io_dismantle_req(req);
1942                 io_put_task(req->task, 1);
1943                 list_add(&req->compl.list, &cs->locked_free_list);
1944                 cs->locked_free_nr++;
1945         } else
1946                 req = NULL;
1947         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1948
1949         io_cqring_ev_posted(ctx);
1950         if (req) {
1951                 io_queue_next(req);
1952                 percpu_ref_put(&ctx->refs);
1953         }
1954 }
1955
1956 static void io_req_complete_state(struct io_kiocb *req, long res,
1957                                   unsigned int cflags)
1958 {
1959         io_clean_op(req);
1960         req->result = res;
1961         req->compl.cflags = cflags;
1962         req->flags |= REQ_F_COMPLETE_INLINE;
1963 }
1964
1965 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1966                                      long res, unsigned cflags)
1967 {
1968         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1969                 io_req_complete_state(req, res, cflags);
1970         else
1971                 io_req_complete_post(req, res, cflags);
1972 }
1973
/* Complete @req with @res: no issue flags and no CQE flags. */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
	const unsigned int issue_flags = 0, cflags = 0;

	__io_req_complete(req, issue_flags, res, cflags);
}
1978
1979 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1980 {
1981         struct io_submit_state *state = &ctx->submit_state;
1982         struct io_comp_state *cs = &state->comp;
1983         struct io_kiocb *req = NULL;
1984
1985         /*
1986          * If we have more than a batch's worth of requests in our IRQ side
1987          * locked cache, grab the lock and move them over to our submission
1988          * side cache.
1989          */
1990         if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
1991                 spin_lock_irq(&ctx->completion_lock);
1992                 list_splice_init(&cs->locked_free_list, &cs->free_list);
1993                 cs->locked_free_nr = 0;
1994                 spin_unlock_irq(&ctx->completion_lock);
1995         }
1996
1997         while (!list_empty(&cs->free_list)) {
1998                 req = list_first_entry(&cs->free_list, struct io_kiocb,
1999                                         compl.list);
2000                 list_del(&req->compl.list);
2001                 state->reqs[state->free_reqs++] = req;
2002                 if (state->free_reqs == ARRAY_SIZE(state->reqs))
2003                         break;
2004         }
2005
2006         return req != NULL;
2007 }
2008
2009 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2010 {
2011         struct io_submit_state *state = &ctx->submit_state;
2012
2013         BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
2014
2015         if (!state->free_reqs) {
2016                 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2017                 int ret;
2018
2019                 if (io_flush_cached_reqs(ctx))
2020                         goto got_req;
2021
2022                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
2023                                             state->reqs);
2024
2025                 /*
2026                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
2027                  * retry single alloc to be on the safe side.
2028                  */
2029                 if (unlikely(ret <= 0)) {
2030                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2031                         if (!state->reqs[0])
2032                                 return NULL;
2033                         ret = 1;
2034                 }
2035                 state->free_reqs = ret;
2036         }
2037 got_req:
2038         state->free_reqs--;
2039         return state->reqs[state->free_reqs];
2040 }
2041
2042 static inline void io_put_file(struct io_kiocb *req, struct file *file,
2043                           bool fixed)
2044 {
2045         if (!fixed)
2046                 fput(file);
2047 }
2048
2049 static void io_dismantle_req(struct io_kiocb *req)
2050 {
2051         io_clean_op(req);
2052
2053         if (req->async_data)
2054                 kfree(req->async_data);
2055         if (req->file)
2056                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
2057         if (req->fixed_rsrc_refs)
2058                 percpu_ref_put(req->fixed_rsrc_refs);
2059         io_req_clean_work(req);
2060 }
2061
2062 static inline void io_put_task(struct task_struct *task, int nr)
2063 {
2064         struct io_uring_task *tctx = task->io_uring;
2065
2066         percpu_counter_sub(&tctx->inflight, nr);
2067         if (unlikely(atomic_read(&tctx->in_idle)))
2068                 wake_up(&tctx->wait);
2069         put_task_struct_many(task, nr);
2070 }
2071
2072 static void __io_free_req(struct io_kiocb *req)
2073 {
2074         struct io_ring_ctx *ctx = req->ctx;
2075
2076         io_dismantle_req(req);
2077         io_put_task(req->task, 1);
2078
2079         kmem_cache_free(req_cachep, req);
2080         percpu_ref_put(&ctx->refs);
2081 }
2082
2083 static inline void io_remove_next_linked(struct io_kiocb *req)
2084 {
2085         struct io_kiocb *nxt = req->link;
2086
2087         req->link = nxt->link;
2088         nxt->link = NULL;
2089 }
2090
2091 static void io_kill_linked_timeout(struct io_kiocb *req)
2092 {
2093         struct io_ring_ctx *ctx = req->ctx;
2094         struct io_kiocb *link;
2095         bool cancelled = false;
2096         unsigned long flags;
2097
2098         spin_lock_irqsave(&ctx->completion_lock, flags);
2099         link = req->link;
2100
2101         /*
2102          * Can happen if a linked timeout fired and link had been like
2103          * req -> link t-out -> link t-out [-> ...]
2104          */
2105         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
2106                 struct io_timeout_data *io = link->async_data;
2107                 int ret;
2108
2109                 io_remove_next_linked(req);
2110                 link->timeout.head = NULL;
2111                 ret = hrtimer_try_to_cancel(&io->timer);
2112                 if (ret != -1) {
2113                         io_cqring_fill_event(link, -ECANCELED);
2114                         io_commit_cqring(ctx);
2115                         cancelled = true;
2116                 }
2117         }
2118         req->flags &= ~REQ_F_LINK_TIMEOUT;
2119         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2120
2121         if (cancelled) {
2122                 io_cqring_ev_posted(ctx);
2123                 io_put_req(link);
2124         }
2125 }
2126
2127
2128 static void io_fail_links(struct io_kiocb *req)
2129 {
2130         struct io_kiocb *link, *nxt;
2131         struct io_ring_ctx *ctx = req->ctx;
2132         unsigned long flags;
2133
2134         spin_lock_irqsave(&ctx->completion_lock, flags);
2135         link = req->link;
2136         req->link = NULL;
2137
2138         while (link) {
2139                 nxt = link->link;
2140                 link->link = NULL;
2141
2142                 trace_io_uring_fail_link(req, link);
2143                 io_cqring_fill_event(link, -ECANCELED);
2144
2145                 /*
2146                  * It's ok to free under spinlock as they're not linked anymore,
2147                  * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
2148                  * work.fs->lock.
2149                  */
2150                 if (link->flags & REQ_F_WORK_INITIALIZED)
2151                         io_put_req_deferred(link, 2);
2152                 else
2153                         io_double_put_req(link);
2154                 link = nxt;
2155         }
2156         io_commit_cqring(ctx);
2157         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2158
2159         io_cqring_ev_posted(ctx);
2160 }
2161
2162 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
2163 {
2164         if (req->flags & REQ_F_LINK_TIMEOUT)
2165                 io_kill_linked_timeout(req);
2166
2167         /*
2168          * If LINK is set, we have dependent requests in this chain. If we
2169          * didn't fail this request, queue the first one up, moving any other
2170          * dependencies to the next request. In case of failure, fail the rest
2171          * of the chain.
2172          */
2173         if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
2174                 struct io_kiocb *nxt = req->link;
2175
2176                 req->link = NULL;
2177                 return nxt;
2178         }
2179         io_fail_links(req);
2180         return NULL;
2181 }
2182
2183 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2184 {
2185         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2186                 return NULL;
2187         return __io_req_find_next(req);
2188 }
2189
2190 static bool __tctx_task_work(struct io_uring_task *tctx)
2191 {
2192         struct io_ring_ctx *ctx = NULL;
2193         struct io_wq_work_list list;
2194         struct io_wq_work_node *node;
2195
2196         if (wq_list_empty(&tctx->task_list))
2197                 return false;
2198
2199         spin_lock_irq(&tctx->task_lock);
2200         list = tctx->task_list;
2201         INIT_WQ_LIST(&tctx->task_list);
2202         spin_unlock_irq(&tctx->task_lock);
2203
2204         node = list.first;
2205         while (node) {
2206                 struct io_wq_work_node *next = node->next;
2207                 struct io_ring_ctx *this_ctx;
2208                 struct io_kiocb *req;
2209
2210                 req = container_of(node, struct io_kiocb, io_task_work.node);
2211                 this_ctx = req->ctx;
2212                 req->task_work.func(&req->task_work);
2213                 node = next;
2214
2215                 if (!ctx) {
2216                         ctx = this_ctx;
2217                 } else if (ctx != this_ctx) {
2218                         mutex_lock(&ctx->uring_lock);
2219                         io_submit_flush_completions(&ctx->submit_state.comp, ctx);
2220                         mutex_unlock(&ctx->uring_lock);
2221                         ctx = this_ctx;
2222                 }
2223         }
2224
2225         if (ctx && ctx->submit_state.comp.nr) {
2226                 mutex_lock(&ctx->uring_lock);
2227                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
2228                 mutex_unlock(&ctx->uring_lock);
2229         }
2230
2231         return list.first != NULL;
2232 }
2233
2234 static void tctx_task_work(struct callback_head *cb)
2235 {
2236         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
2237
2238         while (__tctx_task_work(tctx))
2239                 cond_resched();
2240
2241         clear_bit(0, &tctx->task_state);
2242 }
2243
2244 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
2245                             enum task_work_notify_mode notify)
2246 {
2247         struct io_uring_task *tctx = tsk->io_uring;
2248         struct io_wq_work_node *node, *prev;
2249         unsigned long flags;
2250         int ret;
2251
2252         WARN_ON_ONCE(!tctx);
2253
2254         spin_lock_irqsave(&tctx->task_lock, flags);
2255         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2256         spin_unlock_irqrestore(&tctx->task_lock, flags);
2257
2258         /* task_work already pending, we're done */
2259         if (test_bit(0, &tctx->task_state) ||
2260             test_and_set_bit(0, &tctx->task_state))
2261                 return 0;
2262
2263         if (!task_work_add(tsk, &tctx->task_work, notify))
2264                 return 0;
2265
2266         /*
2267          * Slow path - we failed, find and delete work. if the work is not
2268          * in the list, it got run and we're fine.
2269          */
2270         ret = 0;
2271         spin_lock_irqsave(&tctx->task_lock, flags);
2272         wq_list_for_each(node, prev, &tctx->task_list) {
2273                 if (&req->io_task_work.node == node) {
2274                         wq_list_del(&tctx->task_list, node, prev);
2275                         ret = 1;
2276                         break;
2277                 }
2278         }
2279         spin_unlock_irqrestore(&tctx->task_lock, flags);
2280         clear_bit(0, &tctx->task_state);
2281         return ret;
2282 }
2283
2284 static int io_req_task_work_add(struct io_kiocb *req)
2285 {
2286         struct task_struct *tsk = req->task;
2287         struct io_ring_ctx *ctx = req->ctx;
2288         enum task_work_notify_mode notify;
2289         int ret;
2290
2291         if (tsk->flags & PF_EXITING)
2292                 return -ESRCH;
2293
2294         /*
2295          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2296          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2297          * processing task_work. There's no reliable way to tell if TWA_RESUME
2298          * will do the job.
2299          */
2300         notify = TWA_NONE;
2301         if (!(ctx->flags & IORING_SETUP_SQPOLL))
2302                 notify = TWA_SIGNAL;
2303
2304         ret = io_task_work_add(tsk, req, notify);
2305         if (!ret)
2306                 wake_up_process(tsk);
2307
2308         return ret;
2309 }
2310
2311 static void io_req_task_work_add_fallback(struct io_kiocb *req,
2312                                           task_work_func_t cb)
2313 {
2314         struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
2315
2316         init_task_work(&req->task_work, cb);
2317         task_work_add(tsk, &req->task_work, TWA_NONE);
2318         wake_up_process(tsk);
2319 }
2320
2321 static void __io_req_task_cancel(struct io_kiocb *req, int error)
2322 {
2323         struct io_ring_ctx *ctx = req->ctx;
2324
2325         spin_lock_irq(&ctx->completion_lock);
2326         io_cqring_fill_event(req, error);
2327         io_commit_cqring(ctx);
2328         spin_unlock_irq(&ctx->completion_lock);
2329
2330         io_cqring_ev_posted(ctx);
2331         req_set_fail_links(req);
2332         io_double_put_req(req);
2333 }
2334
2335 static void io_req_task_cancel(struct callback_head *cb)
2336 {
2337         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2338         struct io_ring_ctx *ctx = req->ctx;
2339
2340         mutex_lock(&ctx->uring_lock);
2341         __io_req_task_cancel(req, -ECANCELED);
2342         mutex_unlock(&ctx->uring_lock);
2343         percpu_ref_put(&ctx->refs);
2344 }
2345
2346 static void __io_req_task_submit(struct io_kiocb *req)
2347 {
2348         struct io_ring_ctx *ctx = req->ctx;
2349
2350         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
2351         mutex_lock(&ctx->uring_lock);
2352         if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
2353             !io_sq_thread_acquire_mm_files(ctx, req))
2354                 __io_queue_sqe(req);
2355         else
2356                 __io_req_task_cancel(req, -EFAULT);
2357         mutex_unlock(&ctx->uring_lock);
2358 }
2359
2360 static void io_req_task_submit(struct callback_head *cb)
2361 {
2362         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2363
2364         __io_req_task_submit(req);
2365 }
2366
2367 static void io_req_task_queue(struct io_kiocb *req)
2368 {
2369         int ret;
2370
2371         req->task_work.func = io_req_task_submit;
2372         ret = io_req_task_work_add(req);
2373         if (unlikely(ret)) {
2374                 percpu_ref_get(&req->ctx->refs);
2375                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2376         }
2377 }
2378
/* Queue the request linked after @req, if there is one to run. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *next = io_req_find_next(req);

	if (!next)
		return;

	io_req_task_queue(next);
}
2386
/* Queue whatever is linked behind @req, then free @req itself. */
static void io_free_req(struct io_kiocb *req)
{
	/* the link must be resolved while req is still intact */
	io_queue_next(req);

	__io_free_req(req);
}
2392
/*
 * Accumulator used while freeing several requests in a row, so that task
 * and ring references can be dropped in bulk rather than one by one.
 */
struct req_batch {
	/* task the pending task_refs belong to, NULL when none */
	struct task_struct	*task;
	/* references on @task not yet dropped */
	int			task_refs;
	/* ring references not yet dropped */
	int			ctx_refs;
};

/* Reset @rb to the empty state: no task, no pending references. */
static inline void io_init_req_batch(struct req_batch *rb)
{
	rb->task = NULL;
	rb->task_refs = 0;
	rb->ctx_refs = 0;
}
2405
2406 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2407                                      struct req_batch *rb)
2408 {
2409         if (rb->task)
2410                 io_put_task(rb->task, rb->task_refs);
2411         if (rb->ctx_refs)
2412                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2413 }
2414
2415 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2416                               struct io_submit_state *state)
2417 {
2418         io_queue_next(req);
2419
2420         if (req->task != rb->task) {
2421                 if (rb->task)
2422                         io_put_task(rb->task, rb->task_refs);
2423                 rb->task = req->task;
2424                 rb->task_refs = 0;
2425         }
2426         rb->task_refs++;
2427         rb->ctx_refs++;
2428
2429         io_dismantle_req(req);
2430         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2431                 state->reqs[state->free_reqs++] = req;
2432         else
2433                 list_add(&req->compl.list, &state->comp.free_list);
2434 }
2435
2436 static void io_submit_flush_completions(struct io_comp_state *cs,
2437                                         struct io_ring_ctx *ctx)
2438 {
2439         int i, nr = cs->nr;
2440         struct io_kiocb *req;
2441         struct req_batch rb;
2442
2443         io_init_req_batch(&rb);
2444         spin_lock_irq(&ctx->completion_lock);
2445         for (i = 0; i < nr; i++) {
2446                 req = cs->reqs[i];
2447                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2448         }
2449         io_commit_cqring(ctx);
2450         spin_unlock_irq(&ctx->completion_lock);
2451
2452         io_cqring_ev_posted(ctx);
2453         for (i = 0; i < nr; i++) {
2454                 req = cs->reqs[i];
2455
2456                 /* submission and completion refs */
2457                 if (refcount_sub_and_test(2, &req->refs))
2458                         io_req_free_batch(&rb, req, &ctx->submit_state);
2459         }
2460
2461         io_req_free_batch_finish(ctx, &rb);
2462         cs->nr = 0;
2463 }
2464
2465 /*
2466  * Drop reference to request, return next in chain (if there is one) if this
2467  * was the last reference to this request.
2468  */
2469 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2470 {
2471         struct io_kiocb *nxt = NULL;
2472
2473         if (refcount_dec_and_test(&req->refs)) {
2474                 nxt = io_req_find_next(req);
2475                 __io_free_req(req);
2476         }
2477         return nxt;
2478 }
2479
2480 static void io_put_req(struct io_kiocb *req)
2481 {
2482         if (refcount_dec_and_test(&req->refs))
2483                 io_free_req(req);
2484 }
2485
2486 static void io_put_req_deferred_cb(struct callback_head *cb)
2487 {
2488         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2489
2490         io_free_req(req);
2491 }
2492
2493 static void io_free_req_deferred(struct io_kiocb *req)
2494 {
2495         int ret;
2496
2497         req->task_work.func = io_put_req_deferred_cb;
2498         ret = io_req_task_work_add(req);
2499         if (unlikely(ret))
2500                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2501 }
2502
2503 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2504 {
2505         if (refcount_sub_and_test(refs, &req->refs))
2506                 io_free_req_deferred(req);
2507 }
2508
2509 static void io_double_put_req(struct io_kiocb *req)
2510 {
2511         /* drop both submit and complete references */
2512         if (refcount_sub_and_test(2, &req->refs))
2513                 io_free_req(req);
2514 }
2515
/*
 * Return __io_cqring_events() for @ctx after issuing a read barrier. The
 * barrier pairing rules are described in the comment at the top of this
 * file.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
        unsigned events;

        smp_rmb();
        events = __io_cqring_events(ctx);

        return events;
}
2522
2523 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2524 {
2525         struct io_rings *rings = ctx->rings;
2526
2527         /* make sure SQ entry isn't read before tail */
2528         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2529 }
2530
2531 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2532 {
2533         unsigned int cflags;
2534
2535         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2536         cflags |= IORING_CQE_F_BUFFER;
2537         req->flags &= ~REQ_F_BUFFER_SELECTED;
2538         kfree(kbuf);
2539         return cflags;
2540 }
2541
2542 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2543 {
2544         struct io_buffer *kbuf;
2545
2546         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2547         return io_put_kbuf(req, kbuf);
2548 }
2549
2550 static inline bool io_run_task_work(void)
2551 {
2552         /*
2553          * Not safe to run on exiting task, and the task_work handling will
2554          * not add work to such a task.
2555          */
2556         if (unlikely(current->flags & PF_EXITING))
2557                 return false;
2558         if (current->task_works) {
2559                 __set_current_state(TASK_RUNNING);
2560                 task_work_run();
2561                 return true;
2562         }
2563
2564         return false;
2565 }
2566
2567 /*
2568  * Find and free completed poll iocbs
2569  */
2570 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2571                                struct list_head *done)
2572 {
2573         struct req_batch rb;
2574         struct io_kiocb *req;
2575
2576         /* order with ->result store in io_complete_rw_iopoll() */
2577         smp_rmb();
2578
2579         io_init_req_batch(&rb);
2580         while (!list_empty(done)) {
2581                 int cflags = 0;
2582
2583                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2584                 list_del(&req->inflight_entry);
2585
2586                 if (READ_ONCE(req->result) == -EAGAIN) {
2587                         req->iopoll_completed = 0;
2588                         if (io_rw_reissue(req))
2589                                 continue;
2590                 }
2591
2592                 if (req->flags & REQ_F_BUFFER_SELECTED)
2593                         cflags = io_put_rw_kbuf(req);
2594
2595                 __io_cqring_fill_event(req, req->result, cflags);
2596                 (*nr_events)++;
2597
2598                 if (refcount_dec_and_test(&req->refs))
2599                         io_req_free_batch(&rb, req, &ctx->submit_state);
2600         }
2601
2602         io_commit_cqring(ctx);
2603         io_cqring_ev_posted_iopoll(ctx);
2604         io_req_free_batch_finish(ctx, &rb);
2605 }
2606
2607 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2608                         long min)
2609 {
2610         struct io_kiocb *req, *tmp;
2611         LIST_HEAD(done);
2612         bool spin;
2613         int ret;
2614
2615         /*
2616          * Only spin for completions if we don't have multiple devices hanging
2617          * off our complete list, and we're under the requested amount.
2618          */
2619         spin = !ctx->poll_multi_file && *nr_events < min;
2620
2621         ret = 0;
2622         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2623                 struct kiocb *kiocb = &req->rw.kiocb;
2624
2625                 /*
2626                  * Move completed and retryable entries to our local lists.
2627                  * If we find a request that requires polling, break out
2628                  * and complete those lists first, if we have entries there.
2629                  */
2630                 if (READ_ONCE(req->iopoll_completed)) {
2631                         list_move_tail(&req->inflight_entry, &done);
2632                         continue;
2633                 }
2634                 if (!list_empty(&done))
2635                         break;
2636
2637                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2638                 if (ret < 0)
2639                         break;
2640
2641                 /* iopoll may have completed current req */
2642                 if (READ_ONCE(req->iopoll_completed))
2643                         list_move_tail(&req->inflight_entry, &done);
2644
2645                 if (ret && spin)
2646                         spin = false;
2647                 ret = 0;
2648         }
2649
2650         if (!list_empty(&done))
2651                 io_iopoll_complete(ctx, nr_events, &done);
2652
2653         return ret;
2654 }
2655
2656 /*
2657  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2658  * non-spinning poll check - we'll still enter the driver poll loop, but only
2659  * as a non-spinning completion check.
2660  */
2661 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2662                                 long min)
2663 {
2664         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2665                 int ret;
2666
2667                 ret = io_do_iopoll(ctx, nr_events, min);
2668                 if (ret < 0)
2669                         return ret;
2670                 if (*nr_events >= min)
2671                         return 0;
2672         }
2673
2674         return 1;
2675 }
2676
2677 /*
2678  * We can't just wait for polled events to come to us, we have to actively
2679  * find and complete them.
2680  */
2681 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2682 {
2683         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2684                 return;
2685
2686         mutex_lock(&ctx->uring_lock);
2687         while (!list_empty(&ctx->iopoll_list)) {
2688                 unsigned int nr_events = 0;
2689
2690                 io_do_iopoll(ctx, &nr_events, 0);
2691
2692                 /* let it sleep and repeat later if can't complete a request */
2693                 if (nr_events == 0)
2694                         break;
2695                 /*
2696                  * Ensure we allow local-to-the-cpu processing to take place,
2697                  * in this case we need to ensure that we reap all events.
2698                  * Also let task_work, etc. to progress by releasing the mutex
2699                  */
2700                 if (need_resched()) {
2701                         mutex_unlock(&ctx->uring_lock);
2702                         cond_resched();
2703                         mutex_lock(&ctx->uring_lock);
2704                 }
2705         }
2706         mutex_unlock(&ctx->uring_lock);
2707 }
2708
2709 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2710 {
2711         unsigned int nr_events = 0;
2712         int iters = 0, ret = 0;
2713
2714         /*
2715          * We disallow the app entering submit/complete with polling, but we
2716          * still need to lock the ring to prevent racing with polled issue
2717          * that got punted to a workqueue.
2718          */
2719         mutex_lock(&ctx->uring_lock);
2720         do {
2721                 /*
2722                  * Don't enter poll loop if we already have events pending.
2723                  * If we do, we can potentially be spinning for commands that
2724                  * already triggered a CQE (eg in error).
2725                  */
2726                 if (test_bit(0, &ctx->cq_check_overflow))
2727                         __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2728                 if (io_cqring_events(ctx))
2729                         break;
2730
2731                 /*
2732                  * If a submit got punted to a workqueue, we can have the
2733                  * application entering polling for a command before it gets
2734                  * issued. That app will hold the uring_lock for the duration
2735                  * of the poll right here, so we need to take a breather every
2736                  * now and then to ensure that the issue has a chance to add
2737                  * the poll to the issued list. Otherwise we can spin here
2738                  * forever, while the workqueue is stuck trying to acquire the
2739                  * very same mutex.
2740                  */
2741                 if (!(++iters & 7)) {
2742                         mutex_unlock(&ctx->uring_lock);
2743                         io_run_task_work();
2744                         mutex_lock(&ctx->uring_lock);
2745                 }
2746
2747                 ret = io_iopoll_getevents(ctx, &nr_events, min);
2748                 if (ret <= 0)
2749                         break;
2750                 ret = 0;
2751         } while (min && !nr_events && !need_resched());
2752
2753         mutex_unlock(&ctx->uring_lock);
2754         return ret;
2755 }
2756
2757 static void kiocb_end_write(struct io_kiocb *req)
2758 {
2759         /*
2760          * Tell lockdep we inherited freeze protection from submission
2761          * thread.
2762          */
2763         if (req->flags & REQ_F_ISREG) {
2764                 struct inode *inode = file_inode(req->file);
2765
2766                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2767         }
2768         file_end_write(req->file);
2769 }
2770
#ifdef CONFIG_BLOCK
/*
 * Make sure @req carries async read/write data so that it can be issued
 * again.
 *
 * Returns true if async data is already attached or was set up here, false
 * if the opcode is not a read/write, the iovec import failed or
 * io_setup_async_rw() reported an error.
 */
static bool io_resubmit_prep(struct io_kiocb *req)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct iov_iter iter;
        int rw;

        /* already prepared */
        if (req->async_data)
                return true;

        switch (req->opcode) {
        case IORING_OP_READ:
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
                rw = READ;
                break;
        case IORING_OP_WRITE:
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
                rw = WRITE;
                break;
        default:
                printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
                                req->opcode);
                return false;
        }

        if (io_import_iovec(rw, req, &iovec, &iter, false) < 0)
                return false;
        if (io_setup_async_rw(req, iovec, inline_vecs, &iter, false))
                return false;

        return true;
}
#endif
2805
2806 static bool io_rw_reissue(struct io_kiocb *req)
2807 {
2808 #ifdef CONFIG_BLOCK
2809         umode_t mode = file_inode(req->file)->i_mode;
2810         int ret;
2811
2812         if (!S_ISBLK(mode) && !S_ISREG(mode))
2813                 return false;
2814         if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
2815                 return false;
2816
2817         lockdep_assert_held(&req->ctx->uring_lock);
2818
2819         ret = io_sq_thread_acquire_mm_files(req->ctx, req);
2820
2821         if (!ret && io_resubmit_prep(req)) {
2822                 refcount_inc(&req->refs);
2823                 io_queue_async_work(req);
2824                 return true;
2825         }
2826         req_set_fail_links(req);
2827 #endif
2828         return false;
2829 }
2830
2831 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2832                              unsigned int issue_flags)
2833 {
2834         int cflags = 0;
2835
2836         if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2837                 return;
2838         if (res != req->result)
2839                 req_set_fail_links(req);
2840
2841         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2842                 kiocb_end_write(req);
2843         if (req->flags & REQ_F_BUFFER_SELECTED)
2844                 cflags = io_put_rw_kbuf(req);
2845         __io_req_complete(req, issue_flags, res, cflags);
2846 }
2847
2848 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2849 {
2850         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2851
2852         __io_complete_rw(req, res, res2, 0);
2853 }
2854
2855 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2856 {
2857         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2858
2859         if (kiocb->ki_flags & IOCB_WRITE)
2860                 kiocb_end_write(req);
2861
2862         if (res != -EAGAIN && res != req->result)
2863                 req_set_fail_links(req);
2864
2865         WRITE_ONCE(req->result, res);
2866         /* order with io_poll_complete() checking ->result */
2867         smp_wmb();
2868         WRITE_ONCE(req->iopoll_completed, 1);
2869 }
2870
2871 /*
2872  * After the iocb has been issued, it's safe to be found on the poll list.
2873  * Adding the kiocb to the list AFTER submission ensures that we don't
2874  * find it from a io_iopoll_getevents() thread before the issuer is done
2875  * accessing the kiocb cookie.
2876  */
2877 static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
2878 {
2879         struct io_ring_ctx *ctx = req->ctx;
2880
2881         /*
2882          * Track whether we have multiple files in our lists. This will impact
2883          * how we do polling eventually, not spinning if we're on potentially
2884          * different devices.
2885          */
2886         if (list_empty(&ctx->iopoll_list)) {
2887                 ctx->poll_multi_file = false;
2888         } else if (!ctx->poll_multi_file) {
2889                 struct io_kiocb *list_req;
2890
2891                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2892                                                 inflight_entry);
2893                 if (list_req->file != req->file)
2894                         ctx->poll_multi_file = true;
2895         }
2896
2897         /*
2898          * For fast devices, IO may have already completed. If it has, add
2899          * it to the front so we find it first.
2900          */
2901         if (READ_ONCE(req->iopoll_completed))
2902                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2903         else
2904                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2905
2906         /*
2907          * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2908          * task context or in io worker task context. If current task context is
2909          * sq thread, we don't need to check whether should wake up sq thread.
2910          */
2911         if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
2912             wq_has_sleeper(&ctx->sq_data->wait))
2913                 wake_up(&ctx->sq_data->wait);
2914 }
2915
2916 static inline void io_state_file_put(struct io_submit_state *state)
2917 {
2918         if (state->file_refs) {
2919                 fput_many(state->file, state->file_refs);
2920                 state->file_refs = 0;
2921         }
2922 }
2923
2924 /*
2925  * Get as many references to a file as we have IOs left in this submission,
2926  * assuming most submissions are for one file, or at least that each file
2927  * has more than one submission.
2928  */
2929 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2930 {
2931         if (!state)
2932                 return fget(fd);
2933
2934         if (state->file_refs) {
2935                 if (state->fd == fd) {
2936                         state->file_refs--;
2937                         return state->file;
2938                 }
2939                 io_state_file_put(state);
2940         }
2941         state->file = fget_many(fd, state->ios_left);
2942         if (unlikely(!state->file))
2943                 return NULL;
2944
2945         state->fd = fd;
2946         state->file_refs = state->ios_left - 1;
2947         return state->file;
2948 }
2949
2950 static bool io_bdev_nowait(struct block_device *bdev)
2951 {
2952         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2953 }
2954
2955 /*
2956  * If we tracked the file through the SCM inflight mechanism, we could support
2957  * any file. For now, just ensure that anything potentially problematic is done
2958  * inline.
2959  */
2960 static bool io_file_supports_async(struct file *file, int rw)
2961 {
2962         umode_t mode = file_inode(file)->i_mode;
2963
2964         if (S_ISBLK(mode)) {
2965                 if (IS_ENABLED(CONFIG_BLOCK) &&
2966                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2967                         return true;
2968                 return false;
2969         }
2970         if (S_ISCHR(mode) || S_ISSOCK(mode))
2971                 return true;
2972         if (S_ISREG(mode)) {
2973                 if (IS_ENABLED(CONFIG_BLOCK) &&
2974                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2975                     file->f_op != &io_uring_fops)
2976                         return true;
2977                 return false;
2978         }
2979
2980         /* any ->read/write should understand O_NONBLOCK */
2981         if (file->f_flags & O_NONBLOCK)
2982                 return true;
2983
2984         if (!(file->f_mode & FMODE_NOWAIT))
2985                 return false;
2986
2987         if (rw == READ)
2988                 return file->f_op->read_iter != NULL;
2989
2990         return file->f_op->write_iter != NULL;
2991 }
2992
2993 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2994 {
2995         struct io_ring_ctx *ctx = req->ctx;
2996         struct kiocb *kiocb = &req->rw.kiocb;
2997         struct file *file = req->file;
2998         unsigned ioprio;
2999         int ret;
3000
3001         if (S_ISREG(file_inode(file)->i_mode))
3002                 req->flags |= REQ_F_ISREG;
3003
3004         kiocb->ki_pos = READ_ONCE(sqe->off);
3005         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
3006                 req->flags |= REQ_F_CUR_POS;
3007                 kiocb->ki_pos = file->f_pos;
3008         }
3009         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3010         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
3011         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
3012         if (unlikely(ret))
3013                 return ret;
3014
3015         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
3016         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
3017                 req->flags |= REQ_F_NOWAIT;
3018
3019         ioprio = READ_ONCE(sqe->ioprio);
3020         if (ioprio) {
3021                 ret = ioprio_check_cap(ioprio);
3022                 if (ret)
3023                         return ret;
3024
3025                 kiocb->ki_ioprio = ioprio;
3026         } else
3027                 kiocb->ki_ioprio = get_current_ioprio();
3028
3029         if (ctx->flags & IORING_SETUP_IOPOLL) {
3030                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
3031                     !kiocb->ki_filp->f_op->iopoll)
3032                         return -EOPNOTSUPP;
3033
3034                 kiocb->ki_flags |= IOCB_HIPRI;
3035                 kiocb->ki_complete = io_complete_rw_iopoll;
3036                 req->iopoll_completed = 0;
3037         } else {
3038                 if (kiocb->ki_flags & IOCB_HIPRI)
3039                         return -EINVAL;
3040                 kiocb->ki_complete = io_complete_rw;
3041         }
3042
3043         req->rw.addr = READ_ONCE(sqe->addr);
3044         req->rw.len = READ_ONCE(sqe->len);
3045         req->buf_index = READ_ONCE(sqe->buf_index);
3046         return 0;
3047 }
3048
3049 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3050 {
3051         switch (ret) {
3052         case -EIOCBQUEUED:
3053                 break;
3054         case -ERESTARTSYS:
3055         case -ERESTARTNOINTR:
3056         case -ERESTARTNOHAND:
3057         case -ERESTART_RESTARTBLOCK:
3058                 /*
3059                  * We can't just restart the syscall, since previously
3060                  * submitted sqes may already be in progress. Just fail this
3061                  * IO with EINTR.
3062                  */
3063                 ret = -EINTR;
3064                 fallthrough;
3065         default:
3066                 kiocb->ki_complete(kiocb, ret, 0);
3067         }
3068 }
3069
3070 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
3071                        unsigned int issue_flags)
3072 {
3073         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3074         struct io_async_rw *io = req->async_data;
3075
3076         /* add previously done IO, if any */
3077         if (io && io->bytes_done > 0) {
3078                 if (ret < 0)
3079                         ret = io->bytes_done;
3080                 else
3081                         ret += io->bytes_done;
3082         }
3083
3084         if (req->flags & REQ_F_CUR_POS)
3085                 req->file->f_pos = kiocb->ki_pos;
3086         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
3087                 __io_complete_rw(req, ret, 0, issue_flags);
3088         else
3089                 io_rw_done(kiocb, ret);
3090 }
3091
3092 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3093 {
3094         struct io_ring_ctx *ctx = req->ctx;
3095         size_t len = req->rw.len;
3096         struct io_mapped_ubuf *imu;
3097         u16 index, buf_index = req->buf_index;
3098         size_t offset;
3099         u64 buf_addr;
3100
3101         if (unlikely(buf_index >= ctx->nr_user_bufs))
3102                 return -EFAULT;
3103         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3104         imu = &ctx->user_bufs[index];
3105         buf_addr = req->rw.addr;
3106
3107         /* overflow */
3108         if (buf_addr + len < buf_addr)
3109                 return -EFAULT;
3110         /* not inside the mapped region */
3111         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
3112                 return -EFAULT;
3113
3114         /*
3115          * May not be a start of buffer, set size appropriately
3116          * and advance us to the beginning.
3117          */
3118         offset = buf_addr - imu->ubuf;
3119         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
3120
3121         if (offset) {
3122                 /*
3123                  * Don't use iov_iter_advance() here, as it's really slow for
3124                  * using the latter parts of a big fixed buffer - it iterates
3125                  * over each segment manually. We can cheat a bit here, because
3126                  * we know that:
3127                  *
3128                  * 1) it's a BVEC iter, we set it up
3129                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
3130                  *    first and last bvec
3131                  *
3132                  * So just find our index, and adjust the iterator afterwards.
3133                  * If the offset is within the first bvec (or the whole first
3134                  * bvec, just use iov_iter_advance(). This makes it easier
3135                  * since we can just skip the first segment, which may not
3136                  * be PAGE_SIZE aligned.
3137                  */
3138                 const struct bio_vec *bvec = imu->bvec;
3139
3140                 if (offset <= bvec->bv_len) {
3141                         iov_iter_advance(iter, offset);
3142                 } else {
3143                         unsigned long seg_skip;
3144
3145                         /* skip first vec */
3146                         offset -= bvec->bv_len;
3147                         seg_skip = 1 + (offset >> PAGE_SHIFT);
3148
3149                         iter->bvec = bvec + seg_skip;
3150                         iter->nr_segs -= seg_skip;
3151                         iter->count -= bvec->bv_len + offset;
3152                         iter->iov_offset = offset & ~PAGE_MASK;
3153                 }
3154         }
3155
3156         return 0;
3157 }
3158
3159 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3160 {
3161         if (needs_lock)
3162                 mutex_unlock(&ctx->uring_lock);
3163 }
3164
3165 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3166 {
3167         /*
3168          * "Normal" inline submissions always hold the uring_lock, since we
3169          * grab it from the system call. Same is true for the SQPOLL offload.
3170          * The only exception is when we've detached the request and issue it
3171          * from an async worker thread, grab the lock for that case.
3172          */
3173         if (needs_lock)
3174                 mutex_lock(&ctx->uring_lock);
3175 }
3176
3177 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3178                                           int bgid, struct io_buffer *kbuf,
3179                                           bool needs_lock)
3180 {
3181         struct io_buffer *head;
3182
3183         if (req->flags & REQ_F_BUFFER_SELECTED)
3184                 return kbuf;
3185
3186         io_ring_submit_lock(req->ctx, needs_lock);
3187
3188         lockdep_assert_held(&req->ctx->uring_lock);
3189
3190         head = idr_find(&req->ctx->io_buffer_idr, bgid);
3191         if (head) {
3192                 if (!list_empty(&head->list)) {
3193                         kbuf = list_last_entry(&head->list, struct io_buffer,
3194                                                         list);
3195                         list_del(&kbuf->list);
3196                 } else {
3197                         kbuf = head;
3198                         idr_remove(&req->ctx->io_buffer_idr, bgid);
3199                 }
3200                 if (*len > kbuf->len)
3201                         *len = kbuf->len;
3202         } else {
3203                 kbuf = ERR_PTR(-ENOBUFS);
3204         }
3205
3206         io_ring_submit_unlock(req->ctx, needs_lock);
3207
3208         return kbuf;
3209 }
3210
3211 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3212                                         bool needs_lock)
3213 {
3214         struct io_buffer *kbuf;
3215         u16 bgid;
3216
3217         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3218         bgid = req->buf_index;
3219         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
3220         if (IS_ERR(kbuf))
3221                 return kbuf;
3222         req->rw.addr = (u64) (unsigned long) kbuf;
3223         req->flags |= REQ_F_BUFFER_SELECTED;
3224         return u64_to_user_ptr(kbuf->addr);
3225 }
3226
#ifdef CONFIG_COMPAT
/*
 * Compat variant of the single-iovec buffer-select import: read the
 * requested length from the user's compat_iovec at ->rw.addr, select a
 * provided buffer and describe it in @iov[0].
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the error from io_rw_buffer_select().
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t clen;
        void __user *buf;
        ssize_t len;

        if (!access_ok(uiov, sizeof(*uiov)) ||
            __get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        len = clen;
        buf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(buf))
                return PTR_ERR(buf);

        iov[0].iov_len = (compat_size_t) len;
        iov[0].iov_base = buf;
        return 0;
}
#endif
3253
3254 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3255                                       bool needs_lock)
3256 {
3257         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3258         void __user *buf;
3259         ssize_t len;
3260
3261         if (copy_from_user(iov, uiov, sizeof(*uiov)))
3262                 return -EFAULT;
3263
3264         len = iov[0].iov_len;
3265         if (len < 0)
3266                 return -EINVAL;
3267         buf = io_rw_buffer_select(req, &len, needs_lock);
3268         if (IS_ERR(buf))
3269                 return PTR_ERR(buf);
3270         iov[0].iov_base = buf;
3271         iov[0].iov_len = len;
3272         return 0;
3273 }
3274
3275 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3276                                     bool needs_lock)
3277 {
3278         if (req->flags & REQ_F_BUFFER_SELECTED) {
3279                 struct io_buffer *kbuf;
3280
3281                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3282                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3283                 iov[0].iov_len = kbuf->len;
3284                 return 0;
3285         }
3286         if (req->rw.len != 1)
3287                 return -EINVAL;
3288
3289 #ifdef CONFIG_COMPAT
3290         if (req->ctx->compat)
3291                 return io_compat_import(req, iov, needs_lock);
3292 #endif
3293
3294         return __io_iov_buffer_select(req, iov, needs_lock);
3295 }
3296
3297 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3298                            struct iov_iter *iter, bool needs_lock)
3299 {
3300         void __user *buf = u64_to_user_ptr(req->rw.addr);
3301         size_t sqe_len = req->rw.len;
3302         u8 opcode = req->opcode;
3303         ssize_t ret;
3304
3305         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3306                 *iovec = NULL;
3307                 return io_import_fixed(req, rw, iter);
3308         }
3309
3310         /* buffer index only valid with fixed read/write, or buffer select  */
3311         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3312                 return -EINVAL;
3313
3314         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3315                 if (req->flags & REQ_F_BUFFER_SELECT) {
3316                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3317                         if (IS_ERR(buf))
3318                                 return PTR_ERR(buf);
3319                         req->rw.len = sqe_len;
3320                 }
3321
3322                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3323                 *iovec = NULL;
3324                 return ret;
3325         }
3326
3327         if (req->flags & REQ_F_BUFFER_SELECT) {
3328                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
3329                 if (!ret)
3330                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3331                 *iovec = NULL;
3332                 return ret;
3333         }
3334
3335         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3336                               req->ctx->compat);
3337 }
3338
3339 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3340 {
3341         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3342 }
3343
3344 /*
3345  * For files that don't have ->read_iter() and ->write_iter(), handle them
3346  * by looping over ->read() or ->write() manually.
3347  */
3348 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3349 {
3350         struct kiocb *kiocb = &req->rw.kiocb;
3351         struct file *file = req->file;
3352         ssize_t ret = 0;
3353
3354         /*
3355          * Don't support polled IO through this interface, and we can't
3356          * support non-blocking either. For the latter, this just causes
3357          * the kiocb to be handled from an async context.
3358          */
3359         if (kiocb->ki_flags & IOCB_HIPRI)
3360                 return -EOPNOTSUPP;
3361         if (kiocb->ki_flags & IOCB_NOWAIT)
3362                 return -EAGAIN;
3363
3364         while (iov_iter_count(iter)) {
3365                 struct iovec iovec;
3366                 ssize_t nr;
3367
3368                 if (!iov_iter_is_bvec(iter)) {
3369                         iovec = iov_iter_iovec(iter);
3370                 } else {
3371                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3372                         iovec.iov_len = req->rw.len;
3373                 }
3374
3375                 if (rw == READ) {
3376                         nr = file->f_op->read(file, iovec.iov_base,
3377                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3378                 } else {
3379                         nr = file->f_op->write(file, iovec.iov_base,
3380                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3381                 }
3382
3383                 if (nr < 0) {
3384                         if (!ret)
3385                                 ret = nr;
3386                         break;
3387                 }
3388                 ret += nr;
3389                 if (nr != iovec.iov_len)
3390                         break;
3391                 req->rw.len -= nr;
3392                 req->rw.addr += nr;
3393                 iov_iter_advance(iter, nr);
3394         }
3395
3396         return ret;
3397 }
3398
3399 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3400                           const struct iovec *fast_iov, struct iov_iter *iter)
3401 {
3402         struct io_async_rw *rw = req->async_data;
3403
3404         memcpy(&rw->iter, iter, sizeof(*iter));
3405         rw->free_iovec = iovec;
3406         rw->bytes_done = 0;
3407         /* can only be fixed buffers, no need to do anything */
3408         if (iov_iter_is_bvec(iter))
3409                 return;
3410         if (!iovec) {
3411                 unsigned iov_off = 0;
3412
3413                 rw->iter.iov = rw->fast_iov;
3414                 if (iter->iov != fast_iov) {
3415                         iov_off = iter->iov - fast_iov;
3416                         rw->iter.iov += iov_off;
3417                 }
3418                 if (rw->fast_iov != fast_iov)
3419                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3420                                sizeof(struct iovec) * iter->nr_segs);
3421         } else {
3422                 req->flags |= REQ_F_NEED_CLEANUP;
3423         }
3424 }
3425
3426 static inline int __io_alloc_async_data(struct io_kiocb *req)
3427 {
3428         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3429         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3430         return req->async_data == NULL;
3431 }
3432
3433 static int io_alloc_async_data(struct io_kiocb *req)
3434 {
3435         if (!io_op_defs[req->opcode].needs_async_data)
3436                 return 0;
3437
3438         return  __io_alloc_async_data(req);
3439 }
3440
3441 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3442                              const struct iovec *fast_iov,
3443                              struct iov_iter *iter, bool force)
3444 {
3445         if (!force && !io_op_defs[req->opcode].needs_async_data)
3446                 return 0;
3447         if (!req->async_data) {
3448                 if (__io_alloc_async_data(req)) {
3449                         kfree(iovec);
3450                         return -ENOMEM;
3451                 }
3452
3453                 io_req_map_rw(req, iovec, fast_iov, iter);
3454         }
3455         return 0;
3456 }
3457
3458 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3459 {
3460         struct io_async_rw *iorw = req->async_data;
3461         struct iovec *iov = iorw->fast_iov;
3462         int ret;
3463
3464         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3465         if (unlikely(ret < 0))
3466                 return ret;
3467
3468         iorw->bytes_done = 0;
3469         iorw->free_iovec = iov;
3470         if (iov)
3471                 req->flags |= REQ_F_NEED_CLEANUP;
3472         return 0;
3473 }
3474
3475 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3476 {
3477         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3478                 return -EBADF;
3479         return io_prep_rw(req, sqe);
3480 }
3481
3482 /*
3483  * This is our waitqueue callback handler, registered through lock_page_async()
3484  * when we initially tried to do the IO with the iocb armed our waitqueue.
3485  * This gets called when the page is unlocked, and we generally expect that to
3486  * happen when the page IO is completed and the page is now uptodate. This will
3487  * queue a task_work based retry of the operation, attempting to copy the data
3488  * again. If the latter fails because the page was NOT uptodate, then we will
3489  * do a thread based blocking retry of the operation. That's the unexpected
3490  * slow path.
3491  */
3492 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3493                              int sync, void *arg)
3494 {
3495         struct wait_page_queue *wpq;
3496         struct io_kiocb *req = wait->private;
3497         struct wait_page_key *key = arg;
3498
3499         wpq = container_of(wait, struct wait_page_queue, wait);
3500
3501         if (!wake_page_match(wpq, key))
3502                 return 0;
3503
3504         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3505         list_del_init(&wait->entry);
3506
3507         /* submit ref gets dropped, acquire a new one */
3508         refcount_inc(&req->refs);
3509         io_req_task_queue(req);
3510         return 1;
3511 }
3512
3513 /*
3514  * This controls whether a given IO request should be armed for async page
3515  * based retry. If we return false here, the request is handed to the async
3516  * worker threads for retry. If we're doing buffered reads on a regular file,
3517  * we prepare a private wait_page_queue entry and retry the operation. This
3518  * will either succeed because the page is now uptodate and unlocked, or it
3519  * will register a callback when the page is unlocked at IO completion. Through
3520  * that callback, io_uring uses task_work to setup a retry of the operation.
3521  * That retry will attempt the buffered read again. The retry will generally
3522  * succeed, or in rare cases where it fails, we then fall back to using the
3523  * async worker threads for a blocking retry.
3524  */
3525 static bool io_rw_should_retry(struct io_kiocb *req)
3526 {
3527         struct io_async_rw *rw = req->async_data;
3528         struct wait_page_queue *wait = &rw->wpq;
3529         struct kiocb *kiocb = &req->rw.kiocb;
3530
3531         /* never retry for NOWAIT, we just complete with -EAGAIN */
3532         if (req->flags & REQ_F_NOWAIT)
3533                 return false;
3534
3535         /* Only for buffered IO */
3536         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3537                 return false;
3538
3539         /*
3540          * just use poll if we can, and don't attempt if the fs doesn't
3541          * support callback based unlocks
3542          */
3543         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3544                 return false;
3545
3546         wait->wait.func = io_async_buf_func;
3547         wait->wait.private = req;
3548         wait->wait.flags = 0;
3549         INIT_LIST_HEAD(&wait->wait.entry);
3550         kiocb->ki_flags |= IOCB_WAITQ;
3551         kiocb->ki_flags &= ~IOCB_NOWAIT;
3552         kiocb->ki_waitq = wait;
3553         return true;
3554 }
3555
3556 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3557 {
3558         if (req->file->f_op->read_iter)
3559                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3560         else if (req->file->f_op->read)
3561                 return loop_rw_iter(READ, req, iter);
3562         else
3563                 return -EINVAL;
3564 }
3565
/*
 * Issue a read request.
 *
 * Uses the iterator stored in the request's async data if there is one,
 * otherwise imports the buffers into on-stack storage. When the read cannot
 * make progress without blocking, the iterator state is moved into async
 * data and the read is retried with page-unlock callbacks armed (see
 * io_rw_should_retry()).
 *
 * Returns 0 when the request was completed via kiocb_done() or the IO was
 * queued (-EIOCBQUEUED), -EAGAIN when the request must be retried later, or
 * another negative error from import, async setup or rw_verify_area().
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t io_size, ret, ret2;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (rw) {
                /* async data already holds the iterator; nothing local to free */
                iter = &rw->iter;
                iovec = NULL;
        } else {
                ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
        }
        io_size = iov_iter_count(iter);
        req->result = io_size;

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_async(req->file, READ)) {
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
                return ret ?: -EAGAIN;
        }

        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret)) {
                kfree(iovec);
                return ret;
        }

        ret = io_iter_do_read(req, iter);

        if (ret == -EIOCBQUEUED) {
                goto out_free;
        } else if (ret == -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto done;
                /* no retry on NONBLOCK nor RWF_NOWAIT */
                if (req->flags & REQ_F_NOWAIT)
                        goto done;
                /* some cases will consume bytes even on error returns */
                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                /* nothing was read; the retry loop below accounts 0 bytes */
                ret = 0;
        } else if (ret <= 0 || ret == io_size || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
        }

        /* io_setup_async_rw() frees iovec itself if the allocation fails */
        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
        if (ret2)
                return ret2;

        /* the request was handed any allocated iovec; don't free it here */
        iovec = NULL;
        rw = req->async_data;
        /* now use our persistent iterator, if we aren't already */
        iter = &rw->iter;

        do {
                /* account what the previous attempt managed to read */
                io_size -= ret;
                rw->bytes_done += ret;
                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
                        kiocb->ki_flags &= ~IOCB_WAITQ;
                        return -EAGAIN;
                }

                /*
                 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
                 * we get -EIOCBQUEUED, then we'll get a notification when the
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
                ret = io_iter_do_read(req, iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
        } while (ret > 0 && ret < io_size);
done:
        kiocb_done(kiocb, ret, issue_flags);
out_free:
        /* it's faster to check here than delegate to kfree */
        if (iovec)
                kfree(iovec);
        return 0;
}
3661
3662 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3663 {
3664         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3665                 return -EBADF;
3666         return io_prep_rw(req, sqe);
3667 }
3668
/*
 * Issue a write request.
 *
 * Uses the iterator stored in the request's async data if there is one,
 * otherwise imports the buffers into on-stack storage. If the write cannot
 * be done in the current (non-blocking) context, the iterator state is
 * saved with io_setup_async_rw() and -EAGAIN is returned.
 *
 * Returns 0 after the request was handed to kiocb_done(), -EAGAIN when it
 * must be retried, or a negative error from import, async setup or
 * rw_verify_area().
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t ret, ret2, io_size;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (rw) {
                /* async data already holds the iterator; nothing local to free */
                iter = &rw->iter;
                iovec = NULL;
        } else {
                ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
        }
        io_size = iov_iter_count(iter);
        req->result = io_size;

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_async(req->file, WRITE))
                goto copy_iov;

        /* file path doesn't support NOWAIT for non-direct_IO */
        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
            (req->flags & REQ_F_ISREG))
                goto copy_iov;

        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;

        /*
         * Open-code file_start_write here to grab freeze protection,
         * which will be released by another thread in
         * io_complete_rw().  Fool lockdep by telling it the lock got
         * released so that it doesn't complain about the held lock when
         * we return to userspace.
         */
        if (req->flags & REQ_F_ISREG) {
                sb_start_write(file_inode(req->file)->i_sb);
                __sb_writers_release(file_inode(req->file)->i_sb,
                                        SB_FREEZE_WRITE);
        }
        kiocb->ki_flags |= IOCB_WRITE;

        /* prefer ->write_iter(); otherwise loop over ->write() */
        if (req->file->f_op->write_iter)
                ret2 = call_write_iter(req->file, kiocb, iter);
        else if (req->file->f_op->write)
                ret2 = loop_rw_iter(WRITE, req, iter);
        else
                ret2 = -EINVAL;

        /*
         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
         * retry them without IOCB_NOWAIT.
         */
        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
                ret2 = -EAGAIN;
        /* no retry on NONBLOCK nor RWF_NOWAIT */
        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
                        goto copy_iov;
done:
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
copy_iov:
                /* some cases will consume bytes even on error returns */
                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                /* on failure io_setup_async_rw() has already freed iovec */
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                return ret ?: -EAGAIN;
        }
out_free:
        /* it's reportedly faster than delegating the null check to kfree() */
        if (iovec)
                kfree(iovec);
        /* ret is the rw_verify_area() result: 0 on the completed path */
        return ret;
}
3757
3758 static int io_renameat_prep(struct io_kiocb *req,
3759                             const struct io_uring_sqe *sqe)
3760 {
3761         struct io_rename *ren = &req->rename;
3762         const char __user *oldf, *newf;
3763
3764         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3765                 return -EBADF;
3766
3767         ren->old_dfd = READ_ONCE(sqe->fd);
3768         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3769         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3770         ren->new_dfd = READ_ONCE(sqe->len);
3771         ren->flags = READ_ONCE(sqe->rename_flags);
3772
3773         ren->oldpath = getname(oldf);
3774         if (IS_ERR(ren->oldpath))
3775                 return PTR_ERR(ren->oldpath);
3776
3777         ren->newpath = getname(newf);
3778         if (IS_ERR(ren->newpath)) {
3779                 putname(ren->oldpath);
3780                 return PTR_ERR(ren->newpath);
3781         }
3782
3783         req->flags |= REQ_F_NEED_CLEANUP;
3784         return 0;
3785 }
3786
3787 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3788 {
3789         struct io_rename *ren = &req->rename;
3790         int ret;
3791
3792         if (issue_flags & IO_URING_F_NONBLOCK)
3793                 return -EAGAIN;
3794
3795         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3796                                 ren->newpath, ren->flags);
3797
3798         req->flags &= ~REQ_F_NEED_CLEANUP;
3799         if (ret < 0)
3800                 req_set_fail_links(req);
3801         io_req_complete(req, ret);
3802         return 0;
3803 }
3804
3805 static int io_unlinkat_prep(struct io_kiocb *req,
3806                             const struct io_uring_sqe *sqe)
3807 {
3808         struct io_unlink *un = &req->unlink;
3809         const char __user *fname;
3810
3811         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3812                 return -EBADF;
3813
3814         un->dfd = READ_ONCE(sqe->fd);
3815
3816         un->flags = READ_ONCE(sqe->unlink_flags);
3817         if (un->flags & ~AT_REMOVEDIR)
3818                 return -EINVAL;
3819
3820         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3821         un->filename = getname(fname);
3822         if (IS_ERR(un->filename))
3823                 return PTR_ERR(un->filename);
3824
3825         req->flags |= REQ_F_NEED_CLEANUP;
3826         return 0;
3827 }
3828
3829 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3830 {
3831         struct io_unlink *un = &req->unlink;
3832         int ret;
3833
3834         if (issue_flags & IO_URING_F_NONBLOCK)
3835                 return -EAGAIN;
3836
3837         if (un->flags & AT_REMOVEDIR)
3838                 ret = do_rmdir(un->dfd, un->filename);
3839         else
3840                 ret = do_unlinkat(un->dfd, un->filename);
3841
3842         req->flags &= ~REQ_F_NEED_CLEANUP;
3843         if (ret < 0)
3844                 req_set_fail_links(req);
3845         io_req_complete(req, ret);
3846         return 0;
3847 }
3848
3849 static int io_shutdown_prep(struct io_kiocb *req,
3850                             const struct io_uring_sqe *sqe)
3851 {
3852 #if defined(CONFIG_NET)
3853         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3854                 return -EINVAL;
3855         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3856             sqe->buf_index)
3857                 return -EINVAL;
3858
3859         req->shutdown.how = READ_ONCE(sqe->len);
3860         return 0;
3861 #else
3862         return -EOPNOTSUPP;
3863 #endif
3864 }
3865
3866 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3867 {
3868 #if defined(CONFIG_NET)
3869         struct socket *sock;
3870         int ret;
3871
3872         if (issue_flags & IO_URING_F_NONBLOCK)
3873                 return -EAGAIN;
3874
3875         sock = sock_from_file(req->file);
3876         if (unlikely(!sock))
3877                 return -ENOTSOCK;
3878
3879         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3880         if (ret < 0)
3881                 req_set_fail_links(req);
3882         io_req_complete(req, ret);
3883         return 0;
3884 #else
3885         return -EOPNOTSUPP;
3886 #endif
3887 }
3888
3889 static int __io_splice_prep(struct io_kiocb *req,
3890                             const struct io_uring_sqe *sqe)
3891 {
3892         struct io_splice* sp = &req->splice;
3893         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3894
3895         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3896                 return -EINVAL;
3897
3898         sp->file_in = NULL;
3899         sp->len = READ_ONCE(sqe->len);
3900         sp->flags = READ_ONCE(sqe->splice_flags);
3901
3902         if (unlikely(sp->flags & ~valid_flags))
3903                 return -EINVAL;
3904
3905         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3906                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3907         if (!sp->file_in)
3908                 return -EBADF;
3909         req->flags |= REQ_F_NEED_CLEANUP;
3910
3911         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3912                 /*
3913                  * Splice operation will be punted aync, and here need to
3914                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3915                  */
3916                 io_req_init_async(req);
3917                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3918         }
3919
3920         return 0;
3921 }
3922
3923 static int io_tee_prep(struct io_kiocb *req,
3924                        const struct io_uring_sqe *sqe)
3925 {
3926         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3927                 return -EINVAL;
3928         return __io_splice_prep(req, sqe);
3929 }
3930
3931 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3932 {
3933         struct io_splice *sp = &req->splice;
3934         struct file *in = sp->file_in;
3935         struct file *out = sp->file_out;
3936         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3937         long ret = 0;
3938
3939         if (issue_flags & IO_URING_F_NONBLOCK)
3940                 return -EAGAIN;
3941         if (sp->len)
3942                 ret = do_tee(in, out, sp->len, flags);
3943
3944         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3945         req->flags &= ~REQ_F_NEED_CLEANUP;
3946
3947         if (ret != sp->len)
3948                 req_set_fail_links(req);
3949         io_req_complete(req, ret);
3950         return 0;
3951 }
3952
3953 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3954 {
3955         struct io_splice* sp = &req->splice;
3956
3957         sp->off_in = READ_ONCE(sqe->splice_off_in);
3958         sp->off_out = READ_ONCE(sqe->off);
3959         return __io_splice_prep(req, sqe);
3960 }
3961
3962 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3963 {
3964         struct io_splice *sp = &req->splice;
3965         struct file *in = sp->file_in;
3966         struct file *out = sp->file_out;
3967         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3968         loff_t *poff_in, *poff_out;
3969         long ret = 0;
3970
3971         if (issue_flags & IO_URING_F_NONBLOCK)
3972                 return -EAGAIN;
3973
3974         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3975         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3976
3977         if (sp->len)
3978                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3979
3980         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3981         req->flags &= ~REQ_F_NEED_CLEANUP;
3982
3983         if (ret != sp->len)
3984                 req_set_fail_links(req);
3985         io_req_complete(req, ret);
3986         return 0;
3987 }
3988
3989 /*
3990  * IORING_OP_NOP just posts a completion event, nothing else.
3991  */
3992 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3993 {
3994         struct io_ring_ctx *ctx = req->ctx;
3995
3996         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3997                 return -EINVAL;
3998
3999         __io_req_complete(req, issue_flags, 0, 0);
4000         return 0;
4001 }
4002
4003 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4004 {
4005         struct io_ring_ctx *ctx = req->ctx;
4006
4007         if (!req->file)
4008                 return -EBADF;
4009
4010         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4011                 return -EINVAL;
4012         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4013                 return -EINVAL;
4014
4015         req->sync.flags = READ_ONCE(sqe->fsync_flags);
4016         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4017                 return -EINVAL;
4018
4019         req->sync.off = READ_ONCE(sqe->off);
4020         req->sync.len = READ_ONCE(sqe->len);
4021         return 0;
4022 }
4023
4024 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4025 {
4026         loff_t end = req->sync.off + req->sync.len;
4027         int ret;
4028
4029         /* fsync always requires a blocking context */
4030         if (issue_flags & IO_URING_F_NONBLOCK)
4031                 return -EAGAIN;
4032
4033         ret = vfs_fsync_range(req->file, req->sync.off,
4034                                 end > 0 ? end : LLONG_MAX,
4035                                 req->sync.flags & IORING_FSYNC_DATASYNC);
4036         if (ret < 0)
4037                 req_set_fail_links(req);
4038         io_req_complete(req, ret);
4039         return 0;
4040 }
4041
4042 static int io_fallocate_prep(struct io_kiocb *req,
4043                              const struct io_uring_sqe *sqe)
4044 {
4045         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
4046                 return -EINVAL;
4047         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4048                 return -EINVAL;
4049
4050         req->sync.off = READ_ONCE(sqe->off);
4051         req->sync.len = READ_ONCE(sqe->addr);
4052         req->sync.mode = READ_ONCE(sqe->len);
4053         return 0;
4054 }
4055
4056 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4057 {
4058         int ret;
4059
4060         /* fallocate always requiring blocking context */
4061         if (issue_flags & IO_URING_F_NONBLOCK)
4062                 return -EAGAIN;
4063         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4064                                 req->sync.len);
4065         if (ret < 0)
4066                 req_set_fail_links(req);
4067         io_req_complete(req, ret);
4068         return 0;
4069 }
4070
4071 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4072 {
4073         const char __user *fname;
4074         int ret;
4075
4076         if (unlikely(sqe->ioprio || sqe->buf_index))
4077                 return -EINVAL;
4078         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4079                 return -EBADF;
4080
4081         /* open.how should be already initialised */
4082         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4083                 req->open.how.flags |= O_LARGEFILE;
4084
4085         req->open.dfd = READ_ONCE(sqe->fd);
4086         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4087         req->open.filename = getname(fname);
4088         if (IS_ERR(req->open.filename)) {
4089                 ret = PTR_ERR(req->open.filename);
4090                 req->open.filename = NULL;
4091                 return ret;
4092         }
4093         req->open.nofile = rlimit(RLIMIT_NOFILE);
4094         req->flags |= REQ_F_NEED_CLEANUP;
4095         return 0;
4096 }
4097
4098 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4099 {
4100         u64 flags, mode;
4101
4102         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4103                 return -EINVAL;
4104         mode = READ_ONCE(sqe->len);
4105         flags = READ_ONCE(sqe->open_flags);
4106         req->open.how = build_open_how(flags, mode);
4107         return __io_openat_prep(req, sqe);
4108 }
4109
4110 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4111 {
4112         struct open_how __user *how;
4113         size_t len;
4114         int ret;
4115
4116         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4117                 return -EINVAL;
4118         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4119         len = READ_ONCE(sqe->len);
4120         if (len < OPEN_HOW_SIZE_VER0)
4121                 return -EINVAL;
4122
4123         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4124                                         len);
4125         if (ret)
4126                 return ret;
4127
4128         return __io_openat_prep(req, sqe);
4129 }
4130
/*
 * Issue a prepared open request.
 *
 * In non-blocking context the lookup is attempted with LOOKUP_CACHED and
 * O_NONBLOCK added; if that yields -EAGAIN (and the application did not ask
 * for RESOLVE_CACHED itself) the reserved fd is released and -EAGAIN is
 * returned so the open is retried. In every other case the request is
 * completed with the new fd or a negative error, the filename is released,
 * and 0 is returned.
 */
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
        struct open_flags op;
        struct file *file;
        bool nonblock_set;
        bool resolve_nonblock;
        int ret;

        ret = build_open_flags(&req->open.how, &op);
        if (ret)
                goto err;
        /* remember what the application asked for before we add our own bits */
        nonblock_set = op.open_flag & O_NONBLOCK;
        resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
        if (issue_flags & IO_URING_F_NONBLOCK) {
                /*
                 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
                 * it'll always -EAGAIN
                 */
                if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
                        return -EAGAIN;
                op.lookup_flags |= LOOKUP_CACHED;
                op.open_flag |= O_NONBLOCK;
        }

        /* reserve the fd first; on success ret holds the fd number */
        ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
        if (ret < 0)
                goto err;

        file = do_filp_open(req->open.dfd, req->open.filename, &op);
        /* only retry if RESOLVE_CACHED wasn't already set by application */
        if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
            file == ERR_PTR(-EAGAIN)) {
                /*
                 * We could hang on to this 'fd', but seems like marginal
                 * gain for something that is now known to be a slower path.
                 * So just put it, and we'll get a new one when we retry.
                 */
                put_unused_fd(ret);
                return -EAGAIN;
        }

        if (IS_ERR(file)) {
                /* open failed: give the fd back and report the error instead */
                put_unused_fd(ret);
                ret = PTR_ERR(file);
        } else {
                /* drop the O_NONBLOCK we added ourselves, if the app didn't set it */
                if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
                        file->f_flags &= ~O_NONBLOCK;
                fsnotify_open(file);
                fd_install(ret, file);
        }
err:
        /* the filename is released here, so no later cleanup is needed */
        putname(req->open.filename);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail_links(req);
        io_req_complete(req, ret);
        return 0;
}
4189
4190 static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
4191 {
4192         return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
4193 }
4194
4195 static int io_remove_buffers_prep(struct io_kiocb *req,
4196                                   const struct io_uring_sqe *sqe)
4197 {
4198         struct io_provide_buf *p = &req->pbuf;
4199         u64 tmp;
4200
4201         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
4202                 return -EINVAL;
4203
4204         tmp = READ_ONCE(sqe->fd);
4205         if (!tmp || tmp > USHRT_MAX)
4206                 return -EINVAL;
4207
4208         memset(p, 0, sizeof(*p));
4209         p->nbufs = tmp;
4210         p->bgid = READ_ONCE(sqe->buf_group);
4211         return 0;
4212 }
4213
4214 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4215                                int bgid, unsigned nbufs)
4216 {
4217         unsigned i = 0;
4218
4219         /* shouldn't happen */
4220         if (!nbufs)
4221                 return 0;
4222
4223         /* the head kbuf is the list itself */
4224         while (!list_empty(&buf->list)) {
4225                 struct io_buffer *nxt;
4226
4227                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4228                 list_del(&nxt->list);
4229                 kfree(nxt);
4230                 if (++i == nbufs)
4231                         return i;
4232         }
4233         i++;
4234         kfree(buf);
4235         idr_remove(&ctx->io_buffer_idr, bgid);
4236
4237         return i;
4238 }
4239
4240 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4241 {
4242         struct io_provide_buf *p = &req->pbuf;
4243         struct io_ring_ctx *ctx = req->ctx;
4244         struct io_buffer *head;
4245         int ret = 0;
4246         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4247
4248         io_ring_submit_lock(ctx, !force_nonblock);
4249
4250         lockdep_assert_held(&ctx->uring_lock);
4251
4252         ret = -ENOENT;
4253         head = idr_find(&ctx->io_buffer_idr, p->bgid);
4254         if (head)
4255                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4256         if (ret < 0)
4257                 req_set_fail_links(req);
4258
4259         /* need to hold the lock to complete IOPOLL requests */
4260         if (ctx->flags & IORING_SETUP_IOPOLL) {
4261                 __io_req_complete(req, issue_flags, ret, 0);
4262                 io_ring_submit_unlock(ctx, !force_nonblock);
4263         } else {
4264                 io_ring_submit_unlock(ctx, !force_nonblock);
4265                 __io_req_complete(req, issue_flags, ret, 0);
4266         }
4267         return 0;
4268 }
4269
4270 static int io_provide_buffers_prep(struct io_kiocb *req,
4271                                    const struct io_uring_sqe *sqe)
4272 {
4273         struct io_provide_buf *p = &req->pbuf;
4274         u64 tmp;
4275
4276         if (sqe->ioprio || sqe->rw_flags)
4277                 return -EINVAL;
4278
4279         tmp = READ_ONCE(sqe->fd);
4280         if (!tmp || tmp > USHRT_MAX)
4281                 return -E2BIG;
4282         p->nbufs = tmp;
4283         p->addr = READ_ONCE(sqe->addr);
4284         p->len = READ_ONCE(sqe->len);
4285
4286         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
4287                 return -EFAULT;
4288
4289         p->bgid = READ_ONCE(sqe->buf_group);
4290         tmp = READ_ONCE(sqe->off);
4291         if (tmp > USHRT_MAX)
4292                 return -E2BIG;
4293         p->bid = tmp;
4294         return 0;
4295 }
4296
4297 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4298 {
4299         struct io_buffer *buf;
4300         u64 addr = pbuf->addr;
4301         int i, bid = pbuf->bid;
4302
4303         for (i = 0; i < pbuf->nbufs; i++) {
4304                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4305                 if (!buf)
4306                         break;
4307
4308                 buf->addr = addr;
4309                 buf->len = pbuf->len;
4310                 buf->bid = bid;
4311                 addr += pbuf->len;
4312                 bid++;
4313                 if (!*head) {
4314                         INIT_LIST_HEAD(&buf->list);
4315                         *head = buf;
4316                 } else {
4317                         list_add_tail(&buf->list, &(*head)->list);
4318                 }
4319         }
4320
4321         return i ? i : -ENOMEM;
4322 }
4323
4324 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4325 {
4326         struct io_provide_buf *p = &req->pbuf;
4327         struct io_ring_ctx *ctx = req->ctx;
4328         struct io_buffer *head, *list;
4329         int ret = 0;
4330         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4331
4332         io_ring_submit_lock(ctx, !force_nonblock);
4333
4334         lockdep_assert_held(&ctx->uring_lock);
4335
4336         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
4337
4338         ret = io_add_buffers(p, &head);
4339         if (ret < 0)
4340                 goto out;
4341
4342         if (!list) {
4343                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
4344                                         GFP_KERNEL);
4345                 if (ret < 0) {
4346                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4347                         goto out;
4348                 }
4349         }
4350 out:
4351         if (ret < 0)
4352                 req_set_fail_links(req);
4353
4354         /* need to hold the lock to complete IOPOLL requests */
4355         if (ctx->flags & IORING_SETUP_IOPOLL) {
4356                 __io_req_complete(req, issue_flags, ret, 0);
4357                 io_ring_submit_unlock(ctx, !force_nonblock);
4358         } else {
4359                 io_ring_submit_unlock(ctx, !force_nonblock);
4360                 __io_req_complete(req, issue_flags, ret, 0);
4361         }
4362         return 0;
4363 }
4364
4365 static int io_epoll_ctl_prep(struct io_kiocb *req,
4366                              const struct io_uring_sqe *sqe)
4367 {
4368 #if defined(CONFIG_EPOLL)
4369         if (sqe->ioprio || sqe->buf_index)
4370                 return -EINVAL;
4371         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4372                 return -EINVAL;
4373
4374         req->epoll.epfd = READ_ONCE(sqe->fd);
4375         req->epoll.op = READ_ONCE(sqe->len);
4376         req->epoll.fd = READ_ONCE(sqe->off);
4377
4378         if (ep_op_has_event(req->epoll.op)) {
4379                 struct epoll_event __user *ev;
4380
4381                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4382                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4383                         return -EFAULT;
4384         }
4385
4386         return 0;
4387 #else
4388         return -EOPNOTSUPP;
4389 #endif
4390 }
4391
4392 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4393 {
4394 #if defined(CONFIG_EPOLL)
4395         struct io_epoll *ie = &req->epoll;
4396         int ret;
4397         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4398
4399         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4400         if (force_nonblock && ret == -EAGAIN)
4401                 return -EAGAIN;
4402
4403         if (ret < 0)
4404                 req_set_fail_links(req);
4405         __io_req_complete(req, issue_flags, ret, 0);
4406         return 0;
4407 #else
4408         return -EOPNOTSUPP;
4409 #endif
4410 }
4411
4412 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4413 {
4414 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4415         if (sqe->ioprio || sqe->buf_index || sqe->off)
4416                 return -EINVAL;
4417         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4418                 return -EINVAL;
4419
4420         req->madvise.addr = READ_ONCE(sqe->addr);
4421         req->madvise.len = READ_ONCE(sqe->len);
4422         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4423         return 0;
4424 #else
4425         return -EOPNOTSUPP;
4426 #endif
4427 }
4428
4429 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4430 {
4431 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4432         struct io_madvise *ma = &req->madvise;
4433         int ret;
4434
4435         if (issue_flags & IO_URING_F_NONBLOCK)
4436                 return -EAGAIN;
4437
4438         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4439         if (ret < 0)
4440                 req_set_fail_links(req);
4441         io_req_complete(req, ret);
4442         return 0;
4443 #else
4444         return -EOPNOTSUPP;
4445 #endif
4446 }
4447
4448 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4449 {
4450         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4451                 return -EINVAL;
4452         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4453                 return -EINVAL;
4454
4455         req->fadvise.offset = READ_ONCE(sqe->off);
4456         req->fadvise.len = READ_ONCE(sqe->len);
4457         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4458         return 0;
4459 }
4460
4461 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4462 {
4463         struct io_fadvise *fa = &req->fadvise;
4464         int ret;
4465
4466         if (issue_flags & IO_URING_F_NONBLOCK) {
4467                 switch (fa->advice) {
4468                 case POSIX_FADV_NORMAL:
4469                 case POSIX_FADV_RANDOM:
4470                 case POSIX_FADV_SEQUENTIAL:
4471                         break;
4472                 default:
4473                         return -EAGAIN;
4474                 }
4475         }
4476
4477         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4478         if (ret < 0)
4479                 req_set_fail_links(req);
4480         io_req_complete(req, ret);
4481         return 0;
4482 }
4483
4484 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4485 {
4486         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4487                 return -EINVAL;
4488         if (sqe->ioprio || sqe->buf_index)
4489                 return -EINVAL;
4490         if (req->flags & REQ_F_FIXED_FILE)
4491                 return -EBADF;
4492
4493         req->statx.dfd = READ_ONCE(sqe->fd);
4494         req->statx.mask = READ_ONCE(sqe->len);
4495         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4496         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4497         req->statx.flags = READ_ONCE(sqe->statx_flags);
4498
4499         return 0;
4500 }
4501
4502 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4503 {
4504         struct io_statx *ctx = &req->statx;
4505         int ret;
4506
4507         if (issue_flags & IO_URING_F_NONBLOCK) {
4508                 /* only need file table for an actual valid fd */
4509                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4510                         req->flags |= REQ_F_NO_FILE_TABLE;
4511                 return -EAGAIN;
4512         }
4513
4514         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4515                        ctx->buffer);
4516
4517         if (ret < 0)
4518                 req_set_fail_links(req);
4519         io_req_complete(req, ret);
4520         return 0;
4521 }
4522
4523 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4524 {
4525         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4526                 return -EINVAL;
4527         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4528             sqe->rw_flags || sqe->buf_index)
4529                 return -EINVAL;
4530         if (req->flags & REQ_F_FIXED_FILE)
4531                 return -EBADF;
4532
4533         req->close.fd = READ_ONCE(sqe->fd);
4534         return 0;
4535 }
4536
4537 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4538 {
4539         struct files_struct *files = current->files;
4540         struct io_close *close = &req->close;
4541         struct fdtable *fdt;
4542         struct file *file;
4543         int ret;
4544
4545         file = NULL;
4546         ret = -EBADF;
4547         spin_lock(&files->file_lock);
4548         fdt = files_fdtable(files);
4549         if (close->fd >= fdt->max_fds) {
4550                 spin_unlock(&files->file_lock);
4551                 goto err;
4552         }
4553         file = fdt->fd[close->fd];
4554         if (!file) {
4555                 spin_unlock(&files->file_lock);
4556                 goto err;
4557         }
4558
4559         if (file->f_op == &io_uring_fops) {
4560                 spin_unlock(&files->file_lock);
4561                 file = NULL;
4562                 goto err;
4563         }
4564
4565         /* if the file has a flush method, be safe and punt to async */
4566         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4567                 spin_unlock(&files->file_lock);
4568                 return -EAGAIN;
4569         }
4570
4571         ret = __close_fd_get_file(close->fd, &file);
4572         spin_unlock(&files->file_lock);
4573         if (ret < 0) {
4574                 if (ret == -ENOENT)
4575                         ret = -EBADF;
4576                 goto err;
4577         }
4578
4579         /* No ->flush() or already async, safely close from here */
4580         ret = filp_close(file, current->files);
4581 err:
4582         if (ret < 0)
4583                 req_set_fail_links(req);
4584         if (file)
4585                 fput(file);
4586         __io_req_complete(req, issue_flags, ret, 0);
4587         return 0;
4588 }
4589
4590 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4591 {
4592         struct io_ring_ctx *ctx = req->ctx;
4593
4594         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4595                 return -EINVAL;
4596         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4597                 return -EINVAL;
4598
4599         req->sync.off = READ_ONCE(sqe->off);
4600         req->sync.len = READ_ONCE(sqe->len);
4601         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4602         return 0;
4603 }
4604
4605 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4606 {
4607         int ret;
4608
4609         /* sync_file_range always requires a blocking context */
4610         if (issue_flags & IO_URING_F_NONBLOCK)
4611                 return -EAGAIN;
4612
4613         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4614                                 req->sync.flags);
4615         if (ret < 0)
4616                 req_set_fail_links(req);
4617         io_req_complete(req, ret);
4618         return 0;
4619 }
4620
4621 #if defined(CONFIG_NET)
4622 static int io_setup_async_msg(struct io_kiocb *req,
4623                               struct io_async_msghdr *kmsg)
4624 {
4625         struct io_async_msghdr *async_msg = req->async_data;
4626
4627         if (async_msg)
4628                 return -EAGAIN;
4629         if (io_alloc_async_data(req)) {
4630                 kfree(kmsg->free_iov);
4631                 return -ENOMEM;
4632         }
4633         async_msg = req->async_data;
4634         req->flags |= REQ_F_NEED_CLEANUP;
4635         memcpy(async_msg, kmsg, sizeof(*kmsg));
4636         async_msg->msg.msg_name = &async_msg->addr;
4637         /* if were using fast_iov, set it to the new one */
4638         if (!async_msg->free_iov)
4639                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4640
4641         return -EAGAIN;
4642 }
4643
4644 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4645                                struct io_async_msghdr *iomsg)
4646 {
4647         iomsg->msg.msg_name = &iomsg->addr;
4648         iomsg->free_iov = iomsg->fast_iov;
4649         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4650                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4651 }
4652
4653 static int io_sendmsg_prep_async(struct io_kiocb *req)
4654 {
4655         int ret;
4656
4657         if (!io_op_defs[req->opcode].needs_async_data)
4658                 return 0;
4659         ret = io_sendmsg_copy_hdr(req, req->async_data);
4660         if (!ret)
4661                 req->flags |= REQ_F_NEED_CLEANUP;
4662         return ret;
4663 }
4664
4665 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4666 {
4667         struct io_sr_msg *sr = &req->sr_msg;
4668
4669         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4670                 return -EINVAL;
4671
4672         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4673         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4674         sr->len = READ_ONCE(sqe->len);
4675
4676 #ifdef CONFIG_COMPAT
4677         if (req->ctx->compat)
4678                 sr->msg_flags |= MSG_CMSG_COMPAT;
4679 #endif
4680         return 0;
4681 }
4682
4683 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4684 {
4685         struct io_async_msghdr iomsg, *kmsg;
4686         struct socket *sock;
4687         unsigned flags;
4688         int ret;
4689
4690         sock = sock_from_file(req->file);
4691         if (unlikely(!sock))
4692                 return -ENOTSOCK;
4693
4694         kmsg = req->async_data;
4695         if (!kmsg) {
4696                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4697                 if (ret)
4698                         return ret;
4699                 kmsg = &iomsg;
4700         }
4701
4702         flags = req->sr_msg.msg_flags;
4703         if (flags & MSG_DONTWAIT)
4704                 req->flags |= REQ_F_NOWAIT;
4705         else if (issue_flags & IO_URING_F_NONBLOCK)
4706                 flags |= MSG_DONTWAIT;
4707
4708         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4709         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4710                 return io_setup_async_msg(req, kmsg);
4711         if (ret == -ERESTARTSYS)
4712                 ret = -EINTR;
4713
4714         /* fast path, check for non-NULL to avoid function call */
4715         if (kmsg->free_iov)
4716                 kfree(kmsg->free_iov);
4717         req->flags &= ~REQ_F_NEED_CLEANUP;
4718         if (ret < 0)
4719                 req_set_fail_links(req);
4720         __io_req_complete(req, issue_flags, ret, 0);
4721         return 0;
4722 }
4723
4724 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4725 {
4726         struct io_sr_msg *sr = &req->sr_msg;
4727         struct msghdr msg;
4728         struct iovec iov;
4729         struct socket *sock;
4730         unsigned flags;
4731         int ret;
4732
4733         sock = sock_from_file(req->file);
4734         if (unlikely(!sock))
4735                 return -ENOTSOCK;
4736
4737         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4738         if (unlikely(ret))
4739                 return ret;
4740
4741         msg.msg_name = NULL;
4742         msg.msg_control = NULL;
4743         msg.msg_controllen = 0;
4744         msg.msg_namelen = 0;
4745
4746         flags = req->sr_msg.msg_flags;
4747         if (flags & MSG_DONTWAIT)
4748                 req->flags |= REQ_F_NOWAIT;
4749         else if (issue_flags & IO_URING_F_NONBLOCK)
4750                 flags |= MSG_DONTWAIT;
4751
4752         msg.msg_flags = flags;
4753         ret = sock_sendmsg(sock, &msg);
4754         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4755                 return -EAGAIN;
4756         if (ret == -ERESTARTSYS)
4757                 ret = -EINTR;
4758
4759         if (ret < 0)
4760                 req_set_fail_links(req);
4761         __io_req_complete(req, issue_flags, ret, 0);
4762         return 0;
4763 }
4764
4765 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4766                                  struct io_async_msghdr *iomsg)
4767 {
4768         struct io_sr_msg *sr = &req->sr_msg;
4769         struct iovec __user *uiov;
4770         size_t iov_len;
4771         int ret;
4772
4773         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4774                                         &iomsg->uaddr, &uiov, &iov_len);
4775         if (ret)
4776                 return ret;
4777
4778         if (req->flags & REQ_F_BUFFER_SELECT) {
4779                 if (iov_len > 1)
4780                         return -EINVAL;
4781                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4782                         return -EFAULT;
4783                 sr->len = iomsg->fast_iov[0].iov_len;
4784                 iomsg->free_iov = NULL;
4785         } else {
4786                 iomsg->free_iov = iomsg->fast_iov;
4787                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4788                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4789                                      false);
4790                 if (ret > 0)
4791                         ret = 0;
4792         }
4793
4794         return ret;
4795 }
4796
4797 #ifdef CONFIG_COMPAT
4798 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4799                                         struct io_async_msghdr *iomsg)
4800 {
4801         struct compat_msghdr __user *msg_compat;
4802         struct io_sr_msg *sr = &req->sr_msg;
4803         struct compat_iovec __user *uiov;
4804         compat_uptr_t ptr;
4805         compat_size_t len;
4806         int ret;
4807
4808         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4809         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4810                                         &ptr, &len);
4811         if (ret)
4812                 return ret;
4813
4814         uiov = compat_ptr(ptr);
4815         if (req->flags & REQ_F_BUFFER_SELECT) {
4816                 compat_ssize_t clen;
4817
4818                 if (len > 1)
4819                         return -EINVAL;
4820                 if (!access_ok(uiov, sizeof(*uiov)))
4821                         return -EFAULT;
4822                 if (__get_user(clen, &uiov->iov_len))
4823                         return -EFAULT;
4824                 if (clen < 0)
4825                         return -EINVAL;
4826                 sr->len = clen;
4827                 iomsg->free_iov = NULL;
4828         } else {
4829                 iomsg->free_iov = iomsg->fast_iov;
4830                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4831                                    UIO_FASTIOV, &iomsg->free_iov,
4832                                    &iomsg->msg.msg_iter, true);
4833                 if (ret < 0)
4834                         return ret;
4835         }
4836
4837         return 0;
4838 }
4839 #endif
4840
4841 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4842                                struct io_async_msghdr *iomsg)
4843 {
4844         iomsg->msg.msg_name = &iomsg->addr;
4845
4846 #ifdef CONFIG_COMPAT
4847         if (req->ctx->compat)
4848                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4849 #endif
4850
4851         return __io_recvmsg_copy_hdr(req, iomsg);
4852 }
4853
4854 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4855                                                bool needs_lock)
4856 {
4857         struct io_sr_msg *sr = &req->sr_msg;
4858         struct io_buffer *kbuf;
4859
4860         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4861         if (IS_ERR(kbuf))
4862                 return kbuf;
4863
4864         sr->kbuf = kbuf;
4865         req->flags |= REQ_F_BUFFER_SELECTED;
4866         return kbuf;
4867 }
4868
4869 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4870 {
4871         return io_put_kbuf(req, req->sr_msg.kbuf);
4872 }
4873
4874 static int io_recvmsg_prep_async(struct io_kiocb *req)
4875 {
4876         int ret;
4877
4878         if (!io_op_defs[req->opcode].needs_async_data)
4879                 return 0;
4880         ret = io_recvmsg_copy_hdr(req, req->async_data);
4881         if (!ret)
4882                 req->flags |= REQ_F_NEED_CLEANUP;
4883         return ret;
4884 }
4885
4886 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4887 {
4888         struct io_sr_msg *sr = &req->sr_msg;
4889
4890         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4891                 return -EINVAL;
4892
4893         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4894         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4895         sr->len = READ_ONCE(sqe->len);
4896         sr->bgid = READ_ONCE(sqe->buf_group);
4897
4898 #ifdef CONFIG_COMPAT
4899         if (req->ctx->compat)
4900                 sr->msg_flags |= MSG_CMSG_COMPAT;
4901 #endif
4902         return 0;
4903 }
4904
4905 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4906 {
4907         struct io_async_msghdr iomsg, *kmsg;
4908         struct socket *sock;
4909         struct io_buffer *kbuf;
4910         unsigned flags;
4911         int ret, cflags = 0;
4912         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4913
4914         sock = sock_from_file(req->file);
4915         if (unlikely(!sock))
4916                 return -ENOTSOCK;
4917
4918         kmsg = req->async_data;
4919         if (!kmsg) {
4920                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4921                 if (ret)
4922                         return ret;
4923                 kmsg = &iomsg;
4924         }
4925
4926         if (req->flags & REQ_F_BUFFER_SELECT) {
4927                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4928                 if (IS_ERR(kbuf))
4929                         return PTR_ERR(kbuf);
4930                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4931                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4932                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4933                                 1, req->sr_msg.len);
4934         }
4935
4936         flags = req->sr_msg.msg_flags;
4937         if (flags & MSG_DONTWAIT)
4938                 req->flags |= REQ_F_NOWAIT;
4939         else if (force_nonblock)
4940                 flags |= MSG_DONTWAIT;
4941
4942         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4943                                         kmsg->uaddr, flags);
4944         if (force_nonblock && ret == -EAGAIN)
4945                 return io_setup_async_msg(req, kmsg);
4946         if (ret == -ERESTARTSYS)
4947                 ret = -EINTR;
4948
4949         if (req->flags & REQ_F_BUFFER_SELECTED)
4950                 cflags = io_put_recv_kbuf(req);
4951         /* fast path, check for non-NULL to avoid function call */
4952         if (kmsg->free_iov)
4953                 kfree(kmsg->free_iov);
4954         req->flags &= ~REQ_F_NEED_CLEANUP;
4955         if (ret < 0)
4956                 req_set_fail_links(req);
4957         __io_req_complete(req, issue_flags, ret, cflags);
4958         return 0;
4959 }
4960
4961 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4962 {
4963         struct io_buffer *kbuf;
4964         struct io_sr_msg *sr = &req->sr_msg;
4965         struct msghdr msg;
4966         void __user *buf = sr->buf;
4967         struct socket *sock;
4968         struct iovec iov;
4969         unsigned flags;
4970         int ret, cflags = 0;
4971         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4972
4973         sock = sock_from_file(req->file);
4974         if (unlikely(!sock))
4975                 return -ENOTSOCK;
4976
4977         if (req->flags & REQ_F_BUFFER_SELECT) {
4978                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4979                 if (IS_ERR(kbuf))
4980                         return PTR_ERR(kbuf);
4981                 buf = u64_to_user_ptr(kbuf->addr);
4982         }
4983
4984         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4985         if (unlikely(ret))
4986                 goto out_free;
4987
4988         msg.msg_name = NULL;
4989         msg.msg_control = NULL;
4990         msg.msg_controllen = 0;
4991         msg.msg_namelen = 0;
4992         msg.msg_iocb = NULL;
4993         msg.msg_flags = 0;
4994
4995         flags = req->sr_msg.msg_flags;
4996         if (flags & MSG_DONTWAIT)
4997                 req->flags |= REQ_F_NOWAIT;
4998         else if (force_nonblock)
4999                 flags |= MSG_DONTWAIT;
5000
5001         ret = sock_recvmsg(sock, &msg, flags);
5002         if (force_nonblock && ret == -EAGAIN)
5003                 return -EAGAIN;
5004         if (ret == -ERESTARTSYS)
5005                 ret = -EINTR;
5006 out_free:
5007         if (req->flags & REQ_F_BUFFER_SELECTED)
5008                 cflags = io_put_recv_kbuf(req);
5009         if (ret < 0)
5010                 req_set_fail_links(req);
5011         __io_req_complete(req, issue_flags, ret, cflags);
5012         return 0;
5013 }
5014
5015 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5016 {
5017         struct io_accept *accept = &req->accept;
5018
5019         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5020                 return -EINVAL;
5021         if (sqe->ioprio || sqe->len || sqe->buf_index)
5022                 return -EINVAL;
5023
5024         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5025         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5026         accept->flags = READ_ONCE(sqe->accept_flags);
5027         accept->nofile = rlimit(RLIMIT_NOFILE);
5028         return 0;
5029 }
5030
5031 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5032 {
5033         struct io_accept *accept = &req->accept;
5034         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5035         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5036         int ret;
5037
5038         if (req->file->f_flags & O_NONBLOCK)
5039                 req->flags |= REQ_F_NOWAIT;
5040
5041         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
5042                                         accept->addr_len, accept->flags,
5043                                         accept->nofile);
5044         if (ret == -EAGAIN && force_nonblock)
5045                 return -EAGAIN;
5046         if (ret < 0) {
5047                 if (ret == -ERESTARTSYS)
5048                         ret = -EINTR;
5049                 req_set_fail_links(req);
5050         }
5051         __io_req_complete(req, issue_flags, ret, 0);
5052         return 0;
5053 }
5054
5055 static int io_connect_prep_async(struct io_kiocb *req)
5056 {
5057         struct io_async_connect *io = req->async_data;
5058         struct io_connect *conn = &req->connect;
5059
5060         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5061 }
5062
5063 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5064 {
5065         struct io_connect *conn = &req->connect;
5066
5067         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5068                 return -EINVAL;
5069         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
5070                 return -EINVAL;
5071
5072         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5073         conn->addr_len =  READ_ONCE(sqe->addr2);
5074         return 0;
5075 }
5076
5077 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5078 {
5079         struct io_async_connect __io, *io;
5080         unsigned file_flags;
5081         int ret;
5082         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5083
5084         if (req->async_data) {
5085                 io = req->async_data;
5086         } else {
5087                 ret = move_addr_to_kernel(req->connect.addr,
5088                                                 req->connect.addr_len,
5089                                                 &__io.address);
5090                 if (ret)
5091                         goto out;
5092                 io = &__io;
5093         }
5094
5095         file_flags = force_nonblock ? O_NONBLOCK : 0;
5096
5097         ret = __sys_connect_file(req->file, &io->address,
5098                                         req->connect.addr_len, file_flags);
5099         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
5100                 if (req->async_data)
5101                         return -EAGAIN;
5102                 if (io_alloc_async_data(req)) {
5103                         ret = -ENOMEM;
5104                         goto out;
5105                 }
5106                 io = req->async_data;
5107                 memcpy(req->async_data, &__io, sizeof(__io));
5108                 return -EAGAIN;
5109         }
5110         if (ret == -ERESTARTSYS)
5111                 ret = -EINTR;
5112 out:
5113         if (ret < 0)
5114                 req_set_fail_links(req);
5115         __io_req_complete(req, issue_flags, ret, 0);
5116         return 0;
5117 }
5118 #else /* !CONFIG_NET */
/* !CONFIG_NET stub: sendmsg prep is not supported. */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
5123
/* !CONFIG_NET stub: sendmsg is not supported. */
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5128
/* !CONFIG_NET stub: send is not supported. */
static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5133
/* !CONFIG_NET stub: recvmsg prep is not supported. */
static int io_recvmsg_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
5139
/* !CONFIG_NET stub: recvmsg is not supported. */
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5144
/* !CONFIG_NET stub: recv is not supported. */
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5149
/* !CONFIG_NET stub: accept prep is not supported. */
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
5154
/* !CONFIG_NET stub: accept is not supported. */
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5159
/* !CONFIG_NET stub: connect prep is not supported. */
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
5164
/* !CONFIG_NET stub: connect is not supported. */
static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
        return -EOPNOTSUPP;
}
5169 #endif /* CONFIG_NET */
5170
/*
 * Poll table passed to vfs_poll() when arming a poll request. The queue
 * callbacks recover this wrapper from the embedded poll_table_struct with
 * container_of().
 */
struct io_poll_table {
        struct poll_table_struct pt;
        /* request being armed */
        struct io_kiocb *req;
        /* queueing outcome: 0 once a wait entry was added, -errno otherwise */
        int error;
};
5176
/*
 * Common wakeup path for poll wait queue entries: detach @poll from its wait
 * queue, record @mask in req->result and queue @func as task_work for @req.
 *
 * Returns 0, having done nothing, if @mask is non-zero but contains none of
 * poll->events. Returns 1 otherwise.
 */
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
                           __poll_t mask, task_work_func_t func)
{
        int ret;

        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;

        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);

        list_del_init(&poll->wait.entry);

        req->result = mask;
        req->task_work.func = func;
        /* the task_work callback drops this ctx reference when it is done */
        percpu_ref_get(&req->ctx->refs);

        /*
         * If this fails, then the task is exiting. When a task exits, the
         * work gets canceled, so just cancel this request as well instead
         * of executing it. We can't safely execute it anyway, as we may not
         * have the state needed for it.
         */
        ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
                WRITE_ONCE(poll->canceled, true);
                io_req_task_work_add_fallback(req, func);
        }
        return 1;
}
5207
/*
 * Re-check for events before a poll request is completed from task_work.
 *
 * If no result is recorded and the poll is not canceled, the file is polled
 * again. ctx->completion_lock is then taken; if there is still nothing to
 * report and the poll is not canceled, @poll is put back on its wait queue.
 *
 * Returns true if the request was re-queued, false if the caller should go
 * on and finish it. The completion_lock is held on return in both cases and
 * the caller must release it.
 */
static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
        __acquires(&req->ctx->completion_lock)
{
        struct io_ring_ctx *ctx = req->ctx;

        /* done before taking the lock: vfs_poll() is called unlocked here */
        if (!req->result && !READ_ONCE(poll->canceled)) {
                struct poll_table_struct pt = { ._key = poll->events };

                req->result = vfs_poll(req->file, &pt) & poll->events;
        }

        spin_lock_irq(&ctx->completion_lock);
        if (!req->result && !READ_ONCE(poll->canceled)) {
                add_wait_queue(poll->head, &poll->wait);
                return true;
        }

        return false;
}
5227
5228 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5229 {
5230         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5231         if (req->opcode == IORING_OP_POLL_ADD)
5232                 return req->async_data;
5233         return req->apoll->double_poll;
5234 }
5235
5236 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5237 {
5238         if (req->opcode == IORING_OP_POLL_ADD)
5239                 return &req->poll;
5240         return &req->apoll->poll;
5241 }
5242
5243 static void io_poll_remove_double(struct io_kiocb *req)
5244 {
5245         struct io_poll_iocb *poll = io_poll_get_double(req);
5246
5247         lockdep_assert_held(&req->ctx->completion_lock);
5248
5249         if (poll && poll->head) {
5250                 struct wait_queue_head *head = poll->head;
5251
5252                 spin_lock(&head->lock);
5253                 list_del_init(&poll->wait.entry);
5254                 if (poll->wait.private)
5255                         refcount_dec(&req->refs);
5256                 poll->head = NULL;
5257                 spin_unlock(&head->lock);
5258         }
5259 }
5260
5261 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
5262 {
5263         struct io_ring_ctx *ctx = req->ctx;
5264
5265         io_poll_remove_double(req);
5266         req->poll.done = true;
5267         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
5268         io_commit_cqring(ctx);
5269 }
5270
/*
 * task_work callback for a pure poll request (queued by io_poll_wake()).
 *
 * If io_poll_rewait() re-queued the request there is nothing to complete.
 * Otherwise the request is removed from the cancel hash, its completion is
 * posted, and any request returned by io_put_req_find_next() is submitted.
 * Always drops the ctx reference taken when the task_work was queued.
 */
static void io_poll_task_func(struct callback_head *cb)
{
        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *nxt;

        /* io_poll_rewait() returns with completion_lock held either way */
        if (io_poll_rewait(req, &req->poll)) {
                spin_unlock_irq(&ctx->completion_lock);
        } else {
                hash_del(&req->hash_node);
                io_poll_complete(req, req->result, 0);
                spin_unlock_irq(&ctx->completion_lock);

                nxt = io_put_req_find_next(req);
                io_cqring_ev_posted(ctx);
                if (nxt)
                        __io_req_task_submit(nxt);
        }

        percpu_ref_put(&ctx->refs);
}
5292
/*
 * Wake callback for the second ("double") poll entry set up by
 * __io_queue_proc().
 *
 * Removes its own wait entry and, if the primary poll entry is still queued,
 * removes that one under the primary head lock and calls the primary's wake
 * function. Then drops a request reference (one is taken when the double
 * entry is set up).
 *
 * Returns 0 if @key carries a non-zero mask that matches none of the primary
 * poll's events, 1 otherwise.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
                               int sync, void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = io_poll_get_single(req);
        __poll_t mask = key_to_poll(key);

        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;

        list_del_init(&wait->entry);

        /*
         * NOTE(review): poll was already dereferenced above, so the NULL
         * test below is not what protects that access.
         */
        if (poll && poll->head) {
                bool done;

                spin_lock(&poll->head->lock);
                /* an empty entry means the primary is no longer queued */
                done = list_empty(&poll->wait.entry);
                if (!done)
                        list_del_init(&poll->wait.entry);
                /* make sure double remove sees this as being gone */
                wait->private = NULL;
                spin_unlock(&poll->head->lock);
                if (!done) {
                        /* use wait func handler, so it matches the rq type */
                        poll->wait.func(&poll->wait, mode, sync, key);
                }
        }
        refcount_dec(&req->refs);
        return 1;
}
5324
5325 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5326                               wait_queue_func_t wake_func)
5327 {
5328         poll->head = NULL;
5329         poll->done = false;
5330         poll->canceled = false;
5331         poll->events = events;
5332         INIT_LIST_HEAD(&poll->wait.entry);
5333         init_waitqueue_func_entry(&poll->wait, wake_func);
5334 }
5335
/*
 * Queue callback shared by pure poll and async poll: add a wait entry for
 * the request in @pt to @head.
 *
 * The first call uses @poll itself. If @poll is already attached to a wait
 * queue head, a second io_poll_iocb is allocated, stored through @poll_ptr,
 * and queued instead; it holds its own request reference. A third call fails.
 *
 * The outcome is reported in pt->error: 0 on success, -EINVAL for a third
 * wait queue, -ENOMEM if the allocation fails.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                            struct wait_queue_head *head,
                            struct io_poll_iocb **poll_ptr)
{
        struct io_kiocb *req = pt->req;

        /*
         * If poll->head is already set, it's because the file being polled
         * uses multiple waitqueues for poll handling (eg one for read, one
         * for write). Setup a separate io_poll_iocb if this happens.
         */
        if (unlikely(poll->head)) {
                struct io_poll_iocb *poll_one = poll;

                /* already have a 2nd entry, fail a third attempt */
                if (*poll_ptr) {
                        pt->error = -EINVAL;
                        return;
                }
                poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                if (!poll) {
                        pt->error = -ENOMEM;
                        return;
                }
                /* the second entry watches the same events as the first */
                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                refcount_inc(&req->refs);
                poll->wait.private = req;
                *poll_ptr = poll;
        }

        pt->error = 0;
        poll->head = head;

        if (poll->events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(head, &poll->wait);
        else
                add_wait_queue(head, &poll->wait);
}
5374
5375 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5376                                struct poll_table_struct *p)
5377 {
5378         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5379         struct async_poll *apoll = pt->req->apoll;
5380
5381         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5382 }
5383
/*
 * task_work callback for a request waiting on async poll (queued by
 * io_async_wake()).
 *
 * If io_poll_rewait() re-queued the request, only the ctx reference is
 * dropped. Otherwise the request is unhashed, its double poll entry removed,
 * and it is either resubmitted or, if the poll was canceled, failed with
 * -ECANCELED. The async_poll data is freed afterwards.
 */
static void io_async_task_func(struct callback_head *cb)
{
        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
        struct async_poll *apoll = req->apoll;
        struct io_ring_ctx *ctx = req->ctx;

        trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);

        /* io_poll_rewait() returns with completion_lock held either way */
        if (io_poll_rewait(req, &apoll->poll)) {
                spin_unlock_irq(&ctx->completion_lock);
                percpu_ref_put(&ctx->refs);
                return;
        }

        /* If req is still hashed, it cannot have been canceled. Don't check. */
        if (hash_hashed(&req->hash_node))
                hash_del(&req->hash_node);

        io_poll_remove_double(req);
        spin_unlock_irq(&ctx->completion_lock);

        if (!READ_ONCE(apoll->poll.canceled))
                __io_req_task_submit(req);
        else
                __io_req_task_cancel(req, -ECANCELED);

        percpu_ref_put(&ctx->refs);
        /* freed via the local pointer saved at the top, not via req */
        kfree(apoll->double_poll);
        kfree(apoll);
}
5414
5415 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5416                         void *key)
5417 {
5418         struct io_kiocb *req = wait->private;
5419         struct io_poll_iocb *poll = &req->apoll->poll;
5420
5421         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5422                                         key_to_poll(key));
5423
5424         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5425 }
5426
5427 static void io_poll_req_insert(struct io_kiocb *req)
5428 {
5429         struct io_ring_ctx *ctx = req->ctx;
5430         struct hlist_head *list;
5431
5432         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5433         hlist_add_head(&req->hash_node, list);
5434 }
5435
/*
 * Arm @poll for @req: initialise it, poll the file once through @ipt (which
 * lets the queue callback attach wait entries) and decide what to do with
 * the outcome.
 *
 * Returns the events from @mask that are already pending, or 0. ipt->error
 * carries the queueing status. ctx->completion_lock is taken here and is
 * still held on return; the caller must release it.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
                                      struct io_poll_iocb *poll,
                                      struct io_poll_table *ipt, __poll_t mask,
                                      wait_queue_func_t wake_func)
        __acquires(&ctx->completion_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool cancel = false;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, wake_func);
        poll->file = req->file;
        poll->wait.private = req;

        ipt->pt._key = mask;
        ipt->req = req;
        /* stays -EINVAL unless the queue callback runs and overwrites it */
        ipt->error = -EINVAL;

        mask = vfs_poll(req->file, &ipt->pt) & poll->events;

        spin_lock_irq(&ctx->completion_lock);
        if (likely(poll->head)) {
                spin_lock(&poll->head->lock);
                /* wait entry already gone: a wakeup ran before we got here */
                if (unlikely(list_empty(&poll->wait.entry))) {
                        if (ipt->error)
                                cancel = true;
                        ipt->error = 0;
                        mask = 0;
                }
                if (mask || ipt->error)
                        list_del_init(&poll->wait.entry);
                else if (cancel)
                        WRITE_ONCE(poll->canceled, true);
                else if (!poll->done) /* actually waiting for an event */
                        io_poll_req_insert(req);
                spin_unlock(&poll->head->lock);
        }

        return mask;
}
5476
/*
 * Try to arm an internal poll handler for @req so that it can be retried
 * once its file signals readiness.
 *
 * Returns false when no poll handler was armed: the file cannot be polled,
 * the request was already polled once, the opcode defines neither pollin nor
 * pollout, the file does not support async for that direction, allocation
 * failed, or arming reported pending events or an error. Returns true when
 * the handler is armed.
 */
static bool io_arm_poll_handler(struct io_kiocb *req)
{
        const struct io_op_def *def = &io_op_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;
        struct async_poll *apoll;
        struct io_poll_table ipt;
        __poll_t mask, ret;
        int rw;

        if (!req->file || !file_can_poll(req->file))
                return false;
        if (req->flags & REQ_F_POLLED)
                return false;
        if (def->pollin)
                rw = READ;
        else if (def->pollout)
                rw = WRITE;
        else
                return false;
        /* if we can't nonblock try, then no point in arming a poll handler */
        if (!io_file_supports_async(req->file, rw))
                return false;

        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
        if (unlikely(!apoll))
                return false;
        apoll->double_poll = NULL;

        req->flags |= REQ_F_POLLED;
        req->apoll = apoll;

        mask = 0;
        if (def->pollin)
                mask |= POLLIN | POLLRDNORM;
        if (def->pollout)
                mask |= POLLOUT | POLLWRNORM;

        /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
        if ((req->opcode == IORING_OP_RECVMSG) &&
            (req->sr_msg.msg_flags & MSG_ERRQUEUE))
                mask &= ~POLLIN;

        mask |= POLLERR | POLLPRI;

        ipt.pt._qproc = io_async_queue_proc;

        /* returns with ctx->completion_lock held */
        ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
                                        io_async_wake);
        if (ret || ipt.error) {
                io_poll_remove_double(req);
                spin_unlock_irq(&ctx->completion_lock);
                /*
                 * NOTE(review): req->apoll still points at the memory freed
                 * below and REQ_F_POLLED stays set; confirm no later path
                 * dereferences req->apoll for this request.
                 */
                kfree(apoll->double_poll);
                kfree(apoll);
                return false;
        }
        spin_unlock_irq(&ctx->completion_lock);
        trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
                                        apoll->poll.events);
        return true;
}
5537
5538 static bool __io_poll_remove_one(struct io_kiocb *req,
5539                                  struct io_poll_iocb *poll)
5540 {
5541         bool do_complete = false;
5542
5543         spin_lock(&poll->head->lock);
5544         WRITE_ONCE(poll->canceled, true);
5545         if (!list_empty(&poll->wait.entry)) {
5546                 list_del_init(&poll->wait.entry);
5547                 do_complete = true;
5548         }
5549         spin_unlock(&poll->head->lock);
5550         hash_del(&req->hash_node);
5551         return do_complete;
5552 }
5553
/*
 * Cancel one poll request, pure or async. The double poll entry is removed
 * first, then the primary one. If the primary entry was still queued, an
 * -ECANCELED completion is posted and the request is put.
 *
 * Returns true if a completion was posted. NOTE(review): callers visible in
 * this file hold ctx->completion_lock; io_poll_remove_double() asserts it.
 */
static bool io_poll_remove_one(struct io_kiocb *req)
{
        bool do_complete;

        io_poll_remove_double(req);

        if (req->opcode == IORING_OP_POLL_ADD) {
                do_complete = __io_poll_remove_one(req, &req->poll);
        } else {
                struct async_poll *apoll = req->apoll;

                /* non-poll requests have submit ref still */
                do_complete = __io_poll_remove_one(req, &apoll->poll);
                if (do_complete) {
                        io_put_req(req);
                        kfree(apoll->double_poll);
                        kfree(apoll);
                }
        }

        if (do_complete) {
                io_cqring_fill_event(req, -ECANCELED);
                io_commit_cqring(req->ctx);
                req_set_fail_links(req);
                io_put_req_deferred(req, 1);
        }

        return do_complete;
}
5583
5584 /*
5585  * Returns true if we found and killed one or more poll requests
5586  */
5587 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5588                                struct files_struct *files)
5589 {
5590         struct hlist_node *tmp;
5591         struct io_kiocb *req;
5592         int posted = 0, i;
5593
5594         spin_lock_irq(&ctx->completion_lock);
5595         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5596                 struct hlist_head *list;
5597
5598                 list = &ctx->cancel_hash[i];
5599                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5600                         if (io_match_task(req, tsk, files))
5601                                 posted += io_poll_remove_one(req);
5602                 }
5603         }
5604         spin_unlock_irq(&ctx->completion_lock);
5605
5606         if (posted)
5607                 io_cqring_ev_posted(ctx);
5608
5609         return posted != 0;
5610 }
5611
5612 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5613 {
5614         struct hlist_head *list;
5615         struct io_kiocb *req;
5616
5617         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5618         hlist_for_each_entry(req, list, hash_node) {
5619                 if (sqe_addr != req->user_data)
5620                         continue;
5621                 if (io_poll_remove_one(req))
5622                         return 0;
5623                 return -EALREADY;
5624         }
5625
5626         return -ENOENT;
5627 }
5628
5629 static int io_poll_remove_prep(struct io_kiocb *req,
5630                                const struct io_uring_sqe *sqe)
5631 {
5632         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5633                 return -EINVAL;
5634         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5635             sqe->poll_events)
5636                 return -EINVAL;
5637
5638         req->poll_remove.addr = READ_ONCE(sqe->addr);
5639         return 0;
5640 }
5641
5642 /*
5643  * Find a running poll command that matches one specified in sqe->addr,
5644  * and remove it if found.
5645  */
5646 static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
5647 {
5648         struct io_ring_ctx *ctx = req->ctx;
5649         int ret;
5650
5651         spin_lock_irq(&ctx->completion_lock);
5652         ret = io_poll_cancel(ctx, req->poll_remove.addr);
5653         spin_unlock_irq(&ctx->completion_lock);
5654
5655         if (ret < 0)
5656                 req_set_fail_links(req);
5657         io_req_complete(req, ret);
5658         return 0;
5659 }
5660
5661 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5662                         void *key)
5663 {
5664         struct io_kiocb *req = wait->private;
5665         struct io_poll_iocb *poll = &req->poll;
5666
5667         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5668 }
5669
5670 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5671                                struct poll_table_struct *p)
5672 {
5673         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5674
5675         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5676 }
5677
5678 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5679 {
5680         struct io_poll_iocb *poll = &req->poll;
5681         u32 events;
5682
5683         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5684                 return -EINVAL;
5685         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5686                 return -EINVAL;
5687
5688         events = READ_ONCE(sqe->poll32_events);
5689 #ifdef __BIG_ENDIAN
5690         events = swahw32(events);
5691 #endif
5692         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5693                        (events & EPOLLEXCLUSIVE);
5694         return 0;
5695 }
5696
/*
 * Issue a pure poll request: arm the poll handler and, if the requested
 * events are already pending, complete the request right away.
 *
 * Returns ipt.error as left by arming, forced to 0 when the request was
 * completed inline.
 */
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_poll_iocb *poll = &req->poll;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_poll_table ipt;
        __poll_t mask;

        ipt.pt._qproc = io_poll_queue_proc;

        /* returns with ctx->completion_lock held */
        mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
                                        io_poll_wake);

        if (mask) { /* no async, we'd stolen it */
                ipt.error = 0;
                io_poll_complete(req, mask, 0);
        }
        spin_unlock_irq(&ctx->completion_lock);

        if (mask) {
                io_cqring_ev_posted(ctx);
                io_put_req(req);
        }
        return ipt.error;
}
5721
/*
 * hrtimer callback for an expired timeout request: unlink it from the
 * timeout list, bump ctx->cq_timeouts, post an -ETIME completion, fail any
 * linked requests and put the request.
 *
 * Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
        struct io_kiocb *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;

        spin_lock_irqsave(&ctx->completion_lock, flags);
        list_del_init(&req->timeout.list);
        /*
         * Plain read followed by set rather than an atomic increment; the
         * update runs under completion_lock. NOTE(review): presumably every
         * writer of cq_timeouts holds that lock - confirm.
         */
        atomic_set(&req->ctx->cq_timeouts,
                atomic_read(&req->ctx->cq_timeouts) + 1);

        io_cqring_fill_event(req, -ETIME);
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);

        io_cqring_ev_posted(ctx);
        req_set_fail_links(req);
        io_put_req(req);
        return HRTIMER_NORESTART;
}
5744
5745 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5746                                            __u64 user_data)
5747 {
5748         struct io_timeout_data *io;
5749         struct io_kiocb *req;
5750         int ret = -ENOENT;
5751
5752         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5753                 if (user_data == req->user_data) {
5754                         ret = 0;
5755                         break;
5756                 }
5757         }
5758
5759         if (ret == -ENOENT)
5760                 return ERR_PTR(ret);
5761
5762         io = req->async_data;
5763         ret = hrtimer_try_to_cancel(&io->timer);
5764         if (ret == -1)
5765                 return ERR_PTR(-EALREADY);
5766         list_del_init(&req->timeout.list);
5767         return req;
5768 }
5769
5770 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5771 {
5772         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5773
5774         if (IS_ERR(req))
5775                 return PTR_ERR(req);
5776
5777         req_set_fail_links(req);
5778         io_cqring_fill_event(req, -ECANCELED);
5779         io_put_req_deferred(req, 1);
5780         return 0;
5781 }
5782
5783 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5784                              struct timespec64 *ts, enum hrtimer_mode mode)
5785 {
5786         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5787         struct io_timeout_data *data;
5788
5789         if (IS_ERR(req))
5790                 return PTR_ERR(req);
5791
5792         req->timeout.off = 0; /* noseq */
5793         data = req->async_data;
5794         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5795         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5796         data->timer.function = io_timeout_fn;
5797         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5798         return 0;
5799 }
5800
5801 static int io_timeout_remove_prep(struct io_kiocb *req,
5802                                   const struct io_uring_sqe *sqe)
5803 {
5804         struct io_timeout_rem *tr = &req->timeout_rem;
5805
5806         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5807                 return -EINVAL;
5808         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5809                 return -EINVAL;
5810         if (sqe->ioprio || sqe->buf_index || sqe->len)
5811                 return -EINVAL;
5812
5813         tr->addr = READ_ONCE(sqe->addr);
5814         tr->flags = READ_ONCE(sqe->timeout_flags);
5815         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5816                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5817                         return -EINVAL;
5818                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5819                         return -EFAULT;
5820         } else if (tr->flags) {
5821                 /* timeout removal doesn't support flags */
5822                 return -EINVAL;
5823         }
5824
5825         return 0;
5826 }
5827
5828 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5829 {
5830         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5831                                             : HRTIMER_MODE_REL;
5832 }
5833
5834 /*
5835  * Remove or update an existing timeout command
5836  */
5837 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5838 {
5839         struct io_timeout_rem *tr = &req->timeout_rem;
5840         struct io_ring_ctx *ctx = req->ctx;
5841         int ret;
5842
5843         spin_lock_irq(&ctx->completion_lock);
5844         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5845                 ret = io_timeout_cancel(ctx, tr->addr);
5846         else
5847                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5848                                         io_translate_timeout_mode(tr->flags));
5849
5850         io_cqring_fill_event(req, ret);
5851         io_commit_cqring(ctx);
5852         spin_unlock_irq(&ctx->completion_lock);
5853         io_cqring_ev_posted(ctx);
5854         if (ret < 0)
5855                 req_set_fail_links(req);
5856         io_put_req(req);
5857         return 0;
5858 }
5859
5860 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5861                            bool is_timeout_link)
5862 {
5863         struct io_timeout_data *data;
5864         unsigned flags;
5865         u32 off = READ_ONCE(sqe->off);
5866
5867         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5868                 return -EINVAL;
5869         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5870                 return -EINVAL;
5871         if (off && is_timeout_link)
5872                 return -EINVAL;
5873         flags = READ_ONCE(sqe->timeout_flags);
5874         if (flags & ~IORING_TIMEOUT_ABS)
5875                 return -EINVAL;
5876
5877         req->timeout.off = off;
5878
5879         if (!req->async_data && io_alloc_async_data(req))
5880                 return -ENOMEM;
5881
5882         data = req->async_data;
5883         data->req = req;
5884
5885         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5886                 return -EFAULT;
5887
5888         data->mode = io_translate_timeout_mode(flags);
5889         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5890         return 0;
5891 }
5892
/*
 * Issue a timeout request: link it into ctx->timeout_list and start its
 * hrtimer, all under ctx->completion_lock. Always returns 0.
 *
 * Timeouts without a sequence (io_is_timeout_noseq()) are appended at the
 * tail of the list. Otherwise target_seq is computed from the cached CQ tail
 * minus cq_timeouts, plus the requested offset, and the request is placed
 * by walking the list backwards.
 */
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_timeout_data *data = req->async_data;
        struct list_head *entry;
        u32 tail, off = req->timeout.off;

        spin_lock_irq(&ctx->completion_lock);

        /*
         * sqe->off holds how many events that need to occur for this
         * timeout event to be satisfied. If it isn't set, then this is
         * a pure timeout request, sequence isn't used.
         */
        if (io_is_timeout_noseq(req)) {
                entry = ctx->timeout_list.prev;
                goto add;
        }

        tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
        req->timeout.target_seq = tail + off;

        /*
         * Update the last seq here in case io_flush_timeouts() hasn't.
         * This is safe because ->completion_lock is held, and submissions
         * and completions are never mixed in the same ->completion_lock section.
         */
        ctx->cq_last_tm_flush = tail;

        /*
         * Insertion sort, ensuring the first entry in the list is always
         * the one we need first.
         */
        list_for_each_prev(entry, &ctx->timeout_list) {
                struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
                                                  timeout.list);

                if (io_is_timeout_noseq(nxt))
                        continue;
                /* nxt.seq is behind @tail, otherwise would've been completed */
                if (off >= nxt->timeout.target_seq - tail)
                        break;
        }
add:
        /* insert right after @entry */
        list_add(&req->timeout.list, entry);
        data->timer.function = io_timeout_fn;
        hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
        spin_unlock_irq(&ctx->completion_lock);
        return 0;
}
5942
5943 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5944 {
5945         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5946
5947         return req->user_data == (unsigned long) data;
5948 }
5949
5950 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5951 {
5952         enum io_wq_cancel cancel_ret;
5953         int ret = 0;
5954
5955         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5956         switch (cancel_ret) {
5957         case IO_WQ_CANCEL_OK:
5958                 ret = 0;
5959                 break;
5960         case IO_WQ_CANCEL_RUNNING:
5961                 ret = -EALREADY;
5962                 break;
5963         case IO_WQ_CANCEL_NOTFOUND:
5964                 ret = -ENOENT;
5965                 break;
5966         }
5967
5968         return ret;
5969 }
5970
5971 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5972                                      struct io_kiocb *req, __u64 sqe_addr,
5973                                      int success_ret)
5974 {
5975         unsigned long flags;
5976         int ret;
5977
5978         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5979         if (ret != -ENOENT) {
5980                 spin_lock_irqsave(&ctx->completion_lock, flags);
5981                 goto done;
5982         }
5983
5984         spin_lock_irqsave(&ctx->completion_lock, flags);
5985         ret = io_timeout_cancel(ctx, sqe_addr);
5986         if (ret != -ENOENT)
5987                 goto done;
5988         ret = io_poll_cancel(ctx, sqe_addr);
5989 done:
5990         if (!ret)
5991                 ret = success_ret;
5992         io_cqring_fill_event(req, ret);
5993         io_commit_cqring(ctx);
5994         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5995         io_cqring_ev_posted(ctx);
5996
5997         if (ret < 0)
5998                 req_set_fail_links(req);
5999         io_put_req(req);
6000 }
6001
6002 static int io_async_cancel_prep(struct io_kiocb *req,
6003                                 const struct io_uring_sqe *sqe)
6004 {
6005         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6006                 return -EINVAL;
6007         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6008                 return -EINVAL;
6009         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
6010                 return -EINVAL;
6011
6012         req->cancel.addr = READ_ONCE(sqe->addr);
6013         return 0;
6014 }
6015
6016 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6017 {
6018         struct io_ring_ctx *ctx = req->ctx;
6019
6020         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
6021         return 0;
6022 }
6023
6024 static int io_rsrc_update_prep(struct io_kiocb *req,
6025                                 const struct io_uring_sqe *sqe)
6026 {
6027         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
6028                 return -EINVAL;
6029         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6030                 return -EINVAL;
6031         if (sqe->ioprio || sqe->rw_flags)
6032                 return -EINVAL;
6033
6034         req->rsrc_update.offset = READ_ONCE(sqe->off);
6035         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6036         if (!req->rsrc_update.nr_args)
6037                 return -EINVAL;
6038         req->rsrc_update.arg = READ_ONCE(sqe->addr);
6039         return 0;
6040 }
6041
6042 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6043 {
6044         struct io_ring_ctx *ctx = req->ctx;
6045         struct io_uring_rsrc_update up;
6046         int ret;
6047
6048         if (issue_flags & IO_URING_F_NONBLOCK)
6049                 return -EAGAIN;
6050
6051         up.offset = req->rsrc_update.offset;
6052         up.data = req->rsrc_update.arg;
6053
6054         mutex_lock(&ctx->uring_lock);
6055         ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
6056         mutex_unlock(&ctx->uring_lock);
6057
6058         if (ret < 0)
6059                 req_set_fail_links(req);
6060         __io_req_complete(req, issue_flags, ret, 0);
6061         return 0;
6062 }
6063
6064 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6065 {
6066         switch (req->opcode) {
6067         case IORING_OP_NOP:
6068                 return 0;
6069         case IORING_OP_READV:
6070         case IORING_OP_READ_FIXED:
6071         case IORING_OP_READ:
6072                 return io_read_prep(req, sqe);
6073         case IORING_OP_WRITEV:
6074         case IORING_OP_WRITE_FIXED:
6075         case IORING_OP_WRITE:
6076                 return io_write_prep(req, sqe);
6077         case IORING_OP_POLL_ADD:
6078                 return io_poll_add_prep(req, sqe);
6079         case IORING_OP_POLL_REMOVE:
6080                 return io_poll_remove_prep(req, sqe);
6081         case IORING_OP_FSYNC:
6082                 return io_fsync_prep(req, sqe);
6083         case IORING_OP_SYNC_FILE_RANGE:
6084                 return io_sfr_prep(req, sqe);
6085         case IORING_OP_SENDMSG:
6086         case IORING_OP_SEND:
6087                 return io_sendmsg_prep(req, sqe);
6088         case IORING_OP_RECVMSG:
6089         case IORING_OP_RECV:
6090                 return io_recvmsg_prep(req, sqe);
6091         case IORING_OP_CONNECT:
6092                 return io_connect_prep(req, sqe);
6093         case IORING_OP_TIMEOUT:
6094                 return io_timeout_prep(req, sqe, false);
6095         case IORING_OP_TIMEOUT_REMOVE:
6096                 return io_timeout_remove_prep(req, sqe);
6097         case IORING_OP_ASYNC_CANCEL:
6098                 return io_async_cancel_prep(req, sqe);
6099         case IORING_OP_LINK_TIMEOUT:
6100                 return io_timeout_prep(req, sqe, true);
6101         case IORING_OP_ACCEPT:
6102                 return io_accept_prep(req, sqe);
6103         case IORING_OP_FALLOCATE:
6104                 return io_fallocate_prep(req, sqe);
6105         case IORING_OP_OPENAT:
6106                 return io_openat_prep(req, sqe);
6107         case IORING_OP_CLOSE:
6108                 return io_close_prep(req, sqe);
6109         case IORING_OP_FILES_UPDATE:
6110                 return io_rsrc_update_prep(req, sqe);
6111         case IORING_OP_STATX:
6112                 return io_statx_prep(req, sqe);
6113         case IORING_OP_FADVISE:
6114                 return io_fadvise_prep(req, sqe);
6115         case IORING_OP_MADVISE:
6116                 return io_madvise_prep(req, sqe);
6117         case IORING_OP_OPENAT2:
6118                 return io_openat2_prep(req, sqe);
6119         case IORING_OP_EPOLL_CTL:
6120                 return io_epoll_ctl_prep(req, sqe);
6121         case IORING_OP_SPLICE:
6122                 return io_splice_prep(req, sqe);
6123         case IORING_OP_PROVIDE_BUFFERS:
6124                 return io_provide_buffers_prep(req, sqe);
6125         case IORING_OP_REMOVE_BUFFERS:
6126                 return io_remove_buffers_prep(req, sqe);
6127         case IORING_OP_TEE:
6128                 return io_tee_prep(req, sqe);
6129         case IORING_OP_SHUTDOWN:
6130                 return io_shutdown_prep(req, sqe);
6131         case IORING_OP_RENAMEAT:
6132                 return io_renameat_prep(req, sqe);
6133         case IORING_OP_UNLINKAT:
6134                 return io_unlinkat_prep(req, sqe);
6135         }
6136
6137         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6138                         req->opcode);
6139         return-EINVAL;
6140 }
6141
6142 static int io_req_prep_async(struct io_kiocb *req)
6143 {
6144         switch (req->opcode) {
6145         case IORING_OP_READV:
6146         case IORING_OP_READ_FIXED:
6147         case IORING_OP_READ:
6148                 return io_rw_prep_async(req, READ);
6149         case IORING_OP_WRITEV:
6150         case IORING_OP_WRITE_FIXED:
6151         case IORING_OP_WRITE:
6152                 return io_rw_prep_async(req, WRITE);
6153         case IORING_OP_SENDMSG:
6154         case IORING_OP_SEND:
6155                 return io_sendmsg_prep_async(req);
6156         case IORING_OP_RECVMSG:
6157         case IORING_OP_RECV:
6158                 return io_recvmsg_prep_async(req);
6159         case IORING_OP_CONNECT:
6160                 return io_connect_prep_async(req);
6161         }
6162         return 0;
6163 }
6164
6165 static int io_req_defer_prep(struct io_kiocb *req)
6166 {
6167         if (!io_op_defs[req->opcode].needs_async_data)
6168                 return 0;
6169         /* some opcodes init it during the inital prep */
6170         if (req->async_data)
6171                 return 0;
6172         if (__io_alloc_async_data(req))
6173                 return -EAGAIN;
6174         return io_req_prep_async(req);
6175 }
6176
6177 static u32 io_get_sequence(struct io_kiocb *req)
6178 {
6179         struct io_kiocb *pos;
6180         struct io_ring_ctx *ctx = req->ctx;
6181         u32 total_submitted, nr_reqs = 0;
6182
6183         io_for_each_link(pos, req)
6184                 nr_reqs++;
6185
6186         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
6187         return total_submitted - nr_reqs;
6188 }
6189
6190 static int io_req_defer(struct io_kiocb *req)
6191 {
6192         struct io_ring_ctx *ctx = req->ctx;
6193         struct io_defer_entry *de;
6194         int ret;
6195         u32 seq;
6196
6197         /* Still need defer if there is pending req in defer list. */
6198         if (likely(list_empty_careful(&ctx->defer_list) &&
6199                 !(req->flags & REQ_F_IO_DRAIN)))
6200                 return 0;
6201
6202         seq = io_get_sequence(req);
6203         /* Still a chance to pass the sequence check */
6204         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
6205                 return 0;
6206
6207         ret = io_req_defer_prep(req);
6208         if (ret)
6209                 return ret;
6210         io_prep_async_link(req);
6211         de = kmalloc(sizeof(*de), GFP_KERNEL);
6212         if (!de)
6213                 return -ENOMEM;
6214
6215         spin_lock_irq(&ctx->completion_lock);
6216         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
6217                 spin_unlock_irq(&ctx->completion_lock);
6218                 kfree(de);
6219                 io_queue_async_work(req);
6220                 return -EIOCBQUEUED;
6221         }
6222
6223         trace_io_uring_defer(ctx, req, req->user_data);
6224         de->req = req;
6225         de->seq = seq;
6226         list_add_tail(&de->list, &ctx->defer_list);
6227         spin_unlock_irq(&ctx->completion_lock);
6228         return -EIOCBQUEUED;
6229 }
6230
6231 static void __io_clean_op(struct io_kiocb *req)
6232 {
6233         if (req->flags & REQ_F_BUFFER_SELECTED) {
6234                 switch (req->opcode) {
6235                 case IORING_OP_READV:
6236                 case IORING_OP_READ_FIXED:
6237                 case IORING_OP_READ:
6238                         kfree((void *)(unsigned long)req->rw.addr);
6239                         break;
6240                 case IORING_OP_RECVMSG:
6241                 case IORING_OP_RECV:
6242                         kfree(req->sr_msg.kbuf);
6243                         break;
6244                 }
6245                 req->flags &= ~REQ_F_BUFFER_SELECTED;
6246         }
6247
6248         if (req->flags & REQ_F_NEED_CLEANUP) {
6249                 switch (req->opcode) {
6250                 case IORING_OP_READV:
6251                 case IORING_OP_READ_FIXED:
6252                 case IORING_OP_READ:
6253                 case IORING_OP_WRITEV:
6254                 case IORING_OP_WRITE_FIXED:
6255                 case IORING_OP_WRITE: {
6256                         struct io_async_rw *io = req->async_data;
6257                         if (io->free_iovec)
6258                                 kfree(io->free_iovec);
6259                         break;
6260                         }
6261                 case IORING_OP_RECVMSG:
6262                 case IORING_OP_SENDMSG: {
6263                         struct io_async_msghdr *io = req->async_data;
6264
6265                         kfree(io->free_iov);
6266                         break;
6267                         }
6268                 case IORING_OP_SPLICE:
6269                 case IORING_OP_TEE:
6270                         io_put_file(req, req->splice.file_in,
6271                                     (req->splice.flags & SPLICE_F_FD_IN_FIXED));
6272                         break;
6273                 case IORING_OP_OPENAT:
6274                 case IORING_OP_OPENAT2:
6275                         if (req->open.filename)
6276                                 putname(req->open.filename);
6277                         break;
6278                 case IORING_OP_RENAMEAT:
6279                         putname(req->rename.oldpath);
6280                         putname(req->rename.newpath);
6281                         break;
6282                 case IORING_OP_UNLINKAT:
6283                         putname(req->unlink.filename);
6284                         break;
6285                 }
6286                 req->flags &= ~REQ_F_NEED_CLEANUP;
6287         }
6288 }
6289
6290 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6291 {
6292         struct io_ring_ctx *ctx = req->ctx;
6293         int ret;
6294
6295         switch (req->opcode) {
6296         case IORING_OP_NOP:
6297                 ret = io_nop(req, issue_flags);
6298                 break;
6299         case IORING_OP_READV:
6300         case IORING_OP_READ_FIXED:
6301         case IORING_OP_READ:
6302                 ret = io_read(req, issue_flags);
6303                 break;
6304         case IORING_OP_WRITEV:
6305         case IORING_OP_WRITE_FIXED:
6306         case IORING_OP_WRITE:
6307                 ret = io_write(req, issue_flags);
6308                 break;
6309         case IORING_OP_FSYNC:
6310                 ret = io_fsync(req, issue_flags);
6311                 break;
6312         case IORING_OP_POLL_ADD:
6313                 ret = io_poll_add(req, issue_flags);
6314                 break;
6315         case IORING_OP_POLL_REMOVE:
6316                 ret = io_poll_remove(req, issue_flags);
6317                 break;
6318         case IORING_OP_SYNC_FILE_RANGE:
6319                 ret = io_sync_file_range(req, issue_flags);
6320                 break;
6321         case IORING_OP_SENDMSG:
6322                 ret = io_sendmsg(req, issue_flags);
6323                 break;
6324         case IORING_OP_SEND:
6325                 ret = io_send(req, issue_flags);
6326                 break;
6327         case IORING_OP_RECVMSG:
6328                 ret = io_recvmsg(req, issue_flags);
6329                 break;
6330         case IORING_OP_RECV:
6331                 ret = io_recv(req, issue_flags);
6332                 break;
6333         case IORING_OP_TIMEOUT:
6334                 ret = io_timeout(req, issue_flags);
6335                 break;
6336         case IORING_OP_TIMEOUT_REMOVE:
6337                 ret = io_timeout_remove(req, issue_flags);
6338                 break;
6339         case IORING_OP_ACCEPT:
6340                 ret = io_accept(req, issue_flags);
6341                 break;
6342         case IORING_OP_CONNECT:
6343                 ret = io_connect(req, issue_flags);
6344                 break;
6345         case IORING_OP_ASYNC_CANCEL:
6346                 ret = io_async_cancel(req, issue_flags);
6347                 break;
6348         case IORING_OP_FALLOCATE:
6349                 ret = io_fallocate(req, issue_flags);
6350                 break;
6351         case IORING_OP_OPENAT:
6352                 ret = io_openat(req, issue_flags);
6353                 break;
6354         case IORING_OP_CLOSE:
6355                 ret = io_close(req, issue_flags);
6356                 break;
6357         case IORING_OP_FILES_UPDATE:
6358                 ret = io_files_update(req, issue_flags);
6359                 break;
6360         case IORING_OP_STATX:
6361                 ret = io_statx(req, issue_flags);
6362                 break;
6363         case IORING_OP_FADVISE:
6364                 ret = io_fadvise(req, issue_flags);
6365                 break;
6366         case IORING_OP_MADVISE:
6367                 ret = io_madvise(req, issue_flags);
6368                 break;
6369         case IORING_OP_OPENAT2:
6370                 ret = io_openat2(req, issue_flags);
6371                 break;
6372         case IORING_OP_EPOLL_CTL:
6373                 ret = io_epoll_ctl(req, issue_flags);
6374                 break;
6375         case IORING_OP_SPLICE:
6376                 ret = io_splice(req, issue_flags);
6377                 break;
6378         case IORING_OP_PROVIDE_BUFFERS:
6379                 ret = io_provide_buffers(req, issue_flags);
6380                 break;
6381         case IORING_OP_REMOVE_BUFFERS:
6382                 ret = io_remove_buffers(req, issue_flags);
6383                 break;
6384         case IORING_OP_TEE:
6385                 ret = io_tee(req, issue_flags);
6386                 break;
6387         case IORING_OP_SHUTDOWN:
6388                 ret = io_shutdown(req, issue_flags);
6389                 break;
6390         case IORING_OP_RENAMEAT:
6391                 ret = io_renameat(req, issue_flags);
6392                 break;
6393         case IORING_OP_UNLINKAT:
6394                 ret = io_unlinkat(req, issue_flags);
6395                 break;
6396         default:
6397                 ret = -EINVAL;
6398                 break;
6399         }
6400
6401         if (ret)
6402                 return ret;
6403
6404         /* If the op doesn't have a file, we're not polling for it */
6405         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6406                 const bool in_async = io_wq_current_is_worker();
6407
6408                 /* workqueue context doesn't hold uring_lock, grab it now */
6409                 if (in_async)
6410                         mutex_lock(&ctx->uring_lock);
6411
6412                 io_iopoll_req_issued(req, in_async);
6413
6414                 if (in_async)
6415                         mutex_unlock(&ctx->uring_lock);
6416         }
6417
6418         return 0;
6419 }
6420
6421 static void io_wq_submit_work(struct io_wq_work *work)
6422 {
6423         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6424         struct io_kiocb *timeout;
6425         int ret = 0;
6426
6427         timeout = io_prep_linked_timeout(req);
6428         if (timeout)
6429                 io_queue_linked_timeout(timeout);
6430
6431         if (work->flags & IO_WQ_WORK_CANCEL) {
6432                 /* io-wq is going to take down one */
6433                 refcount_inc(&req->refs);
6434                 percpu_ref_get(&req->ctx->refs);
6435                 io_req_task_work_add_fallback(req, io_req_task_cancel);
6436                 return;
6437         }
6438
6439         if (!ret) {
6440                 do {
6441                         ret = io_issue_sqe(req, 0);
6442                         /*
6443                          * We can get EAGAIN for polled IO even though we're
6444                          * forcing a sync submission from here, since we can't
6445                          * wait for request slots on the block side.
6446                          */
6447                         if (ret != -EAGAIN)
6448                                 break;
6449                         cond_resched();
6450                 } while (1);
6451         }
6452
6453         if (ret) {
6454                 struct io_ring_ctx *lock_ctx = NULL;
6455
6456                 if (req->ctx->flags & IORING_SETUP_IOPOLL)
6457                         lock_ctx = req->ctx;
6458
6459                 /*
6460                  * io_iopoll_complete() does not hold completion_lock to
6461                  * complete polled io, so here for polled io, we can not call
6462                  * io_req_complete() directly, otherwise there maybe concurrent
6463                  * access to cqring, defer_list, etc, which is not safe. Given
6464                  * that io_iopoll_complete() is always called under uring_lock,
6465                  * so here for polled io, we also get uring_lock to complete
6466                  * it.
6467                  */
6468                 if (lock_ctx)
6469                         mutex_lock(&lock_ctx->uring_lock);
6470
6471                 req_set_fail_links(req);
6472                 io_req_complete(req, ret);
6473
6474                 if (lock_ctx)
6475                         mutex_unlock(&lock_ctx->uring_lock);
6476         }
6477 }
6478
6479 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6480                                               int index)
6481 {
6482         struct fixed_rsrc_table *table;
6483
6484         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6485         return table->files[index & IORING_FILE_TABLE_MASK];
6486 }
6487
6488 static struct file *io_file_get(struct io_submit_state *state,
6489                                 struct io_kiocb *req, int fd, bool fixed)
6490 {
6491         struct io_ring_ctx *ctx = req->ctx;
6492         struct file *file;
6493
6494         if (fixed) {
6495                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6496                         return NULL;
6497                 fd = array_index_nospec(fd, ctx->nr_user_files);
6498                 file = io_file_from_index(ctx, fd);
6499                 io_set_resource_node(req);
6500         } else {
6501                 trace_io_uring_file_get(ctx, fd);
6502                 file = __io_file_get(state, fd);
6503         }
6504
6505         if (file && unlikely(file->f_op == &io_uring_fops))
6506                 io_req_track_inflight(req);
6507         return file;
6508 }
6509
6510 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6511 {
6512         struct io_timeout_data *data = container_of(timer,
6513                                                 struct io_timeout_data, timer);
6514         struct io_kiocb *prev, *req = data->req;
6515         struct io_ring_ctx *ctx = req->ctx;
6516         unsigned long flags;
6517
6518         spin_lock_irqsave(&ctx->completion_lock, flags);
6519         prev = req->timeout.head;
6520         req->timeout.head = NULL;
6521
6522         /*
6523          * We don't expect the list to be empty, that will only happen if we
6524          * race with the completion of the linked work.
6525          */
6526         if (prev && refcount_inc_not_zero(&prev->refs))
6527                 io_remove_next_linked(prev);
6528         else
6529                 prev = NULL;
6530         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6531
6532         if (prev) {
6533                 req_set_fail_links(prev);
6534                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6535                 io_put_req_deferred(prev, 1);
6536         } else {
6537                 io_req_complete_post(req, -ETIME, 0);
6538                 io_put_req_deferred(req, 1);
6539         }
6540         return HRTIMER_NORESTART;
6541 }
6542
6543 static void __io_queue_linked_timeout(struct io_kiocb *req)
6544 {
6545         /*
6546          * If the back reference is NULL, then our linked request finished
6547          * before we got a chance to setup the timer
6548          */
6549         if (req->timeout.head) {
6550                 struct io_timeout_data *data = req->async_data;
6551
6552                 data->timer.function = io_link_timeout_fn;
6553                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6554                                 data->mode);
6555         }
6556 }
6557
6558 static void io_queue_linked_timeout(struct io_kiocb *req)
6559 {
6560         struct io_ring_ctx *ctx = req->ctx;
6561
6562         spin_lock_irq(&ctx->completion_lock);
6563         __io_queue_linked_timeout(req);
6564         spin_unlock_irq(&ctx->completion_lock);
6565
6566         /* drop submission reference */
6567         io_put_req(req);
6568 }
6569
6570 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6571 {
6572         struct io_kiocb *nxt = req->link;
6573
6574         if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6575             nxt->opcode != IORING_OP_LINK_TIMEOUT)
6576                 return NULL;
6577
6578         nxt->timeout.head = req;
6579         nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6580         req->flags |= REQ_F_LINK_TIMEOUT;
6581         return nxt;
6582 }
6583
6584 static void __io_queue_sqe(struct io_kiocb *req)
6585 {
6586         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6587         const struct cred *old_creds = NULL;
6588         int ret;
6589
6590         if ((req->flags & REQ_F_WORK_INITIALIZED) &&
6591             (req->work.flags & IO_WQ_WORK_CREDS) &&
6592             req->work.identity->creds != current_cred())
6593                 old_creds = override_creds(req->work.identity->creds);
6594
6595         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6596
6597         if (old_creds)
6598                 revert_creds(old_creds);
6599
6600         /*
6601          * We async punt it if the file wasn't marked NOWAIT, or if the file
6602          * doesn't support non-blocking read/write attempts
6603          */
6604         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6605                 if (!io_arm_poll_handler(req)) {
6606                         /*
6607                          * Queued up for async execution, worker will release
6608                          * submit reference when the iocb is actually submitted.
6609                          */
6610                         io_queue_async_work(req);
6611                 }
6612         } else if (likely(!ret)) {
6613                 /* drop submission reference */
6614                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6615                         struct io_ring_ctx *ctx = req->ctx;
6616                         struct io_comp_state *cs = &ctx->submit_state.comp;
6617
6618                         cs->reqs[cs->nr++] = req;
6619                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6620                                 io_submit_flush_completions(cs, ctx);
6621                 } else {
6622                         io_put_req(req);
6623                 }
6624         } else {
6625                 req_set_fail_links(req);
6626                 io_put_req(req);
6627                 io_req_complete(req, ret);
6628         }
6629         if (linked_timeout)
6630                 io_queue_linked_timeout(linked_timeout);
6631 }
6632
6633 static void io_queue_sqe(struct io_kiocb *req)
6634 {
6635         int ret;
6636
6637         ret = io_req_defer(req);
6638         if (ret) {
6639                 if (ret != -EIOCBQUEUED) {
6640 fail_req:
6641                         req_set_fail_links(req);
6642                         io_put_req(req);
6643                         io_req_complete(req, ret);
6644                 }
6645         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6646                 ret = io_req_defer_prep(req);
6647                 if (unlikely(ret))
6648                         goto fail_req;
6649                 io_queue_async_work(req);
6650         } else {
6651                 __io_queue_sqe(req);
6652         }
6653 }
6654
6655 /*
6656  * Check SQE restrictions (opcode and flags).
6657  *
6658  * Returns 'true' if SQE is allowed, 'false' otherwise.
6659  */
6660 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6661                                         struct io_kiocb *req,
6662                                         unsigned int sqe_flags)
6663 {
6664         if (!ctx->restricted)
6665                 return true;
6666
6667         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6668                 return false;
6669
6670         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6671             ctx->restrictions.sqe_flags_required)
6672                 return false;
6673
6674         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6675                           ctx->restrictions.sqe_flags_required))
6676                 return false;
6677
6678         return true;
6679 }
6680
6681 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6682                        const struct io_uring_sqe *sqe)
6683 {
6684         struct io_submit_state *state;
6685         unsigned int sqe_flags;
6686         int id, ret = 0;
6687
6688         req->opcode = READ_ONCE(sqe->opcode);
6689         /* same numerical values with corresponding REQ_F_*, safe to copy */
6690         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6691         req->user_data = READ_ONCE(sqe->user_data);
6692         req->async_data = NULL;
6693         req->file = NULL;
6694         req->ctx = ctx;
6695         req->link = NULL;
6696         req->fixed_rsrc_refs = NULL;
6697         /* one is dropped after submission, the other at completion */
6698         refcount_set(&req->refs, 2);
6699         req->task = current;
6700         req->result = 0;
6701
6702         /* enforce forwards compatibility on users */
6703         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6704                 return -EINVAL;
6705
6706         if (unlikely(req->opcode >= IORING_OP_LAST))
6707                 return -EINVAL;
6708
6709         if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
6710                 return -EFAULT;
6711
6712         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6713                 return -EACCES;
6714
6715         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6716             !io_op_defs[req->opcode].buffer_select)
6717                 return -EOPNOTSUPP;
6718
6719         id = READ_ONCE(sqe->personality);
6720         if (id) {
6721                 struct io_identity *iod;
6722
6723                 iod = idr_find(&ctx->personality_idr, id);
6724                 if (unlikely(!iod))
6725                         return -EINVAL;
6726                 refcount_inc(&iod->count);
6727
6728                 __io_req_init_async(req);
6729                 get_cred(iod->creds);
6730                 req->work.identity = iod;
6731                 req->work.flags |= IO_WQ_WORK_CREDS;
6732         }
6733
6734         state = &ctx->submit_state;
6735
6736         /*
6737          * Plug now if we have more than 1 IO left after this, and the target
6738          * is potentially a read/write to block based storage.
6739          */
6740         if (!state->plug_started && state->ios_left > 1 &&
6741             io_op_defs[req->opcode].plug) {
6742                 blk_start_plug(&state->plug);
6743                 state->plug_started = true;
6744         }
6745
6746         if (io_op_defs[req->opcode].needs_file) {
6747                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6748
6749                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6750                 if (unlikely(!req->file))
6751                         ret = -EBADF;
6752         }
6753
6754         state->ios_left--;
6755         return ret;
6756 }
6757
6758 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6759                          const struct io_uring_sqe *sqe)
6760 {
6761         struct io_submit_link *link = &ctx->submit_state.link;
6762         int ret;
6763
6764         ret = io_init_req(ctx, req, sqe);
6765         if (unlikely(ret)) {
6766 fail_req:
6767                 io_put_req(req);
6768                 io_req_complete(req, ret);
6769                 if (link->head) {
6770                         /* fail even hard links since we don't submit */
6771                         link->head->flags |= REQ_F_FAIL_LINK;
6772                         io_put_req(link->head);
6773                         io_req_complete(link->head, -ECANCELED);
6774                         link->head = NULL;
6775                 }
6776                 return ret;
6777         }
6778         ret = io_req_prep(req, sqe);
6779         if (unlikely(ret))
6780                 goto fail_req;
6781
6782         /* don't need @sqe from now on */
6783         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6784                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6785
6786         /*
6787          * If we already have a head request, queue this one for async
6788          * submittal once the head completes. If we don't have a head but
6789          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6790          * submitted sync once the chain is complete. If none of those
6791          * conditions are true (normal request), then just queue it.
6792          */
6793         if (link->head) {
6794                 struct io_kiocb *head = link->head;
6795
6796                 /*
6797                  * Taking sequential execution of a link, draining both sides
6798                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6799                  * requests in the link. So, it drains the head and the
6800                  * next after the link request. The last one is done via
6801                  * drain_next flag to persist the effect across calls.
6802                  */
6803                 if (req->flags & REQ_F_IO_DRAIN) {
6804                         head->flags |= REQ_F_IO_DRAIN;
6805                         ctx->drain_next = 1;
6806                 }
6807                 ret = io_req_defer_prep(req);
6808                 if (unlikely(ret))
6809                         goto fail_req;
6810                 trace_io_uring_link(ctx, req, head);
6811                 link->last->link = req;
6812                 link->last = req;
6813
6814                 /* last request of a link, enqueue the link */
6815                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6816                         io_queue_sqe(head);
6817                         link->head = NULL;
6818                 }
6819         } else {
6820                 if (unlikely(ctx->drain_next)) {
6821                         req->flags |= REQ_F_IO_DRAIN;
6822                         ctx->drain_next = 0;
6823                 }
6824                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6825                         link->head = req;
6826                         link->last = req;
6827                 } else {
6828                         io_queue_sqe(req);
6829                 }
6830         }
6831
6832         return 0;
6833 }
6834
6835 /*
6836  * Batched submission is done, ensure local IO is flushed out.
6837  */
6838 static void io_submit_state_end(struct io_submit_state *state,
6839                                 struct io_ring_ctx *ctx)
6840 {
6841         if (state->link.head)
6842                 io_queue_sqe(state->link.head);
6843         if (state->comp.nr)
6844                 io_submit_flush_completions(&state->comp, ctx);
6845         if (state->plug_started)
6846                 blk_finish_plug(&state->plug);
6847         io_state_file_put(state);
6848 }
6849
6850 /*
6851  * Start submission side cache.
6852  */
6853 static void io_submit_state_start(struct io_submit_state *state,
6854                                   unsigned int max_ios)
6855 {
6856         state->plug_started = false;
6857         state->ios_left = max_ios;
6858         /* set only head, no need to init link_last in advance */
6859         state->link.head = NULL;
6860 }
6861
6862 static void io_commit_sqring(struct io_ring_ctx *ctx)
6863 {
6864         struct io_rings *rings = ctx->rings;
6865
6866         /*
6867          * Ensure any loads from the SQEs are done at this point,
6868          * since once we write the new head, the application could
6869          * write new data to them.
6870          */
6871         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6872 }
6873
6874 /*
6875  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6876  * that is mapped by userspace. This means that care needs to be taken to
6877  * ensure that reads are stable, as we cannot rely on userspace always
6878  * being a good citizen. If members of the sqe are validated and then later
6879  * used, it's important that those reads are done through READ_ONCE() to
6880  * prevent a re-load down the line.
6881  */
6882 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6883 {
6884         u32 *sq_array = ctx->sq_array;
6885         unsigned head;
6886
6887         /*
6888          * The cached sq head (or cq tail) serves two purposes:
6889          *
6890          * 1) allows us to batch the cost of updating the user visible
6891          *    head updates.
6892          * 2) allows the kernel side to track the head on its own, even
6893          *    though the application is the one updating it.
6894          */
6895         head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
6896         if (likely(head < ctx->sq_entries))
6897                 return &ctx->sq_sqes[head];
6898
6899         /* drop invalid entries */
6900         ctx->cached_sq_dropped++;
6901         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6902         return NULL;
6903 }
6904
6905 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6906 {
6907         int submitted = 0;
6908
6909         /* if we have a backlog and couldn't flush it all, return BUSY */
6910         if (test_bit(0, &ctx->sq_check_overflow)) {
6911                 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
6912                         return -EBUSY;
6913         }
6914
6915         /* make sure SQ entry isn't read before tail */
6916         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6917
6918         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6919                 return -EAGAIN;
6920
6921         percpu_counter_add(&current->io_uring->inflight, nr);
6922         refcount_add(nr, &current->usage);
6923         io_submit_state_start(&ctx->submit_state, nr);
6924
6925         while (submitted < nr) {
6926                 const struct io_uring_sqe *sqe;
6927                 struct io_kiocb *req;
6928
6929                 req = io_alloc_req(ctx);
6930                 if (unlikely(!req)) {
6931                         if (!submitted)
6932                                 submitted = -EAGAIN;
6933                         break;
6934                 }
6935                 sqe = io_get_sqe(ctx);
6936                 if (unlikely(!sqe)) {
6937                         kmem_cache_free(req_cachep, req);
6938                         break;
6939                 }
6940                 /* will complete beyond this point, count as submitted */
6941                 submitted++;
6942                 if (io_submit_sqe(ctx, req, sqe))
6943                         break;
6944         }
6945
6946         if (unlikely(submitted != nr)) {
6947                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6948                 struct io_uring_task *tctx = current->io_uring;
6949                 int unused = nr - ref_used;
6950
6951                 percpu_ref_put_many(&ctx->refs, unused);
6952                 percpu_counter_sub(&tctx->inflight, unused);
6953                 put_task_struct_many(current, unused);
6954         }
6955
6956         io_submit_state_end(&ctx->submit_state, ctx);
6957          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6958         io_commit_sqring(ctx);
6959
6960         return submitted;
6961 }
6962
6963 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6964 {
6965         /* Tell userspace we may need a wakeup call */
6966         spin_lock_irq(&ctx->completion_lock);
6967         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6968         spin_unlock_irq(&ctx->completion_lock);
6969 }
6970
6971 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6972 {
6973         spin_lock_irq(&ctx->completion_lock);
6974         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6975         spin_unlock_irq(&ctx->completion_lock);
6976 }
6977
6978 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6979 {
6980         unsigned int to_submit;
6981         int ret = 0;
6982
6983         to_submit = io_sqring_entries(ctx);
6984         /* if we're handling multiple rings, cap submit size for fairness */
6985         if (cap_entries && to_submit > 8)
6986                 to_submit = 8;
6987
6988         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6989                 unsigned nr_events = 0;
6990
6991                 mutex_lock(&ctx->uring_lock);
6992                 if (!list_empty(&ctx->iopoll_list))
6993                         io_do_iopoll(ctx, &nr_events, 0);
6994
6995                 if (to_submit && !ctx->sqo_dead &&
6996                     likely(!percpu_ref_is_dying(&ctx->refs)))
6997                         ret = io_submit_sqes(ctx, to_submit);
6998                 mutex_unlock(&ctx->uring_lock);
6999         }
7000
7001         if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
7002                 wake_up(&ctx->sqo_sq_wait);
7003
7004         return ret;
7005 }
7006
7007 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7008 {
7009         struct io_ring_ctx *ctx;
7010         unsigned sq_thread_idle = 0;
7011
7012         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7013                 if (sq_thread_idle < ctx->sq_thread_idle)
7014                         sq_thread_idle = ctx->sq_thread_idle;
7015         }
7016
7017         sqd->sq_thread_idle = sq_thread_idle;
7018 }
7019
7020 static void io_sqd_init_new(struct io_sq_data *sqd)
7021 {
7022         struct io_ring_ctx *ctx;
7023
7024         while (!list_empty(&sqd->ctx_new_list)) {
7025                 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
7026                 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
7027                 complete(&ctx->sq_thread_comp);
7028         }
7029
7030         io_sqd_update_thread_idle(sqd);
7031 }
7032
7033 static int io_sq_thread(void *data)
7034 {
7035         struct cgroup_subsys_state *cur_css = NULL;
7036         struct files_struct *old_files = current->files;
7037         struct nsproxy *old_nsproxy = current->nsproxy;
7038         const struct cred *old_cred = NULL;
7039         struct io_sq_data *sqd = data;
7040         struct io_ring_ctx *ctx;
7041         unsigned long timeout = 0;
7042         DEFINE_WAIT(wait);
7043
7044         task_lock(current);
7045         current->files = NULL;
7046         current->nsproxy = NULL;
7047         task_unlock(current);
7048
7049         while (!kthread_should_stop()) {
7050                 int ret;
7051                 bool cap_entries, sqt_spin, needs_sched;
7052
7053                 /*
7054                  * Any changes to the sqd lists are synchronized through the
7055                  * kthread parking. This synchronizes the thread vs users,
7056                  * the users are synchronized on the sqd->ctx_lock.
7057                  */
7058                 if (kthread_should_park()) {
7059                         kthread_parkme();
7060                         /*
7061                          * When sq thread is unparked, in case the previous park operation
7062                          * comes from io_put_sq_data(), which means that sq thread is going
7063                          * to be stopped, so here needs to have a check.
7064                          */
7065                         if (kthread_should_stop())
7066                                 break;
7067                 }
7068
7069                 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
7070                         io_sqd_init_new(sqd);
7071                         timeout = jiffies + sqd->sq_thread_idle;
7072                 }
7073
7074                 sqt_spin = false;
7075                 cap_entries = !list_is_singular(&sqd->ctx_list);
7076                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7077                         if (current->cred != ctx->creds) {
7078                                 if (old_cred)
7079                                         revert_creds(old_cred);
7080                                 old_cred = override_creds(ctx->creds);
7081                         }
7082                         io_sq_thread_associate_blkcg(ctx, &cur_css);
7083 #ifdef CONFIG_AUDIT
7084                         current->loginuid = ctx->loginuid;
7085                         current->sessionid = ctx->sessionid;
7086 #endif
7087
7088                         ret = __io_sq_thread(ctx, cap_entries);
7089                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
7090                                 sqt_spin = true;
7091
7092                         io_sq_thread_drop_mm_files();
7093                 }
7094
7095                 if (sqt_spin || !time_after(jiffies, timeout)) {
7096                         io_run_task_work();
7097                         io_sq_thread_drop_mm_files();
7098                         cond_resched();
7099                         if (sqt_spin)
7100                                 timeout = jiffies + sqd->sq_thread_idle;
7101                         continue;
7102                 }
7103
7104                 needs_sched = true;
7105                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7106                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7107                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7108                             !list_empty_careful(&ctx->iopoll_list)) {
7109                                 needs_sched = false;
7110                                 break;
7111                         }
7112                         if (io_sqring_entries(ctx)) {
7113                                 needs_sched = false;
7114                                 break;
7115                         }
7116                 }
7117
7118                 if (needs_sched && !kthread_should_park()) {
7119                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7120                                 io_ring_set_wakeup_flag(ctx);
7121
7122                         schedule();
7123                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7124                                 io_ring_clear_wakeup_flag(ctx);
7125                 }
7126
7127                 finish_wait(&sqd->wait, &wait);
7128                 timeout = jiffies + sqd->sq_thread_idle;
7129         }
7130
7131         io_run_task_work();
7132         io_sq_thread_drop_mm_files();
7133
7134         if (cur_css)
7135                 io_sq_thread_unassociate_blkcg();
7136         if (old_cred)
7137                 revert_creds(old_cred);
7138
7139         task_lock(current);
7140         current->files = old_files;
7141         current->nsproxy = old_nsproxy;
7142         task_unlock(current);
7143
7144         kthread_parkme();
7145
7146         return 0;
7147 }
7148
7149 struct io_wait_queue {
7150         struct wait_queue_entry wq;
7151         struct io_ring_ctx *ctx;
7152         unsigned to_wait;
7153         unsigned nr_timeouts;
7154 };
7155
7156 static inline bool io_should_wake(struct io_wait_queue *iowq)
7157 {
7158         struct io_ring_ctx *ctx = iowq->ctx;
7159
7160         /*
7161          * Wake up if we have enough events, or if a timeout occurred since we
7162          * started waiting. For timeouts, we always want to return to userspace,
7163          * regardless of event count.
7164          */
7165         return io_cqring_events(ctx) >= iowq->to_wait ||
7166                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7167 }
7168
7169 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7170                             int wake_flags, void *key)
7171 {
7172         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7173                                                         wq);
7174
7175         /*
7176          * Cannot safely flush overflowed CQEs from here, ensure we wake up
7177          * the task, and the next invocation will do it.
7178          */
7179         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
7180                 return autoremove_wake_function(curr, mode, wake_flags, key);
7181         return -1;
7182 }
7183
7184 static int io_run_task_work_sig(void)
7185 {
7186         if (io_run_task_work())
7187                 return 1;
7188         if (!signal_pending(current))
7189                 return 0;
7190         if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
7191                 return -ERESTARTSYS;
7192         return -EINTR;
7193 }
7194
7195 /* when returns >0, the caller should retry */
7196 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7197                                           struct io_wait_queue *iowq,
7198                                           signed long *timeout)
7199 {
7200         int ret;
7201
7202         /* make sure we run task_work before checking for signals */
7203         ret = io_run_task_work_sig();
7204         if (ret || io_should_wake(iowq))
7205                 return ret;
7206         /* let the caller flush overflows, retry */
7207         if (test_bit(0, &ctx->cq_check_overflow))
7208                 return 1;
7209
7210         *timeout = schedule_timeout(*timeout);
7211         return !*timeout ? -ETIME : 1;
7212 }
7213
7214 /*
7215  * Wait until events become available, if we don't already have some. The
7216  * application must reap them itself, as they reside on the shared cq ring.
7217  */
7218 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7219                           const sigset_t __user *sig, size_t sigsz,
7220                           struct __kernel_timespec __user *uts)
7221 {
7222         struct io_wait_queue iowq = {
7223                 .wq = {
7224                         .private        = current,
7225                         .func           = io_wake_function,
7226                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
7227                 },
7228                 .ctx            = ctx,
7229                 .to_wait        = min_events,
7230         };
7231         struct io_rings *rings = ctx->rings;
7232         signed long timeout = MAX_SCHEDULE_TIMEOUT;
7233         int ret;
7234
7235         do {
7236                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
7237                 if (io_cqring_events(ctx) >= min_events)
7238                         return 0;
7239                 if (!io_run_task_work())
7240                         break;
7241         } while (1);
7242
7243         if (sig) {
7244 #ifdef CONFIG_COMPAT
7245                 if (in_compat_syscall())
7246                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7247                                                       sigsz);
7248                 else
7249 #endif
7250                         ret = set_user_sigmask(sig, sigsz);
7251
7252                 if (ret)
7253                         return ret;
7254         }
7255
7256         if (uts) {
7257                 struct timespec64 ts;
7258
7259                 if (get_timespec64(&ts, uts))
7260                         return -EFAULT;
7261                 timeout = timespec64_to_jiffies(&ts);
7262         }
7263
7264         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7265         trace_io_uring_cqring_wait(ctx, min_events);
7266         do {
7267                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
7268                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
7269                                                 TASK_INTERRUPTIBLE);
7270                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7271                 finish_wait(&ctx->wait, &iowq.wq);
7272         } while (ret > 0);
7273
7274         restore_saved_sigmask_unless(ret == -EINTR);
7275
7276         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7277 }
7278
7279 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7280 {
7281 #if defined(CONFIG_UNIX)
7282         if (ctx->ring_sock) {
7283                 struct sock *sock = ctx->ring_sock->sk;
7284                 struct sk_buff *skb;
7285
7286                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7287                         kfree_skb(skb);
7288         }
7289 #else
7290         int i;
7291
7292         for (i = 0; i < ctx->nr_user_files; i++) {
7293                 struct file *file;
7294
7295                 file = io_file_from_index(ctx, i);
7296                 if (file)
7297                         fput(file);
7298         }
7299 #endif
7300 }
7301
7302 static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
7303 {
7304         struct fixed_rsrc_data *data;
7305
7306         data = container_of(ref, struct fixed_rsrc_data, refs);
7307         complete(&data->done);
7308 }
7309
/* Acquire ctx->rsrc_ref_lock via spin_lock_bh(). */
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
{
	spin_lock_bh(&ctx->rsrc_ref_lock);
}
7314
/* Release ctx->rsrc_ref_lock via spin_unlock_bh(). */
static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
{
	spin_unlock_bh(&ctx->rsrc_ref_lock);
}
7319
/*
 * Install @ref_node as the current node of @rsrc_data and append it to
 * ctx->rsrc_ref_list, both under the rsrc ref lock, then take a reference
 * on @rsrc_data->refs.
 */
static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
				 struct fixed_rsrc_data *rsrc_data,
				 struct fixed_rsrc_ref_node *ref_node)
{
	io_rsrc_ref_lock(ctx);
	rsrc_data->node = ref_node;
	list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
	io_rsrc_ref_unlock(ctx);
	/* taken after the lock is dropped */
	percpu_ref_get(&rsrc_data->refs);
}
7330
7331 static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
7332                                struct io_ring_ctx *ctx,
7333                                struct fixed_rsrc_ref_node *backup_node)
7334 {
7335         struct fixed_rsrc_ref_node *ref_node;
7336         int ret;
7337
7338         io_rsrc_ref_lock(ctx);
7339         ref_node = data->node;
7340         io_rsrc_ref_unlock(ctx);
7341         if (ref_node)
7342                 percpu_ref_kill(&ref_node->refs);
7343
7344         percpu_ref_kill(&data->refs);
7345
7346         /* wait for all refs nodes to complete */
7347         flush_delayed_work(&ctx->rsrc_put_work);
7348         do {
7349                 ret = wait_for_completion_interruptible(&data->done);
7350                 if (!ret)
7351                         break;
7352                 ret = io_run_task_work_sig();
7353                 if (ret < 0) {
7354                         percpu_ref_resurrect(&data->refs);
7355                         reinit_completion(&data->done);
7356                         io_sqe_rsrc_set_node(ctx, data, backup_node);
7357                         return ret;
7358                 }
7359         } while (1);
7360
7361         destroy_fixed_rsrc_ref_node(backup_node);
7362         return 0;
7363 }
7364
7365 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7366 {
7367         struct fixed_rsrc_data *data;
7368
7369         data = kzalloc(sizeof(*data), GFP_KERNEL);
7370         if (!data)
7371                 return NULL;
7372
7373         if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
7374                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7375                 kfree(data);
7376                 return NULL;
7377         }
7378         data->ctx = ctx;
7379         init_completion(&data->done);
7380         return data;
7381 }
7382
/*
 * Release a fixed_rsrc_data: tear down its percpu ref, then free the table
 * array and the structure itself. The per-table files arrays are not freed
 * here.
 */
static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
{
	percpu_ref_exit(&data->refs);
	kfree(data->table);
	kfree(data);
}
7389
7390 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7391 {
7392         struct fixed_rsrc_data *data = ctx->file_data;
7393         struct fixed_rsrc_ref_node *backup_node;
7394         unsigned nr_tables, i;
7395         int ret;
7396
7397         if (!data)
7398                 return -ENXIO;
7399         backup_node = alloc_fixed_rsrc_ref_node(ctx);
7400         if (!backup_node)
7401                 return -ENOMEM;
7402         init_fixed_file_ref_node(ctx, backup_node);
7403
7404         ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
7405         if (ret)
7406                 return ret;
7407
7408         __io_sqe_files_unregister(ctx);
7409         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7410         for (i = 0; i < nr_tables; i++)
7411                 kfree(data->table[i].files);
7412         free_fixed_rsrc_data(data);
7413         ctx->file_data = NULL;
7414         ctx->nr_user_files = 0;
7415         return 0;
7416 }
7417
7418 static void io_put_sq_data(struct io_sq_data *sqd)
7419 {
7420         if (refcount_dec_and_test(&sqd->refs)) {
7421                 /*
7422                  * The park is a bit of a work-around, without it we get
7423                  * warning spews on shutdown with SQPOLL set and affinity
7424                  * set to a single CPU.
7425                  */
7426                 if (sqd->thread) {
7427                         kthread_park(sqd->thread);
7428                         kthread_stop(sqd->thread);
7429                 }
7430
7431                 kfree(sqd);
7432         }
7433 }
7434
7435 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7436 {
7437         struct io_ring_ctx *ctx_attach;
7438         struct io_sq_data *sqd;
7439         struct fd f;
7440
7441         f = fdget(p->wq_fd);
7442         if (!f.file)
7443                 return ERR_PTR(-ENXIO);
7444         if (f.file->f_op != &io_uring_fops) {
7445                 fdput(f);
7446                 return ERR_PTR(-EINVAL);
7447         }
7448
7449         ctx_attach = f.file->private_data;
7450         sqd = ctx_attach->sq_data;
7451         if (!sqd) {
7452                 fdput(f);
7453                 return ERR_PTR(-EINVAL);
7454         }
7455
7456         refcount_inc(&sqd->refs);
7457         fdput(f);
7458         return sqd;
7459 }
7460
7461 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7462 {
7463         struct io_sq_data *sqd;
7464
7465         if (p->flags & IORING_SETUP_ATTACH_WQ)
7466                 return io_attach_sq_data(p);
7467
7468         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7469         if (!sqd)
7470                 return ERR_PTR(-ENOMEM);
7471
7472         refcount_set(&sqd->refs, 1);
7473         INIT_LIST_HEAD(&sqd->ctx_list);
7474         INIT_LIST_HEAD(&sqd->ctx_new_list);
7475         mutex_init(&sqd->ctx_lock);
7476         mutex_init(&sqd->lock);
7477         init_waitqueue_head(&sqd->wait);
7478         return sqd;
7479 }
7480
7481 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7482         __releases(&sqd->lock)
7483 {
7484         if (!sqd->thread)
7485                 return;
7486         kthread_unpark(sqd->thread);
7487         mutex_unlock(&sqd->lock);
7488 }
7489
7490 static void io_sq_thread_park(struct io_sq_data *sqd)
7491         __acquires(&sqd->lock)
7492 {
7493         if (!sqd->thread)
7494                 return;
7495         mutex_lock(&sqd->lock);
7496         kthread_park(sqd->thread);
7497 }
7498
7499 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
7500 {
7501         struct io_sq_data *sqd = ctx->sq_data;
7502
7503         if (sqd) {
7504                 if (sqd->thread) {
7505                         /*
7506                          * We may arrive here from the error branch in
7507                          * io_sq_offload_create() where the kthread is created
7508                          * without being waked up, thus wake it up now to make
7509                          * sure the wait will complete.
7510                          */
7511                         wake_up_process(sqd->thread);
7512                         wait_for_completion(&ctx->sq_thread_comp);
7513
7514                         io_sq_thread_park(sqd);
7515                 }
7516
7517                 mutex_lock(&sqd->ctx_lock);
7518                 list_del(&ctx->sqd_list);
7519                 io_sqd_update_thread_idle(sqd);
7520                 mutex_unlock(&sqd->ctx_lock);
7521
7522                 if (sqd->thread)
7523                         io_sq_thread_unpark(sqd);
7524
7525                 io_put_sq_data(sqd);
7526                 ctx->sq_data = NULL;
7527         }
7528 }
7529
7530 static void io_finish_async(struct io_ring_ctx *ctx)
7531 {
7532         io_sq_thread_stop(ctx);
7533
7534         if (ctx->io_wq) {
7535                 io_wq_destroy(ctx->io_wq);
7536                 ctx->io_wq = NULL;
7537         }
7538 }
7539
7540 #if defined(CONFIG_UNIX)
7541 /*
7542  * Ensure the UNIX gc is aware of our file set, so we are certain that
7543  * the io_uring can be safely unregistered on process exit, even if we have
7544  * loops in the file referencing.
7545  */
7546 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7547 {
7548         struct sock *sk = ctx->ring_sock->sk;
7549         struct scm_fp_list *fpl;
7550         struct sk_buff *skb;
7551         int i, nr_files;
7552
7553         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7554         if (!fpl)
7555                 return -ENOMEM;
7556
7557         skb = alloc_skb(0, GFP_KERNEL);
7558         if (!skb) {
7559                 kfree(fpl);
7560                 return -ENOMEM;
7561         }
7562
7563         skb->sk = sk;
7564
7565         nr_files = 0;
7566         fpl->user = get_uid(ctx->user);
7567         for (i = 0; i < nr; i++) {
7568                 struct file *file = io_file_from_index(ctx, i + offset);
7569
7570                 if (!file)
7571                         continue;
7572                 fpl->fp[nr_files] = get_file(file);
7573                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7574                 nr_files++;
7575         }
7576
7577         if (nr_files) {
7578                 fpl->max = SCM_MAX_FD;
7579                 fpl->count = nr_files;
7580                 UNIXCB(skb).fp = fpl;
7581                 skb->destructor = unix_destruct_scm;
7582                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7583                 skb_queue_head(&sk->sk_receive_queue, skb);
7584
7585                 for (i = 0; i < nr_files; i++)
7586                         fput(fpl->fp[i]);
7587         } else {
7588                 kfree_skb(skb);
7589                 kfree(fpl);
7590         }
7591
7592         return 0;
7593 }
7594
7595 /*
7596  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7597  * causes regular reference counting to break down. We rely on the UNIX
7598  * garbage collection to take care of this problem for us.
7599  */
7600 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7601 {
7602         unsigned left, total;
7603         int ret = 0;
7604
7605         total = 0;
7606         left = ctx->nr_user_files;
7607         while (left) {
7608                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7609
7610                 ret = __io_sqe_files_scm(ctx, this_files, total);
7611                 if (ret)
7612                         break;
7613                 left -= this_files;
7614                 total += this_files;
7615         }
7616
7617         if (!ret)
7618                 return 0;
7619
7620         while (total < ctx->nr_user_files) {
7621                 struct file *file = io_file_from_index(ctx, total);
7622
7623                 if (file)
7624                         fput(file);
7625                 total++;
7626         }
7627
7628         return ret;
7629 }
7630 #else
/*
 * !CONFIG_UNIX variant: there is no ring socket to attach the registered
 * files to, so this is a no-op that always reports success.
 */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
7635 #endif
7636
7637 static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
7638                                     unsigned nr_tables, unsigned nr_files)
7639 {
7640         int i;
7641
7642         for (i = 0; i < nr_tables; i++) {
7643                 struct fixed_rsrc_table *table = &file_data->table[i];
7644                 unsigned this_files;
7645
7646                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7647                 table->files = kcalloc(this_files, sizeof(struct file *),
7648                                         GFP_KERNEL);
7649                 if (!table->files)
7650                         break;
7651                 nr_files -= this_files;
7652         }
7653
7654         if (i == nr_tables)
7655                 return 0;
7656
7657         for (i = 0; i < nr_tables; i++) {
7658                 struct fixed_rsrc_table *table = &file_data->table[i];
7659                 kfree(table->files);
7660         }
7661         return 1;
7662 }
7663
/*
 * Drop the reference held on a registered file that has been removed from
 * the fixed file table.
 *
 * With CONFIG_UNIX the file is also tracked in the scm_fp_list of an skb on
 * the ring socket's receive queue; that entry is located and removed, and the
 * skb is freed once its list becomes empty. Without CONFIG_UNIX this is a
 * plain fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* private list collecting the skbs we pull off the receive queue */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* number of entries after slot i that must shift down */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				/* last file gone, the skb is no longer needed */
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* file == NULL doubles as the "found" marker below */
			file = NULL;
			break;
		}

		/* match handled above, skb already requeued or freed */
		if (!file)
			break;

		/* no match in this skb, park it and look at the next one */
		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* move everything we pulled off back to the tail of the receive queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
7726
7727 static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
7728 {
7729         struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
7730         struct io_ring_ctx *ctx = rsrc_data->ctx;
7731         struct io_rsrc_put *prsrc, *tmp;
7732
7733         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7734                 list_del(&prsrc->list);
7735                 ref_node->rsrc_put(ctx, prsrc);
7736                 kfree(prsrc);
7737         }
7738
7739         percpu_ref_exit(&ref_node->refs);
7740         kfree(ref_node);
7741         percpu_ref_put(&rsrc_data->refs);
7742 }
7743
7744 static void io_rsrc_put_work(struct work_struct *work)
7745 {
7746         struct io_ring_ctx *ctx;
7747         struct llist_node *node;
7748
7749         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7750         node = llist_del_all(&ctx->rsrc_put_llist);
7751
7752         while (node) {
7753                 struct fixed_rsrc_ref_node *ref_node;
7754                 struct llist_node *next = node->next;
7755
7756                 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7757                 __io_rsrc_put_work(ref_node);
7758                 node = next;
7759         }
7760 }
7761
7762 static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7763                                         unsigned i)
7764 {
7765         struct fixed_rsrc_table *table;
7766
7767         table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7768         return &table->files[i & IORING_FILE_TABLE_MASK];
7769 }
7770
7771 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7772 {
7773         struct fixed_rsrc_ref_node *ref_node;
7774         struct fixed_rsrc_data *data;
7775         struct io_ring_ctx *ctx;
7776         bool first_add = false;
7777         int delay = HZ;
7778
7779         ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7780         data = ref_node->rsrc_data;
7781         ctx = data->ctx;
7782
7783         io_rsrc_ref_lock(ctx);
7784         ref_node->done = true;
7785
7786         while (!list_empty(&ctx->rsrc_ref_list)) {
7787                 ref_node = list_first_entry(&ctx->rsrc_ref_list,
7788                                         struct fixed_rsrc_ref_node, node);
7789                 /* recycle ref nodes in order */
7790                 if (!ref_node->done)
7791                         break;
7792                 list_del(&ref_node->node);
7793                 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
7794         }
7795         io_rsrc_ref_unlock(ctx);
7796
7797         if (percpu_ref_is_dying(&data->refs))
7798                 delay = 0;
7799
7800         if (!delay)
7801                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
7802         else if (first_add)
7803                 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7804 }
7805
7806 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
7807                         struct io_ring_ctx *ctx)
7808 {
7809         struct fixed_rsrc_ref_node *ref_node;
7810
7811         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7812         if (!ref_node)
7813                 return NULL;
7814
7815         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7816                             0, GFP_KERNEL)) {
7817                 kfree(ref_node);
7818                 return NULL;
7819         }
7820         INIT_LIST_HEAD(&ref_node->node);
7821         INIT_LIST_HEAD(&ref_node->rsrc_list);
7822         ref_node->done = false;
7823         return ref_node;
7824 }
7825
7826 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
7827                                      struct fixed_rsrc_ref_node *ref_node)
7828 {
7829         ref_node->rsrc_data = ctx->file_data;
7830         ref_node->rsrc_put = io_ring_file_put;
7831 }
7832
/*
 * Tear down a ref node: release its percpu ref state, then free the node.
 * Does not touch the node's list linkage or any queued resources.
 */
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
7838
7839
7840 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7841                                  unsigned nr_args)
7842 {
7843         __s32 __user *fds = (__s32 __user *) arg;
7844         unsigned nr_tables, i;
7845         struct file *file;
7846         int fd, ret = -ENOMEM;
7847         struct fixed_rsrc_ref_node *ref_node;
7848         struct fixed_rsrc_data *file_data;
7849
7850         if (ctx->file_data)
7851                 return -EBUSY;
7852         if (!nr_args)
7853                 return -EINVAL;
7854         if (nr_args > IORING_MAX_FIXED_FILES)
7855                 return -EMFILE;
7856
7857         file_data = alloc_fixed_rsrc_data(ctx);
7858         if (!file_data)
7859                 return -ENOMEM;
7860         ctx->file_data = file_data;
7861
7862         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
7863         file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
7864                                    GFP_KERNEL);
7865         if (!file_data->table)
7866                 goto out_free;
7867
7868         if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
7869                 goto out_free;
7870
7871         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7872                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7873                         ret = -EFAULT;
7874                         goto out_fput;
7875                 }
7876                 /* allow sparse sets */
7877                 if (fd == -1)
7878                         continue;
7879
7880                 file = fget(fd);
7881                 ret = -EBADF;
7882                 if (!file)
7883                         goto out_fput;
7884
7885                 /*
7886                  * Don't allow io_uring instances to be registered. If UNIX
7887                  * isn't enabled, then this causes a reference cycle and this
7888                  * instance can never get freed. If UNIX is enabled we'll
7889                  * handle it just fine, but there's still no point in allowing
7890                  * a ring fd as it doesn't support regular read/write anyway.
7891                  */
7892                 if (file->f_op == &io_uring_fops) {
7893                         fput(file);
7894                         goto out_fput;
7895                 }
7896                 *io_fixed_file_slot(file_data, i) = file;
7897         }
7898
7899         ret = io_sqe_files_scm(ctx);
7900         if (ret) {
7901                 io_sqe_files_unregister(ctx);
7902                 return ret;
7903         }
7904
7905         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7906         if (!ref_node) {
7907                 io_sqe_files_unregister(ctx);
7908                 return -ENOMEM;
7909         }
7910         init_fixed_file_ref_node(ctx, ref_node);
7911
7912         io_sqe_rsrc_set_node(ctx, file_data, ref_node);
7913         return ret;
7914 out_fput:
7915         for (i = 0; i < ctx->nr_user_files; i++) {
7916                 file = io_file_from_index(ctx, i);
7917                 if (file)
7918                         fput(file);
7919         }
7920         for (i = 0; i < nr_tables; i++)
7921                 kfree(file_data->table[i].files);
7922         ctx->nr_user_files = 0;
7923 out_free:
7924         free_fixed_rsrc_data(ctx->file_data);
7925         ctx->file_data = NULL;
7926         return ret;
7927 }
7928
/*
 * Make the UNIX gc aware of a single file that was just installed in fixed
 * file slot @index. Without CONFIG_UNIX this does nothing and returns 0.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sk_buff_head *rxq = &ctx->ring_sock->sk->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&rxq->lock);
	skb = skb_peek(rxq);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/* the list is edited with the skb off the queue and the lock dropped */
			__skb_unlink(skb, rxq);
			spin_unlock_irq(&rxq->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&rxq->lock);
			__skb_queue_head(rxq, skb);
			merged = true;
		}
	}
	spin_unlock_irq(&rxq->lock);

	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	/* balances the get_file() done while merging */
	fput(file);
	return 0;
#else
	return 0;
#endif
}
7971
7972 static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
7973 {
7974         struct io_rsrc_put *prsrc;
7975         struct fixed_rsrc_ref_node *ref_node = data->node;
7976
7977         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7978         if (!prsrc)
7979                 return -ENOMEM;
7980
7981         prsrc->rsrc = rsrc;
7982         list_add(&prsrc->list, &ref_node->rsrc_list);
7983
7984         return 0;
7985 }
7986
/* Typed wrapper: queue a fixed file for deferred release on @data. */
static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
					struct file *file)
{
	void *rsrc = file;

	return io_queue_rsrc_removal(data, rsrc);
}
7992
7993 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7994                                  struct io_uring_rsrc_update *up,
7995                                  unsigned nr_args)
7996 {
7997         struct fixed_rsrc_data *data = ctx->file_data;
7998         struct fixed_rsrc_ref_node *ref_node;
7999         struct file *file, **file_slot;
8000         __s32 __user *fds;
8001         int fd, i, err;
8002         __u32 done;
8003         bool needs_switch = false;
8004
8005         if (check_add_overflow(up->offset, nr_args, &done))
8006                 return -EOVERFLOW;
8007         if (done > ctx->nr_user_files)
8008                 return -EINVAL;
8009
8010         ref_node = alloc_fixed_rsrc_ref_node(ctx);
8011         if (!ref_node)
8012                 return -ENOMEM;
8013         init_fixed_file_ref_node(ctx, ref_node);
8014
8015         fds = u64_to_user_ptr(up->data);
8016         for (done = 0; done < nr_args; done++) {
8017                 err = 0;
8018                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
8019                         err = -EFAULT;
8020                         break;
8021                 }
8022                 if (fd == IORING_REGISTER_FILES_SKIP)
8023                         continue;
8024
8025                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8026                 file_slot = io_fixed_file_slot(ctx->file_data, i);
8027
8028                 if (*file_slot) {
8029                         err = io_queue_file_removal(data, *file_slot);
8030                         if (err)
8031                                 break;
8032                         *file_slot = NULL;
8033                         needs_switch = true;
8034                 }
8035                 if (fd != -1) {
8036                         file = fget(fd);
8037                         if (!file) {
8038                                 err = -EBADF;
8039                                 break;
8040                         }
8041                         /*
8042                          * Don't allow io_uring instances to be registered. If
8043                          * UNIX isn't enabled, then this causes a reference
8044                          * cycle and this instance can never get freed. If UNIX
8045                          * is enabled we'll handle it just fine, but there's
8046                          * still no point in allowing a ring fd as it doesn't
8047                          * support regular read/write anyway.
8048                          */
8049                         if (file->f_op == &io_uring_fops) {
8050                                 fput(file);
8051                                 err = -EBADF;
8052                                 break;
8053                         }
8054                         *file_slot = file;
8055                         err = io_sqe_file_register(ctx, file, i);
8056                         if (err) {
8057                                 *file_slot = NULL;
8058                                 fput(file);
8059                                 break;
8060                         }
8061                 }
8062         }
8063
8064         if (needs_switch) {
8065                 percpu_ref_kill(&data->node->refs);
8066                 io_sqe_rsrc_set_node(ctx, data, ref_node);
8067         } else
8068                 destroy_fixed_rsrc_ref_node(ref_node);
8069
8070         return done ? done : err;
8071 }
8072
8073 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
8074                                unsigned nr_args)
8075 {
8076         struct io_uring_rsrc_update up;
8077
8078         if (!ctx->file_data)
8079                 return -ENXIO;
8080         if (!nr_args)
8081                 return -EINVAL;
8082         if (copy_from_user(&up, arg, sizeof(up)))
8083                 return -EFAULT;
8084         if (up.resv)
8085                 return -EINVAL;
8086
8087         return __io_sqe_files_update(ctx, &up, nr_args);
8088 }
8089
8090 static struct io_wq_work *io_free_work(struct io_wq_work *work)
8091 {
8092         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8093
8094         req = io_put_req_find_next(req);
8095         return req ? &req->work : NULL;
8096 }
8097
8098 static int io_init_wq_offload(struct io_ring_ctx *ctx,
8099                               struct io_uring_params *p)
8100 {
8101         struct io_wq_data data;
8102         struct fd f;
8103         struct io_ring_ctx *ctx_attach;
8104         unsigned int concurrency;
8105         int ret = 0;
8106
8107         data.user = ctx->user;
8108         data.free_work = io_free_work;
8109         data.do_work = io_wq_submit_work;
8110
8111         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
8112                 /* Do QD, or 4 * CPUS, whatever is smallest */
8113                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8114
8115                 ctx->io_wq = io_wq_create(concurrency, &data);
8116                 if (IS_ERR(ctx->io_wq)) {
8117                         ret = PTR_ERR(ctx->io_wq);
8118                         ctx->io_wq = NULL;
8119                 }
8120                 return ret;
8121         }
8122
8123         f = fdget(p->wq_fd);
8124         if (!f.file)
8125                 return -EBADF;
8126
8127         if (f.file->f_op != &io_uring_fops) {
8128                 ret = -EINVAL;
8129                 goto out_fput;
8130         }
8131
8132         ctx_attach = f.file->private_data;
8133         /* @io_wq is protected by holding the fd */
8134         if (!io_wq_get(ctx_attach->io_wq, &data)) {
8135                 ret = -EINVAL;
8136                 goto out_fput;
8137         }
8138
8139         ctx->io_wq = ctx_attach->io_wq;
8140 out_fput:
8141         fdput(f);
8142         return ret;
8143 }
8144
8145 static int io_uring_alloc_task_context(struct task_struct *task)
8146 {
8147         struct io_uring_task *tctx;
8148         int ret;
8149
8150         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
8151         if (unlikely(!tctx))
8152                 return -ENOMEM;
8153
8154         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8155         if (unlikely(ret)) {
8156                 kfree(tctx);
8157                 return ret;
8158         }
8159
8160         xa_init(&tctx->xa);
8161         init_waitqueue_head(&tctx->wait);
8162         tctx->last = NULL;
8163         atomic_set(&tctx->in_idle, 0);
8164         tctx->sqpoll = false;
8165         io_init_identity(&tctx->__identity);
8166         tctx->identity = &tctx->__identity;
8167         task->io_uring = tctx;
8168         spin_lock_init(&tctx->task_lock);
8169         INIT_WQ_LIST(&tctx->task_list);
8170         tctx->task_state = 0;
8171         init_task_work(&tctx->task_work, tctx_task_work);
8172         return 0;
8173 }
8174
/*
 * Free the per-task io_uring state of @tsk and clear tsk->io_uring.
 *
 * Warns once if the task's xarray is not empty or if the identity's refcount
 * is not exactly 1 at this point.
 */
void __io_uring_free(struct task_struct *tsk)
{
	struct io_uring_task *tctx = tsk->io_uring;

	WARN_ON_ONCE(!xa_empty(&tctx->xa));
	WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
	/* the embedded __identity is freed together with tctx below */
	if (tctx->identity != &tctx->__identity)
		kfree(tctx->identity);
	percpu_counter_destroy(&tctx->inflight);
	kfree(tctx);
	tsk->io_uring = NULL;
}
8187
8188 static int io_sq_offload_create(struct io_ring_ctx *ctx,
8189                                 struct io_uring_params *p)
8190 {
8191         int ret;
8192
8193         if (ctx->flags & IORING_SETUP_SQPOLL) {
8194                 struct io_sq_data *sqd;
8195
8196                 ret = -EPERM;
8197                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
8198                         goto err;
8199
8200                 sqd = io_get_sq_data(p);
8201                 if (IS_ERR(sqd)) {
8202                         ret = PTR_ERR(sqd);
8203                         goto err;
8204                 }
8205
8206                 ctx->sq_data = sqd;
8207                 io_sq_thread_park(sqd);
8208                 mutex_lock(&sqd->ctx_lock);
8209                 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
8210                 mutex_unlock(&sqd->ctx_lock);
8211                 io_sq_thread_unpark(sqd);
8212
8213                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8214                 if (!ctx->sq_thread_idle)
8215                         ctx->sq_thread_idle = HZ;
8216
8217                 if (sqd->thread)
8218                         goto done;
8219
8220                 if (p->flags & IORING_SETUP_SQ_AFF) {
8221                         int cpu = p->sq_thread_cpu;
8222
8223                         ret = -EINVAL;
8224                         if (cpu >= nr_cpu_ids)
8225                                 goto err;
8226                         if (!cpu_online(cpu))
8227                                 goto err;
8228
8229                         sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
8230                                                         cpu, "io_uring-sq");
8231                 } else {
8232                         sqd->thread = kthread_create(io_sq_thread, sqd,
8233                                                         "io_uring-sq");
8234                 }
8235                 if (IS_ERR(sqd->thread)) {
8236                         ret = PTR_ERR(sqd->thread);
8237                         sqd->thread = NULL;
8238                         goto err;
8239                 }
8240                 ret = io_uring_alloc_task_context(sqd->thread);
8241                 if (ret)
8242                         goto err;
8243         } else if (p->flags & IORING_SETUP_SQ_AFF) {
8244                 /* Can't have SQ_AFF without SQPOLL */
8245                 ret = -EINVAL;
8246                 goto err;
8247         }
8248
8249 done:
8250         ret = io_init_wq_offload(ctx, p);
8251         if (ret)
8252                 goto err;
8253
8254         return 0;
8255 err:
8256         io_finish_async(ctx);
8257         return ret;
8258 }
8259
8260 static void io_sq_offload_start(struct io_ring_ctx *ctx)
8261 {
8262         struct io_sq_data *sqd = ctx->sq_data;
8263
8264         if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
8265                 wake_up_process(sqd->thread);
8266 }
8267
/* Subtract @nr_pages from @user's locked_vm counter. */
static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}
8273
8274 static inline int __io_account_mem(struct user_struct *user,
8275                                    unsigned long nr_pages)
8276 {
8277         unsigned long page_limit, cur_pages, new_pages;
8278
8279         /* Don't allow more pages than we can safely lock */
8280         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8281
8282         do {
8283                 cur_pages = atomic_long_read(&user->locked_vm);
8284                 new_pages = cur_pages + nr_pages;
8285                 if (new_pages > page_limit)
8286                         return -ENOMEM;
8287         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8288                                         new_pages) != cur_pages);
8289
8290         return 0;
8291 }
8292
/*
 * Undo the accounting of @nr_pages for this ring: the per-user locked_vm
 * charge when ctx->limit_mem is set, and the pinned_vm count of
 * ctx->mm_account when one is attached.
 */
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->limit_mem)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
8301
8302 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8303 {
8304         int ret;
8305
8306         if (ctx->limit_mem) {
8307                 ret = __io_account_mem(ctx->user, nr_pages);
8308                 if (ret)
8309                         return ret;
8310         }
8311
8312         if (ctx->mm_account)
8313                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8314
8315         return 0;
8316 }
8317
/*
 * Drop a reference on the compound page backing @ptr and free it when that
 * was the last one. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
8329
8330 static void *io_mem_alloc(size_t size)
8331 {
8332         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8333                                 __GFP_NORETRY | __GFP_ACCOUNT;
8334
8335         return (void *) __get_free_pages(gfp_flags, get_order(size));
8336 }
8337
8338 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8339                                 size_t *sq_offset)
8340 {
8341         struct io_rings *rings;
8342         size_t off, sq_array_size;
8343
8344         off = struct_size(rings, cqes, cq_entries);
8345         if (off == SIZE_MAX)
8346                 return SIZE_MAX;
8347
8348 #ifdef CONFIG_SMP
8349         off = ALIGN(off, SMP_CACHE_BYTES);
8350         if (off == 0)
8351                 return SIZE_MAX;
8352 #endif
8353
8354         if (sq_offset)
8355                 *sq_offset = off;
8356
8357         sq_array_size = array_size(sizeof(u32), sq_entries);
8358         if (sq_array_size == SIZE_MAX)
8359                 return SIZE_MAX;
8360
8361         if (check_add_overflow(off, sq_array_size, &off))
8362                 return SIZE_MAX;
8363
8364         return off;
8365 }
8366
8367 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8368 {
8369         int i, j;
8370
8371         if (!ctx->user_bufs)
8372                 return -ENXIO;
8373
8374         for (i = 0; i < ctx->nr_user_bufs; i++) {
8375                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8376
8377                 for (j = 0; j < imu->nr_bvecs; j++)
8378                         unpin_user_page(imu->bvec[j].bv_page);
8379
8380                 if (imu->acct_pages)
8381                         io_unaccount_mem(ctx, imu->acct_pages);
8382                 kvfree(imu->bvec);
8383                 imu->nr_bvecs = 0;
8384         }
8385
8386         kfree(ctx->user_bufs);
8387         ctx->user_bufs = NULL;
8388         ctx->nr_user_bufs = 0;
8389         return 0;
8390 }
8391
8392 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8393                        void __user *arg, unsigned index)
8394 {
8395         struct iovec __user *src;
8396
8397 #ifdef CONFIG_COMPAT
8398         if (ctx->compat) {
8399                 struct compat_iovec __user *ciovs;
8400                 struct compat_iovec ciov;
8401
8402                 ciovs = (struct compat_iovec __user *) arg;
8403                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8404                         return -EFAULT;
8405
8406                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8407                 dst->iov_len = ciov.iov_len;
8408                 return 0;
8409         }
8410 #endif
8411         src = (struct iovec __user *) arg;
8412         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8413                 return -EFAULT;
8414         return 0;
8415 }
8416
8417 /*
8418  * Not super efficient, but this is just a registration time. And we do cache
8419  * the last compound head, so generally we'll only do a full search if we don't
8420  * match that one.
8421  *
8422  * We check if the given compound head page has already been accounted, to
8423  * avoid double accounting it. This allows us to account the full size of the
8424  * page, not just the constituent pages of a huge page.
8425  */
8426 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8427                                   int nr_pages, struct page *hpage)
8428 {
8429         int i, j;
8430
8431         /* check current page array */
8432         for (i = 0; i < nr_pages; i++) {
8433                 if (!PageCompound(pages[i]))
8434                         continue;
8435                 if (compound_head(pages[i]) == hpage)
8436                         return true;
8437         }
8438
8439         /* check previously registered pages */
8440         for (i = 0; i < ctx->nr_user_bufs; i++) {
8441                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8442
8443                 for (j = 0; j < imu->nr_bvecs; j++) {
8444                         if (!PageCompound(imu->bvec[j].bv_page))
8445                                 continue;
8446                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8447                                 return true;
8448                 }
8449         }
8450
8451         return false;
8452 }
8453
8454 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8455                                  int nr_pages, struct io_mapped_ubuf *imu,
8456                                  struct page **last_hpage)
8457 {
8458         int i, ret;
8459
8460         for (i = 0; i < nr_pages; i++) {
8461                 if (!PageCompound(pages[i])) {
8462                         imu->acct_pages++;
8463                 } else {
8464                         struct page *hpage;
8465
8466                         hpage = compound_head(pages[i]);
8467                         if (hpage == *last_hpage)
8468                                 continue;
8469                         *last_hpage = hpage;
8470                         if (headpage_already_acct(ctx, pages, i, hpage))
8471                                 continue;
8472                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8473                 }
8474         }
8475
8476         if (!imu->acct_pages)
8477                 return 0;
8478
8479         ret = io_account_mem(ctx, imu->acct_pages);
8480         if (ret)
8481                 imu->acct_pages = 0;
8482         return ret;
8483 }
8484
8485 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8486                                   struct io_mapped_ubuf *imu,
8487                                   struct page **last_hpage)
8488 {
8489         struct vm_area_struct **vmas = NULL;
8490         struct page **pages = NULL;
8491         unsigned long off, start, end, ubuf;
8492         size_t size;
8493         int ret, pret, nr_pages, i;
8494
8495         ubuf = (unsigned long) iov->iov_base;
8496         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8497         start = ubuf >> PAGE_SHIFT;
8498         nr_pages = end - start;
8499
8500         ret = -ENOMEM;
8501
8502         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8503         if (!pages)
8504                 goto done;
8505
8506         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8507                               GFP_KERNEL);
8508         if (!vmas)
8509                 goto done;
8510
8511         imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8512                                    GFP_KERNEL);
8513         if (!imu->bvec)
8514                 goto done;
8515
8516         ret = 0;
8517         mmap_read_lock(current->mm);
8518         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8519                               pages, vmas);
8520         if (pret == nr_pages) {
8521                 /* don't support file backed memory */
8522                 for (i = 0; i < nr_pages; i++) {
8523                         struct vm_area_struct *vma = vmas[i];
8524
8525                         if (vma->vm_file &&
8526                             !is_file_hugepages(vma->vm_file)) {
8527                                 ret = -EOPNOTSUPP;
8528                                 break;
8529                         }
8530                 }
8531         } else {
8532                 ret = pret < 0 ? pret : -EFAULT;
8533         }
8534         mmap_read_unlock(current->mm);
8535         if (ret) {
8536                 /*
8537                  * if we did partial map, or found file backed vmas,
8538                  * release any pages we did get
8539                  */
8540                 if (pret > 0)
8541                         unpin_user_pages(pages, pret);
8542                 kvfree(imu->bvec);
8543                 goto done;
8544         }
8545
8546         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8547         if (ret) {
8548                 unpin_user_pages(pages, pret);
8549                 kvfree(imu->bvec);
8550                 goto done;
8551         }
8552
8553         off = ubuf & ~PAGE_MASK;
8554         size = iov->iov_len;
8555         for (i = 0; i < nr_pages; i++) {
8556                 size_t vec_len;
8557
8558                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8559                 imu->bvec[i].bv_page = pages[i];
8560                 imu->bvec[i].bv_len = vec_len;
8561                 imu->bvec[i].bv_offset = off;
8562                 off = 0;
8563                 size -= vec_len;
8564         }
8565         /* store original address for later verification */
8566         imu->ubuf = ubuf;
8567         imu->len = iov->iov_len;
8568         imu->nr_bvecs = nr_pages;
8569         ret = 0;
8570 done:
8571         kvfree(pages);
8572         kvfree(vmas);
8573         return ret;
8574 }
8575
8576 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8577 {
8578         if (ctx->user_bufs)
8579                 return -EBUSY;
8580         if (!nr_args || nr_args > UIO_MAXIOV)
8581                 return -EINVAL;
8582
8583         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8584                                         GFP_KERNEL);
8585         if (!ctx->user_bufs)
8586                 return -ENOMEM;
8587
8588         return 0;
8589 }
8590
/*
 * Sanity check a user supplied iovec before it is pinned.
 *
 * Returns -EFAULT for a NULL base, a zero length or a length above SZ_1G,
 * -EOVERFLOW if the page-rounded end of the range wraps around the address
 * space, and 0 otherwise.
 */
static int io_buffer_validate(struct iovec *iov)
{
        unsigned long base, span;

        /*
         * Don't impose further limits on the size and buffer
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
        if (!iov->iov_base || !iov->iov_len)
                return -EFAULT;

        /* arbitrary limit, but we need something */
        if (iov->iov_len > SZ_1G)
                return -EFAULT;

        /*
         * The registration code computes base + len + PAGE_SIZE - 1 to find
         * the last page. Reject ranges for which that sum wraps; unsigned
         * arithmetic makes the wrap detectable as "sum smaller than base".
         */
        base = (unsigned long) iov->iov_base;
        span = iov->iov_len + (PAGE_SIZE - 1);
        if (base + span < base)
                return -EOVERFLOW;

        return 0;
}
8607
8608 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8609                                    unsigned int nr_args)
8610 {
8611         int i, ret;
8612         struct iovec iov;
8613         struct page *last_hpage = NULL;
8614
8615         ret = io_buffers_map_alloc(ctx, nr_args);
8616         if (ret)
8617                 return ret;
8618
8619         for (i = 0; i < nr_args; i++) {
8620                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8621
8622                 ret = io_copy_iov(ctx, &iov, arg, i);
8623                 if (ret)
8624                         break;
8625
8626                 ret = io_buffer_validate(&iov);
8627                 if (ret)
8628                         break;
8629
8630                 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8631                 if (ret)
8632                         break;
8633
8634                 ctx->nr_user_bufs++;
8635         }
8636
8637         if (ret)
8638                 io_sqe_buffers_unregister(ctx);
8639
8640         return ret;
8641 }
8642
8643 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8644 {
8645         __s32 __user *fds = arg;
8646         int fd;
8647
8648         if (ctx->cq_ev_fd)
8649                 return -EBUSY;
8650
8651         if (copy_from_user(&fd, fds, sizeof(*fds)))
8652                 return -EFAULT;
8653
8654         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8655         if (IS_ERR(ctx->cq_ev_fd)) {
8656                 int ret = PTR_ERR(ctx->cq_ev_fd);
8657                 ctx->cq_ev_fd = NULL;
8658                 return ret;
8659         }
8660
8661         return 0;
8662 }
8663
8664 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8665 {
8666         if (ctx->cq_ev_fd) {
8667                 eventfd_ctx_put(ctx->cq_ev_fd);
8668                 ctx->cq_ev_fd = NULL;
8669                 return 0;
8670         }
8671
8672         return -ENXIO;
8673 }
8674
/*
 * idr_for_each() callback used by io_destroy_buffers(): @p is the io_buffer
 * stored under @id and @data is the owning ring context. Hands the entry to
 * __io_remove_buffers() with a count of -1U and always returns 0.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
        struct io_buffer *head = p;
        struct io_ring_ctx *ring = data;

        __io_remove_buffers(ring, head, id, -1U);
        return 0;
}
8683
8684 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8685 {
8686         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8687         idr_destroy(&ctx->io_buffer_idr);
8688 }
8689
8690 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8691 {
8692         struct io_kiocb *req, *nxt;
8693
8694         list_for_each_entry_safe(req, nxt, list, compl.list) {
8695                 if (tsk && req->task != tsk)
8696                         continue;
8697                 list_del(&req->compl.list);
8698                 kmem_cache_free(req_cachep, req);
8699         }
8700 }
8701
8702 static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
8703 {
8704         struct io_submit_state *submit_state = &ctx->submit_state;
8705
8706         mutex_lock(&ctx->uring_lock);
8707
8708         if (submit_state->free_reqs)
8709                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8710                                      submit_state->reqs);
8711
8712         io_req_cache_free(&submit_state->comp.free_list, NULL);
8713
8714         spin_lock_irq(&ctx->completion_lock);
8715         io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
8716         spin_unlock_irq(&ctx->completion_lock);
8717
8718         mutex_unlock(&ctx->uring_lock);
8719 }
8720
8721 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8722 {
8723         /*
8724          * Some may use context even when all refs and requests have been put,
8725          * and they are free to do so while still holding uring_lock, see
8726          * __io_req_task_submit(). Wait for them to finish.
8727          */
8728         mutex_lock(&ctx->uring_lock);
8729         mutex_unlock(&ctx->uring_lock);
8730
8731         io_finish_async(ctx);
8732         io_sqe_buffers_unregister(ctx);
8733
8734         if (ctx->sqo_task) {
8735                 put_task_struct(ctx->sqo_task);
8736                 ctx->sqo_task = NULL;
8737                 mmdrop(ctx->mm_account);
8738                 ctx->mm_account = NULL;
8739         }
8740
8741 #ifdef CONFIG_BLK_CGROUP
8742         if (ctx->sqo_blkcg_css)
8743                 css_put(ctx->sqo_blkcg_css);
8744 #endif
8745
8746         io_sqe_files_unregister(ctx);
8747         io_eventfd_unregister(ctx);
8748         io_destroy_buffers(ctx);
8749         idr_destroy(&ctx->personality_idr);
8750
8751 #if defined(CONFIG_UNIX)
8752         if (ctx->ring_sock) {
8753                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8754                 sock_release(ctx->ring_sock);
8755         }
8756 #endif
8757
8758         io_mem_free(ctx->rings);
8759         io_mem_free(ctx->sq_sqes);
8760
8761         percpu_ref_exit(&ctx->refs);
8762         free_uid(ctx->user);
8763         put_cred(ctx->creds);
8764         io_req_caches_free(ctx, NULL);
8765         kfree(ctx->cancel_hash);
8766         kfree(ctx);
8767 }
8768
8769 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8770 {
8771         struct io_ring_ctx *ctx = file->private_data;
8772         __poll_t mask = 0;
8773
8774         poll_wait(file, &ctx->cq_wait, wait);
8775         /*
8776          * synchronizes with barrier from wq_has_sleeper call in
8777          * io_commit_cqring
8778          */
8779         smp_rmb();
8780         if (!io_sqring_full(ctx))
8781                 mask |= EPOLLOUT | EPOLLWRNORM;
8782
8783         /*
8784          * Don't flush cqring overflow list here, just do a simple check.
8785          * Otherwise there could possible be ABBA deadlock:
8786          *      CPU0                    CPU1
8787          *      ----                    ----
8788          * lock(&ctx->uring_lock);
8789          *                              lock(&ep->mtx);
8790          *                              lock(&ctx->uring_lock);
8791          * lock(&ep->mtx);
8792          *
8793          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8794          * pushs them to do the flush.
8795          */
8796         if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8797                 mask |= EPOLLIN | EPOLLRDNORM;
8798
8799         return mask;
8800 }
8801
8802 static int io_uring_fasync(int fd, struct file *file, int on)
8803 {
8804         struct io_ring_ctx *ctx = file->private_data;
8805
8806         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8807 }
8808
8809 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8810 {
8811         struct io_identity *iod;
8812
8813         iod = idr_remove(&ctx->personality_idr, id);
8814         if (iod) {
8815                 put_cred(iod->creds);
8816                 if (refcount_dec_and_test(&iod->count))
8817                         kfree(iod);
8818                 return 0;
8819         }
8820
8821         return -EINVAL;
8822 }
8823
/*
 * idr_for_each() callback: unregister personality @id from the ring passed
 * in @data. The stored pointer @p is not used. Always returns 0.
 */
static int io_remove_personalities(int id, void *p, void *data)
{
        struct io_ring_ctx *ring = data;

        io_unregister_personality(ring, id);
        return 0;
}
8831
8832 static void io_ring_exit_work(struct work_struct *work)
8833 {
8834         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8835                                                exit_work);
8836
8837         /*
8838          * If we're doing polled IO and end up having requests being
8839          * submitted async (out-of-line), then completions can come in while
8840          * we're waiting for refs to drop. We need to reap these manually,
8841          * as nobody else will be looking for them.
8842          */
8843         do {
8844                 io_uring_try_cancel_requests(ctx, NULL, NULL);
8845         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8846         io_ring_ctx_free(ctx);
8847 }
8848
8849 static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8850 {
8851         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8852
8853         return req->ctx == data;
8854 }
8855
8856 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8857 {
8858         mutex_lock(&ctx->uring_lock);
8859         percpu_ref_kill(&ctx->refs);
8860
8861         if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
8862                 ctx->sqo_dead = 1;
8863
8864         /* if force is set, the ring is going away. always drop after that */
8865         ctx->cq_overflow_flushed = 1;
8866         if (ctx->rings)
8867                 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
8868         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
8869         mutex_unlock(&ctx->uring_lock);
8870
8871         io_kill_timeouts(ctx, NULL, NULL);
8872         io_poll_remove_all(ctx, NULL, NULL);
8873
8874         if (ctx->io_wq)
8875                 io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
8876
8877         /* if we failed setting up the ctx, we might not have any rings */
8878         io_iopoll_try_reap_events(ctx);
8879
8880         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8881         /*
8882          * Use system_unbound_wq to avoid spawning tons of event kworkers
8883          * if we're exiting a ton of rings at the same time. It just adds
8884          * noise and overhead, there's no discernable change in runtime
8885          * over using system_wq.
8886          */
8887         queue_work(system_unbound_wq, &ctx->exit_work);
8888 }
8889
8890 static int io_uring_release(struct inode *inode, struct file *file)
8891 {
8892         struct io_ring_ctx *ctx = file->private_data;
8893
8894         file->private_data = NULL;
8895         io_ring_ctx_wait_and_kill(ctx);
8896         return 0;
8897 }
8898
/* Match criteria handed to io_cancel_task_cb() through io_wq_cancel_cb(). */
struct io_task_cancel {
        /* task passed on to io_match_task() */
        struct task_struct *task;
        /* files passed on to io_match_task(); may be NULL */
        struct files_struct *files;
};
8903
8904 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8905 {
8906         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8907         struct io_task_cancel *cancel = data;
8908         bool ret;
8909
8910         if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8911                 unsigned long flags;
8912                 struct io_ring_ctx *ctx = req->ctx;
8913
8914                 /* protect against races with linked timeouts */
8915                 spin_lock_irqsave(&ctx->completion_lock, flags);
8916                 ret = io_match_task(req, cancel->task, cancel->files);
8917                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8918         } else {
8919                 ret = io_match_task(req, cancel->task, cancel->files);
8920         }
8921         return ret;
8922 }
8923
8924 static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8925                                   struct task_struct *task,
8926                                   struct files_struct *files)
8927 {
8928         struct io_defer_entry *de = NULL;
8929         LIST_HEAD(list);
8930
8931         spin_lock_irq(&ctx->completion_lock);
8932         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8933                 if (io_match_task(de->req, task, files)) {
8934                         list_cut_position(&list, &ctx->defer_list, &de->list);
8935                         break;
8936                 }
8937         }
8938         spin_unlock_irq(&ctx->completion_lock);
8939
8940         while (!list_empty(&list)) {
8941                 de = list_first_entry(&list, struct io_defer_entry, list);
8942                 list_del_init(&de->list);
8943                 req_set_fail_links(de->req);
8944                 io_put_req(de->req);
8945                 io_req_complete(de->req, -ECANCELED);
8946                 kfree(de);
8947         }
8948 }
8949
8950 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8951                                          struct task_struct *task,
8952                                          struct files_struct *files)
8953 {
8954         struct io_task_cancel cancel = { .task = task, .files = files, };
8955
8956         while (1) {
8957                 enum io_wq_cancel cret;
8958                 bool ret = false;
8959
8960                 if (ctx->io_wq) {
8961                         cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
8962                                                &cancel, true);
8963                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8964                 }
8965
8966                 /* SQPOLL thread does its own polling */
8967                 if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
8968                         while (!list_empty_careful(&ctx->iopoll_list)) {
8969                                 io_iopoll_try_reap_events(ctx);
8970                                 ret = true;
8971                         }
8972                 }
8973
8974                 ret |= io_poll_remove_all(ctx, task, files);
8975                 ret |= io_kill_timeouts(ctx, task, files);
8976                 ret |= io_run_task_work();
8977                 io_cqring_overflow_flush(ctx, true, task, files);
8978                 if (!ret)
8979                         break;
8980                 cond_resched();
8981         }
8982 }
8983
8984 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8985                                    struct task_struct *task,
8986                                    struct files_struct *files)
8987 {
8988         struct io_kiocb *req;
8989         int cnt = 0;
8990
8991         spin_lock_irq(&ctx->inflight_lock);
8992         list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8993                 cnt += io_match_task(req, task, files);
8994         spin_unlock_irq(&ctx->inflight_lock);
8995         return cnt;
8996 }
8997
8998 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
8999                                   struct task_struct *task,
9000                                   struct files_struct *files)
9001 {
9002         while (!list_empty_careful(&ctx->inflight_list)) {
9003                 DEFINE_WAIT(wait);
9004                 int inflight;
9005
9006                 inflight = io_uring_count_inflight(ctx, task, files);
9007                 if (!inflight)
9008                         break;
9009
9010                 io_uring_try_cancel_requests(ctx, task, files);
9011
9012                 if (ctx->sq_data)
9013                         io_sq_thread_unpark(ctx->sq_data);
9014                 prepare_to_wait(&task->io_uring->wait, &wait,
9015                                 TASK_UNINTERRUPTIBLE);
9016                 if (inflight == io_uring_count_inflight(ctx, task, files))
9017                         schedule();
9018                 finish_wait(&task->io_uring->wait, &wait);
9019                 if (ctx->sq_data)
9020                         io_sq_thread_park(ctx->sq_data);
9021         }
9022 }
9023
9024 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
9025 {
9026         mutex_lock(&ctx->uring_lock);
9027         ctx->sqo_dead = 1;
9028         mutex_unlock(&ctx->uring_lock);
9029
9030         /* make sure callers enter the ring to get error */
9031         if (ctx->rings)
9032                 io_ring_set_wakeup_flag(ctx);
9033 }
9034
9035 /*
9036  * We need to iteratively cancel requests, in case a request has dependent
9037  * hard links. These persist even for failure of cancelations, hence keep
9038  * looping until none are found.
9039  */
9040 static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
9041                                           struct files_struct *files)
9042 {
9043         struct task_struct *task = current;
9044
9045         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
9046                 io_disable_sqo_submit(ctx);
9047                 task = ctx->sq_data->thread;
9048                 atomic_inc(&task->io_uring->in_idle);
9049                 io_sq_thread_park(ctx->sq_data);
9050         }
9051
9052         io_cancel_defer_files(ctx, task, files);
9053
9054         io_uring_cancel_files(ctx, task, files);
9055         if (!files)
9056                 io_uring_try_cancel_requests(ctx, task, NULL);
9057
9058         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
9059                 atomic_dec(&task->io_uring->in_idle);
9060                 /*
9061                  * If the files that are going away are the ones in the thread
9062                  * identity, clear them out.
9063                  */
9064                 if (task->io_uring->identity->files == files)
9065                         task->io_uring->identity->files = NULL;
9066                 io_sq_thread_unpark(ctx->sq_data);
9067         }
9068 }
9069
9070 /*
9071  * Note that this task has used io_uring. We use it for cancelation purposes.
9072  */
9073 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
9074 {
9075         struct io_uring_task *tctx = current->io_uring;
9076         int ret;
9077
9078         if (unlikely(!tctx)) {
9079                 ret = io_uring_alloc_task_context(current);
9080                 if (unlikely(ret))
9081                         return ret;
9082                 tctx = current->io_uring;
9083         }
9084         if (tctx->last != file) {
9085                 void *old = xa_load(&tctx->xa, (unsigned long)file);
9086
9087                 if (!old) {
9088                         get_file(file);
9089                         ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
9090                                                 file, GFP_KERNEL));
9091                         if (ret) {
9092                                 fput(file);
9093                                 return ret;
9094                         }
9095
9096                         /* one and only SQPOLL file note, held by sqo_task */
9097                         WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
9098                                      current != ctx->sqo_task);
9099                 }
9100                 tctx->last = file;
9101         }
9102
9103         /*
9104          * This is race safe in that the task itself is doing this, hence it
9105          * cannot be going through the exit/cancel paths at the same time.
9106          * This cannot be modified while exit/cancel is running.
9107          */
9108         if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
9109                 tctx->sqpoll = true;
9110
9111         return 0;
9112 }
9113
9114 /*
9115  * Remove this io_uring_file -> task mapping.
9116  */
9117 static void io_uring_del_task_file(struct file *file)
9118 {
9119         struct io_uring_task *tctx = current->io_uring;
9120
9121         if (tctx->last == file)
9122                 tctx->last = NULL;
9123         file = xa_erase(&tctx->xa, (unsigned long)file);
9124         if (file)
9125                 fput(file);
9126 }
9127
9128 static void io_uring_remove_task_files(struct io_uring_task *tctx)
9129 {
9130         struct file *file;
9131         unsigned long index;
9132
9133         xa_for_each(&tctx->xa, index, file)
9134                 io_uring_del_task_file(file);
9135 }
9136
9137 void __io_uring_files_cancel(struct files_struct *files)
9138 {
9139         struct io_uring_task *tctx = current->io_uring;
9140         struct file *file;
9141         unsigned long index;
9142
9143         /* make sure overflow events are dropped */
9144         atomic_inc(&tctx->in_idle);
9145         xa_for_each(&tctx->xa, index, file)
9146                 io_uring_cancel_task_requests(file->private_data, files);
9147         atomic_dec(&tctx->in_idle);
9148
9149         if (files)
9150                 io_uring_remove_task_files(tctx);
9151 }
9152
9153 static s64 tctx_inflight(struct io_uring_task *tctx)
9154 {
9155         return percpu_counter_sum(&tctx->inflight);
9156 }
9157
9158 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
9159 {
9160         struct io_uring_task *tctx;
9161         s64 inflight;
9162         DEFINE_WAIT(wait);
9163
9164         if (!ctx->sq_data)
9165                 return;
9166         tctx = ctx->sq_data->thread->io_uring;
9167         io_disable_sqo_submit(ctx);
9168
9169         atomic_inc(&tctx->in_idle);
9170         do {
9171                 /* read completions before cancelations */
9172                 inflight = tctx_inflight(tctx);
9173                 if (!inflight)
9174                         break;
9175                 io_uring_cancel_task_requests(ctx, NULL);
9176
9177                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9178                 /*
9179                  * If we've seen completions, retry without waiting. This
9180                  * avoids a race where a completion comes in before we did
9181                  * prepare_to_wait().
9182                  */
9183                 if (inflight == tctx_inflight(tctx))
9184                         schedule();
9185                 finish_wait(&tctx->wait, &wait);
9186         } while (1);
9187         atomic_dec(&tctx->in_idle);
9188 }
9189
9190 /*
9191  * Find any io_uring fd that this task has registered or done IO on, and cancel
9192  * requests.
9193  */
9194 void __io_uring_task_cancel(void)
9195 {
9196         struct io_uring_task *tctx = current->io_uring;
9197         DEFINE_WAIT(wait);
9198         s64 inflight;
9199
9200         /* make sure overflow events are dropped */
9201         atomic_inc(&tctx->in_idle);
9202
9203         /* trigger io_disable_sqo_submit() */
9204         if (tctx->sqpoll) {
9205                 struct file *file;
9206                 unsigned long index;
9207
9208                 xa_for_each(&tctx->xa, index, file)
9209                         io_uring_cancel_sqpoll(file->private_data);
9210         }
9211
9212         do {
9213                 /* read completions before cancelations */
9214                 inflight = tctx_inflight(tctx);
9215                 if (!inflight)
9216                         break;
9217                 __io_uring_files_cancel(NULL);
9218
9219                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9220
9221                 /*
9222                  * If we've seen completions, retry without waiting. This
9223                  * avoids a race where a completion comes in before we did
9224                  * prepare_to_wait().
9225                  */
9226                 if (inflight == tctx_inflight(tctx))
9227                         schedule();
9228                 finish_wait(&tctx->wait, &wait);
9229         } while (1);
9230
9231         atomic_dec(&tctx->in_idle);
9232
9233         io_uring_remove_task_files(tctx);
9234 }
9235
9236 static int io_uring_flush(struct file *file, void *data)
9237 {
9238         struct io_uring_task *tctx = current->io_uring;
9239         struct io_ring_ctx *ctx = file->private_data;
9240
9241         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
9242                 io_uring_cancel_task_requests(ctx, NULL);
9243                 io_req_caches_free(ctx, current);
9244         }
9245
9246         if (!tctx)
9247                 return 0;
9248
9249         /* we should have cancelled and erased it before PF_EXITING */
9250         WARN_ON_ONCE((current->flags & PF_EXITING) &&
9251                      xa_load(&tctx->xa, (unsigned long)file));
9252
9253         /*
9254          * fput() is pending, will be 2 if the only other ref is our potential
9255          * task file note. If the task is exiting, drop regardless of count.
9256          */
9257         if (atomic_long_read(&file->f_count) != 2)
9258                 return 0;
9259
9260         if (ctx->flags & IORING_SETUP_SQPOLL) {
9261                 /* there is only one file note, which is owned by sqo_task */
9262                 WARN_ON_ONCE(ctx->sqo_task != current &&
9263                              xa_load(&tctx->xa, (unsigned long)file));
9264                 /* sqo_dead check is for when this happens after cancellation */
9265                 WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
9266                              !xa_load(&tctx->xa, (unsigned long)file));
9267
9268                 io_disable_sqo_submit(ctx);
9269         }
9270
9271         if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
9272                 io_uring_del_task_file(file);
9273         return 0;
9274 }
9275
9276 static void *io_uring_validate_mmap_request(struct file *file,
9277                                             loff_t pgoff, size_t sz)
9278 {
9279         struct io_ring_ctx *ctx = file->private_data;
9280         loff_t offset = pgoff << PAGE_SHIFT;
9281         struct page *page;
9282         void *ptr;
9283
9284         switch (offset) {
9285         case IORING_OFF_SQ_RING:
9286         case IORING_OFF_CQ_RING:
9287                 ptr = ctx->rings;
9288                 break;
9289         case IORING_OFF_SQES:
9290                 ptr = ctx->sq_sqes;
9291                 break;
9292         default:
9293                 return ERR_PTR(-EINVAL);
9294         }
9295
9296         page = virt_to_head_page(ptr);
9297         if (sz > page_size(page))
9298                 return ERR_PTR(-EINVAL);
9299
9300         return ptr;
9301 }
9302
9303 #ifdef CONFIG_MMU
9304
9305 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9306 {
9307         size_t sz = vma->vm_end - vma->vm_start;
9308         unsigned long pfn;
9309         void *ptr;
9310
9311         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9312         if (IS_ERR(ptr))
9313                 return PTR_ERR(ptr);
9314
9315         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9316         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9317 }
9318
9319 #else /* !CONFIG_MMU */
9320
9321 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9322 {
9323         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9324 }
9325
9326 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9327 {
9328         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9329 }
9330
/*
 * get_unmapped_area handler (!CONFIG_MMU): the mapping address is the kernel
 * address of the requested region itself. Returns the error code from
 * io_uring_validate_mmap_request() if the request is rejected.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        if (IS_ERR(region))
                return PTR_ERR(region);

        return (unsigned long) region;
}
9343
9344 #endif /* !CONFIG_MMU */
9345
9346 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9347 {
9348         int ret = 0;
9349         DEFINE_WAIT(wait);
9350
9351         do {
9352                 if (!io_sqring_full(ctx))
9353                         break;
9354
9355                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9356
9357                 if (unlikely(ctx->sqo_dead)) {
9358                         ret = -EOWNERDEAD;
9359                         goto out;
9360                 }
9361
9362                 if (!io_sqring_full(ctx))
9363                         break;
9364
9365                 schedule();
9366         } while (!signal_pending(current));
9367
9368         finish_wait(&ctx->sqo_sq_wait, &wait);
9369 out:
9370         return ret;
9371 }
9372
9373 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9374                           struct __kernel_timespec __user **ts,
9375                           const sigset_t __user **sig)
9376 {
9377         struct io_uring_getevents_arg arg;
9378
9379         /*
9380          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9381          * is just a pointer to the sigset_t.
9382          */
9383         if (!(flags & IORING_ENTER_EXT_ARG)) {
9384                 *sig = (const sigset_t __user *) argp;
9385                 *ts = NULL;
9386                 return 0;
9387         }
9388
9389         /*
9390          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9391          * timespec and sigset_t pointers if good.
9392          */
9393         if (*argsz != sizeof(arg))
9394                 return -EINVAL;
9395         if (copy_from_user(&arg, argp, sizeof(arg)))
9396                 return -EFAULT;
9397         *sig = u64_to_user_ptr(arg.sigmask);
9398         *argsz = arg.sigmask_sz;
9399         *ts = u64_to_user_ptr(arg.ts);
9400         return 0;
9401 }
9402
9403 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9404                 u32, min_complete, u32, flags, const void __user *, argp,
9405                 size_t, argsz)
9406 {
9407         struct io_ring_ctx *ctx;
9408         long ret = -EBADF;
9409         int submitted = 0;
9410         struct fd f;
9411
9412         io_run_task_work();
9413
9414         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9415                         IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9416                 return -EINVAL;
9417
9418         f = fdget(fd);
9419         if (!f.file)
9420                 return -EBADF;
9421
9422         ret = -EOPNOTSUPP;
9423         if (f.file->f_op != &io_uring_fops)
9424                 goto out_fput;
9425
9426         ret = -ENXIO;
9427         ctx = f.file->private_data;
9428         if (!percpu_ref_tryget(&ctx->refs))
9429                 goto out_fput;
9430
9431         ret = -EBADFD;
9432         if (ctx->flags & IORING_SETUP_R_DISABLED)
9433                 goto out;
9434
9435         /*
9436          * For SQ polling, the thread will do all submissions and completions.
9437          * Just return the requested submit count, and wake the thread if
9438          * we were asked to.
9439          */
9440         ret = 0;
9441         if (ctx->flags & IORING_SETUP_SQPOLL) {
9442                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
9443
9444                 ret = -EOWNERDEAD;
9445                 if (unlikely(ctx->sqo_dead))
9446                         goto out;
9447                 if (flags & IORING_ENTER_SQ_WAKEUP)
9448                         wake_up(&ctx->sq_data->wait);
9449                 if (flags & IORING_ENTER_SQ_WAIT) {
9450                         ret = io_sqpoll_wait_sq(ctx);
9451                         if (ret)
9452                                 goto out;
9453                 }
9454                 submitted = to_submit;
9455         } else if (to_submit) {
9456                 ret = io_uring_add_task_file(ctx, f.file);
9457                 if (unlikely(ret))
9458                         goto out;
9459                 mutex_lock(&ctx->uring_lock);
9460                 submitted = io_submit_sqes(ctx, to_submit);
9461                 mutex_unlock(&ctx->uring_lock);
9462
9463                 if (submitted != to_submit)
9464                         goto out;
9465         }
9466         if (flags & IORING_ENTER_GETEVENTS) {
9467                 const sigset_t __user *sig;
9468                 struct __kernel_timespec __user *ts;
9469
9470                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9471                 if (unlikely(ret))
9472                         goto out;
9473
9474                 min_complete = min(min_complete, ctx->cq_entries);
9475
9476                 /*
9477                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9478                  * space applications don't need to do io completion events
9479                  * polling again, they can rely on io_sq_thread to do polling
9480                  * work, which can reduce cpu usage and uring_lock contention.
9481                  */
9482                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9483                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9484                         ret = io_iopoll_check(ctx, min_complete);
9485                 } else {
9486                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9487                 }
9488         }
9489
9490 out:
9491         percpu_ref_put(&ctx->refs);
9492 out_fput:
9493         fdput(f);
9494         return submitted ? submitted : ret;
9495 }
9496
9497 #ifdef CONFIG_PROC_FS
9498 static int io_uring_show_cred(int id, void *p, void *data)
9499 {
9500         struct io_identity *iod = p;
9501         const struct cred *cred = iod->creds;
9502         struct seq_file *m = data;
9503         struct user_namespace *uns = seq_user_ns(m);
9504         struct group_info *gi;
9505         kernel_cap_t cap;
9506         unsigned __capi;
9507         int g;
9508
9509         seq_printf(m, "%5d\n", id);
9510         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9511         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9512         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9513         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9514         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9515         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9516         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9517         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9518         seq_puts(m, "\n\tGroups:\t");
9519         gi = cred->group_info;
9520         for (g = 0; g < gi->ngroups; g++) {
9521                 seq_put_decimal_ull(m, g ? " " : "",
9522                                         from_kgid_munged(uns, gi->gid[g]));
9523         }
9524         seq_puts(m, "\n\tCapEff:\t");
9525         cap = cred->cap_effective;
9526         CAP_FOR_EACH_U32(__capi)
9527                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9528         seq_putc(m, '\n');
9529         return 0;
9530 }
9531
9532 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9533 {
9534         struct io_sq_data *sq = NULL;
9535         bool has_lock;
9536         int i;
9537
9538         /*
9539          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9540          * since fdinfo case grabs it in the opposite direction of normal use
9541          * cases. If we fail to get the lock, we just don't iterate any
9542          * structures that could be going away outside the io_uring mutex.
9543          */
9544         has_lock = mutex_trylock(&ctx->uring_lock);
9545
9546         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
9547                 sq = ctx->sq_data;
9548
9549         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9550         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9551         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9552         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9553                 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
9554
9555                 if (f)
9556                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9557                 else
9558                         seq_printf(m, "%5u: <none>\n", i);
9559         }
9560         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9561         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9562                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9563
9564                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9565                                                 (unsigned int) buf->len);
9566         }
9567         if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
9568                 seq_printf(m, "Personalities:\n");
9569                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9570         }
9571         seq_printf(m, "PollList:\n");
9572         spin_lock_irq(&ctx->completion_lock);
9573         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9574                 struct hlist_head *list = &ctx->cancel_hash[i];
9575                 struct io_kiocb *req;
9576
9577                 hlist_for_each_entry(req, list, hash_node)
9578                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9579                                         req->task->task_works != NULL);
9580         }
9581         spin_unlock_irq(&ctx->completion_lock);
9582         if (has_lock)
9583                 mutex_unlock(&ctx->uring_lock);
9584 }
9585
9586 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9587 {
9588         struct io_ring_ctx *ctx = f->private_data;
9589
9590         if (percpu_ref_tryget(&ctx->refs)) {
9591                 __io_uring_show_fdinfo(ctx, m);
9592                 percpu_ref_put(&ctx->refs);
9593         }
9594 }
9595 #endif
9596
9597 static const struct file_operations io_uring_fops = {
9598         .release        = io_uring_release,
9599         .flush          = io_uring_flush,
9600         .mmap           = io_uring_mmap,
9601 #ifndef CONFIG_MMU
9602         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9603         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9604 #endif
9605         .poll           = io_uring_poll,
9606         .fasync         = io_uring_fasync,
9607 #ifdef CONFIG_PROC_FS
9608         .show_fdinfo    = io_uring_show_fdinfo,
9609 #endif
9610 };
9611
9612 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9613                                   struct io_uring_params *p)
9614 {
9615         struct io_rings *rings;
9616         size_t size, sq_array_offset;
9617
9618         /* make sure these are sane, as we already accounted them */
9619         ctx->sq_entries = p->sq_entries;
9620         ctx->cq_entries = p->cq_entries;
9621
9622         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9623         if (size == SIZE_MAX)
9624                 return -EOVERFLOW;
9625
9626         rings = io_mem_alloc(size);
9627         if (!rings)
9628                 return -ENOMEM;
9629
9630         ctx->rings = rings;
9631         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9632         rings->sq_ring_mask = p->sq_entries - 1;
9633         rings->cq_ring_mask = p->cq_entries - 1;
9634         rings->sq_ring_entries = p->sq_entries;
9635         rings->cq_ring_entries = p->cq_entries;
9636         ctx->sq_mask = rings->sq_ring_mask;
9637         ctx->cq_mask = rings->cq_ring_mask;
9638
9639         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9640         if (size == SIZE_MAX) {
9641                 io_mem_free(ctx->rings);
9642                 ctx->rings = NULL;
9643                 return -EOVERFLOW;
9644         }
9645
9646         ctx->sq_sqes = io_mem_alloc(size);
9647         if (!ctx->sq_sqes) {
9648                 io_mem_free(ctx->rings);
9649                 ctx->rings = NULL;
9650                 return -ENOMEM;
9651         }
9652
9653         return 0;
9654 }
9655
9656 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9657 {
9658         int ret, fd;
9659
9660         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9661         if (fd < 0)
9662                 return fd;
9663
9664         ret = io_uring_add_task_file(ctx, file);
9665         if (ret) {
9666                 put_unused_fd(fd);
9667                 return ret;
9668         }
9669         fd_install(fd, file);
9670         return fd;
9671 }
9672
9673 /*
9674  * Allocate an anonymous fd, this is what constitutes the application
9675  * visible backing of an io_uring instance. The application mmaps this
9676  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9677  * we have to tie this fd to a socket for file garbage collection purposes.
9678  */
9679 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9680 {
9681         struct file *file;
9682 #if defined(CONFIG_UNIX)
9683         int ret;
9684
9685         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9686                                 &ctx->ring_sock);
9687         if (ret)
9688                 return ERR_PTR(ret);
9689 #endif
9690
9691         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9692                                         O_RDWR | O_CLOEXEC);
9693 #if defined(CONFIG_UNIX)
9694         if (IS_ERR(file)) {
9695                 sock_release(ctx->ring_sock);
9696                 ctx->ring_sock = NULL;
9697         } else {
9698                 ctx->ring_sock->file = file;
9699         }
9700 #endif
9701         return file;
9702 }
9703
9704 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9705                            struct io_uring_params __user *params)
9706 {
9707         struct user_struct *user = NULL;
9708         struct io_ring_ctx *ctx;
9709         struct file *file;
9710         int ret;
9711
9712         if (!entries)
9713                 return -EINVAL;
9714         if (entries > IORING_MAX_ENTRIES) {
9715                 if (!(p->flags & IORING_SETUP_CLAMP))
9716                         return -EINVAL;
9717                 entries = IORING_MAX_ENTRIES;
9718         }
9719
9720         /*
9721          * Use twice as many entries for the CQ ring. It's possible for the
9722          * application to drive a higher depth than the size of the SQ ring,
9723          * since the sqes are only used at submission time. This allows for
9724          * some flexibility in overcommitting a bit. If the application has
9725          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9726          * of CQ ring entries manually.
9727          */
9728         p->sq_entries = roundup_pow_of_two(entries);
9729         if (p->flags & IORING_SETUP_CQSIZE) {
9730                 /*
9731                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9732                  * to a power-of-two, if it isn't already. We do NOT impose
9733                  * any cq vs sq ring sizing.
9734                  */
9735                 if (!p->cq_entries)
9736                         return -EINVAL;
9737                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9738                         if (!(p->flags & IORING_SETUP_CLAMP))
9739                                 return -EINVAL;
9740                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9741                 }
9742                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9743                 if (p->cq_entries < p->sq_entries)
9744                         return -EINVAL;
9745         } else {
9746                 p->cq_entries = 2 * p->sq_entries;
9747         }
9748
9749         user = get_uid(current_user());
9750
9751         ctx = io_ring_ctx_alloc(p);
9752         if (!ctx) {
9753                 free_uid(user);
9754                 return -ENOMEM;
9755         }
9756         ctx->compat = in_compat_syscall();
9757         ctx->limit_mem = !capable(CAP_IPC_LOCK);
9758         ctx->user = user;
9759         ctx->creds = get_current_cred();
9760 #ifdef CONFIG_AUDIT
9761         ctx->loginuid = current->loginuid;
9762         ctx->sessionid = current->sessionid;
9763 #endif
9764         ctx->sqo_task = get_task_struct(current);
9765
9766         /*
9767          * This is just grabbed for accounting purposes. When a process exits,
9768          * the mm is exited and dropped before the files, hence we need to hang
9769          * on to this mm purely for the purposes of being able to unaccount
9770          * memory (locked/pinned vm). It's not used for anything else.
9771          */
9772         mmgrab(current->mm);
9773         ctx->mm_account = current->mm;
9774
9775 #ifdef CONFIG_BLK_CGROUP
9776         /*
9777          * The sq thread will belong to the original cgroup it was inited in.
9778          * If the cgroup goes offline (e.g. disabling the io controller), then
9779          * issued bios will be associated with the closest cgroup later in the
9780          * block layer.
9781          */
9782         rcu_read_lock();
9783         ctx->sqo_blkcg_css = blkcg_css();
9784         ret = css_tryget_online(ctx->sqo_blkcg_css);
9785         rcu_read_unlock();
9786         if (!ret) {
9787                 /* don't init against a dying cgroup, have the user try again */
9788                 ctx->sqo_blkcg_css = NULL;
9789                 ret = -ENODEV;
9790                 goto err;
9791         }
9792 #endif
9793         ret = io_allocate_scq_urings(ctx, p);
9794         if (ret)
9795                 goto err;
9796
9797         ret = io_sq_offload_create(ctx, p);
9798         if (ret)
9799                 goto err;
9800
9801         if (!(p->flags & IORING_SETUP_R_DISABLED))
9802                 io_sq_offload_start(ctx);
9803
9804         memset(&p->sq_off, 0, sizeof(p->sq_off));
9805         p->sq_off.head = offsetof(struct io_rings, sq.head);
9806         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9807         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9808         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9809         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9810         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9811         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9812
9813         memset(&p->cq_off, 0, sizeof(p->cq_off));
9814         p->cq_off.head = offsetof(struct io_rings, cq.head);
9815         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9816         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9817         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9818         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9819         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9820         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9821
9822         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9823                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9824                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9825                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9826                         IORING_FEAT_EXT_ARG;
9827
9828         if (copy_to_user(params, p, sizeof(*p))) {
9829                 ret = -EFAULT;
9830                 goto err;
9831         }
9832
9833         file = io_uring_get_file(ctx);
9834         if (IS_ERR(file)) {
9835                 ret = PTR_ERR(file);
9836                 goto err;
9837         }
9838
9839         /*
9840          * Install ring fd as the very last thing, so we don't risk someone
9841          * having closed it before we finish setup
9842          */
9843         ret = io_uring_install_fd(ctx, file);
9844         if (ret < 0) {
9845                 io_disable_sqo_submit(ctx);
9846                 /* fput will clean it up */
9847                 fput(file);
9848                 return ret;
9849         }
9850
9851         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9852         return ret;
9853 err:
9854         io_disable_sqo_submit(ctx);
9855         io_ring_ctx_wait_and_kill(ctx);
9856         return ret;
9857 }
9858
9859 /*
9860  * Sets up an aio uring context, and returns the fd. Applications asks for a
9861  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9862  * params structure passed in.
9863  */
9864 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9865 {
9866         struct io_uring_params p;
9867         int i;
9868
9869         if (copy_from_user(&p, params, sizeof(p)))
9870                 return -EFAULT;
9871         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9872                 if (p.resv[i])
9873                         return -EINVAL;
9874         }
9875
9876         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9877                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9878                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9879                         IORING_SETUP_R_DISABLED))
9880                 return -EINVAL;
9881
9882         return  io_uring_create(entries, &p, params);
9883 }
9884
9885 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9886                 struct io_uring_params __user *, params)
9887 {
9888         return io_uring_setup(entries, params);
9889 }
9890
9891 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9892 {
9893         struct io_uring_probe *p;
9894         size_t size;
9895         int i, ret;
9896
9897         size = struct_size(p, ops, nr_args);
9898         if (size == SIZE_MAX)
9899                 return -EOVERFLOW;
9900         p = kzalloc(size, GFP_KERNEL);
9901         if (!p)
9902                 return -ENOMEM;
9903
9904         ret = -EFAULT;
9905         if (copy_from_user(p, arg, size))
9906                 goto out;
9907         ret = -EINVAL;
9908         if (memchr_inv(p, 0, size))
9909                 goto out;
9910
9911         p->last_op = IORING_OP_LAST - 1;
9912         if (nr_args > IORING_OP_LAST)
9913                 nr_args = IORING_OP_LAST;
9914
9915         for (i = 0; i < nr_args; i++) {
9916                 p->ops[i].op = i;
9917                 if (!io_op_defs[i].not_supported)
9918                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9919         }
9920         p->ops_len = i;
9921
9922         ret = 0;
9923         if (copy_to_user(arg, p, size))
9924                 ret = -EFAULT;
9925 out:
9926         kfree(p);
9927         return ret;
9928 }
9929
9930 static int io_register_personality(struct io_ring_ctx *ctx)
9931 {
9932         struct io_identity *id;
9933         int ret;
9934
9935         id = kmalloc(sizeof(*id), GFP_KERNEL);
9936         if (unlikely(!id))
9937                 return -ENOMEM;
9938
9939         io_init_identity(id);
9940         id->creds = get_current_cred();
9941
9942         ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
9943         if (ret < 0) {
9944                 put_cred(id->creds);
9945                 kfree(id);
9946         }
9947         return ret;
9948 }
9949
9950 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9951                                     unsigned int nr_args)
9952 {
9953         struct io_uring_restriction *res;
9954         size_t size;
9955         int i, ret;
9956
9957         /* Restrictions allowed only if rings started disabled */
9958         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9959                 return -EBADFD;
9960
9961         /* We allow only a single restrictions registration */
9962         if (ctx->restrictions.registered)
9963                 return -EBUSY;
9964
9965         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9966                 return -EINVAL;
9967
9968         size = array_size(nr_args, sizeof(*res));
9969         if (size == SIZE_MAX)
9970                 return -EOVERFLOW;
9971
9972         res = memdup_user(arg, size);
9973         if (IS_ERR(res))
9974                 return PTR_ERR(res);
9975
9976         ret = 0;
9977
9978         for (i = 0; i < nr_args; i++) {
9979                 switch (res[i].opcode) {
9980                 case IORING_RESTRICTION_REGISTER_OP:
9981                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9982                                 ret = -EINVAL;
9983                                 goto out;
9984                         }
9985
9986                         __set_bit(res[i].register_op,
9987                                   ctx->restrictions.register_op);
9988                         break;
9989                 case IORING_RESTRICTION_SQE_OP:
9990                         if (res[i].sqe_op >= IORING_OP_LAST) {
9991                                 ret = -EINVAL;
9992                                 goto out;
9993                         }
9994
9995                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9996                         break;
9997                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9998                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9999                         break;
10000                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10001                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10002                         break;
10003                 default:
10004                         ret = -EINVAL;
10005                         goto out;
10006                 }
10007         }
10008
10009 out:
10010         /* Reset all restrictions if an error happened */
10011         if (ret != 0)
10012                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10013         else
10014                 ctx->restrictions.registered = true;
10015
10016         kfree(res);
10017         return ret;
10018 }
10019
10020 static int io_register_enable_rings(struct io_ring_ctx *ctx)
10021 {
10022         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10023                 return -EBADFD;
10024
10025         if (ctx->restrictions.registered)
10026                 ctx->restricted = 1;
10027
10028         ctx->flags &= ~IORING_SETUP_R_DISABLED;
10029
10030         io_sq_offload_start(ctx);
10031
10032         return 0;
10033 }
10034
10035 static bool io_register_op_must_quiesce(int op)
10036 {
10037         switch (op) {
10038         case IORING_UNREGISTER_FILES:
10039         case IORING_REGISTER_FILES_UPDATE:
10040         case IORING_REGISTER_PROBE:
10041         case IORING_REGISTER_PERSONALITY:
10042         case IORING_UNREGISTER_PERSONALITY:
10043                 return false;
10044         default:
10045                 return true;
10046         }
10047 }
10048
10049 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10050                                void __user *arg, unsigned nr_args)
10051         __releases(ctx->uring_lock)
10052         __acquires(ctx->uring_lock)
10053 {
10054         int ret;
10055
10056         /*
10057          * We're inside the ring mutex, if the ref is already dying, then
10058          * someone else killed the ctx or is already going through
10059          * io_uring_register().
10060          */
10061         if (percpu_ref_is_dying(&ctx->refs))
10062                 return -ENXIO;
10063
10064         if (io_register_op_must_quiesce(opcode)) {
10065                 percpu_ref_kill(&ctx->refs);
10066
10067                 /*
10068                  * Drop uring mutex before waiting for references to exit. If
10069                  * another thread is currently inside io_uring_enter() it might
10070                  * need to grab the uring_lock to make progress. If we hold it
10071                  * here across the drain wait, then we can deadlock. It's safe
10072                  * to drop the mutex here, since no new references will come in
10073                  * after we've killed the percpu ref.
10074                  */
10075                 mutex_unlock(&ctx->uring_lock);
10076                 do {
10077                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
10078                         if (!ret)
10079                                 break;
10080                         ret = io_run_task_work_sig();
10081                         if (ret < 0)
10082                                 break;
10083                 } while (1);
10084
10085                 mutex_lock(&ctx->uring_lock);
10086
10087                 if (ret) {
10088                         percpu_ref_resurrect(&ctx->refs);
10089                         goto out_quiesce;
10090                 }
10091         }
10092
10093         if (ctx->restricted) {
10094                 if (opcode >= IORING_REGISTER_LAST) {
10095                         ret = -EINVAL;
10096                         goto out;
10097                 }
10098
10099                 if (!test_bit(opcode, ctx->restrictions.register_op)) {
10100                         ret = -EACCES;
10101                         goto out;
10102                 }
10103         }
10104
10105         switch (opcode) {
10106         case IORING_REGISTER_BUFFERS:
10107                 ret = io_sqe_buffers_register(ctx, arg, nr_args);
10108                 break;
10109         case IORING_UNREGISTER_BUFFERS:
10110                 ret = -EINVAL;
10111                 if (arg || nr_args)
10112                         break;
10113                 ret = io_sqe_buffers_unregister(ctx);
10114                 break;
10115         case IORING_REGISTER_FILES:
10116                 ret = io_sqe_files_register(ctx, arg, nr_args);
10117                 break;
10118         case IORING_UNREGISTER_FILES:
10119                 ret = -EINVAL;
10120                 if (arg || nr_args)
10121                         break;
10122                 ret = io_sqe_files_unregister(ctx);
10123                 break;
10124         case IORING_REGISTER_FILES_UPDATE:
10125                 ret = io_sqe_files_update(ctx, arg, nr_args);
10126                 break;
10127         case IORING_REGISTER_EVENTFD:
10128         case IORING_REGISTER_EVENTFD_ASYNC:
10129                 ret = -EINVAL;
10130                 if (nr_args != 1)
10131                         break;
10132                 ret = io_eventfd_register(ctx, arg);
10133                 if (ret)
10134                         break;
10135                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10136                         ctx->eventfd_async = 1;
10137                 else
10138                         ctx->eventfd_async = 0;
10139                 break;
10140         case IORING_UNREGISTER_EVENTFD:
10141                 ret = -EINVAL;
10142                 if (arg || nr_args)
10143                         break;
10144                 ret = io_eventfd_unregister(ctx);
10145                 break;
10146         case IORING_REGISTER_PROBE:
10147                 ret = -EINVAL;
10148                 if (!arg || nr_args > 256)
10149                         break;
10150                 ret = io_probe(ctx, arg, nr_args);
10151                 break;
10152         case IORING_REGISTER_PERSONALITY:
10153                 ret = -EINVAL;
10154                 if (arg || nr_args)
10155                         break;
10156                 ret = io_register_personality(ctx);
10157                 break;
10158         case IORING_UNREGISTER_PERSONALITY:
10159                 ret = -EINVAL;
10160                 if (arg)
10161                         break;
10162                 ret = io_unregister_personality(ctx, nr_args);
10163                 break;
10164         case IORING_REGISTER_ENABLE_RINGS:
10165                 ret = -EINVAL;
10166                 if (arg || nr_args)
10167                         break;
10168                 ret = io_register_enable_rings(ctx);
10169                 break;
10170         case IORING_REGISTER_RESTRICTIONS:
10171                 ret = io_register_restrictions(ctx, arg, nr_args);
10172                 break;
10173         default:
10174                 ret = -EINVAL;
10175                 break;
10176         }
10177
10178 out:
10179         if (io_register_op_must_quiesce(opcode)) {
10180                 /* bring the ctx back to life */
10181                 percpu_ref_reinit(&ctx->refs);
10182 out_quiesce:
10183                 reinit_completion(&ctx->ref_comp);
10184         }
10185         return ret;
10186 }
10187
10188 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10189                 void __user *, arg, unsigned int, nr_args)
10190 {
10191         struct io_ring_ctx *ctx;
10192         long ret = -EBADF;
10193         struct fd f;
10194
10195         f = fdget(fd);
10196         if (!f.file)
10197                 return -EBADF;
10198
10199         ret = -EOPNOTSUPP;
10200         if (f.file->f_op != &io_uring_fops)
10201                 goto out_fput;
10202
10203         ctx = f.file->private_data;
10204
10205         mutex_lock(&ctx->uring_lock);
10206         ret = __io_uring_register(ctx, opcode, arg, nr_args);
10207         mutex_unlock(&ctx->uring_lock);
10208         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10209                                                         ctx->cq_ev_fd != NULL, ret);
10210 out_fput:
10211         fdput(f);
10212         return ret;
10213 }
10214
10215 static int __init io_uring_init(void)
10216 {
10217 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10218         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10219         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10220 } while (0)
10221
10222 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10223         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10224         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10225         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
10226         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
10227         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
10228         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
10229         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
10230         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
10231         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
10232         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
10233         BUILD_BUG_SQE_ELEM(24, __u32,  len);
10234         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
10235         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
10236         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10237         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
10238         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
10239         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
10240         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
10241         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
10242         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
10243         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
10244         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
10245         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
10246         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
10247         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
10248         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
10249         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
10250         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
10251         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
10252         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
10253
10254         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
10255         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
10256         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10257                                 SLAB_ACCOUNT);
10258         return 0;
10259 };
10260 __initcall(io_uring_init);