fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/kthread.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76
  77 #define CREATE_TRACE_POINTS
  78 #include <trace/events/io_uring.h>
  79
  80 #include <uapi/linux/io_uring.h>
  81
  82 #include "internal.h"
  83 #include "io-wq.h"
  84
  85 #define IORING_MAX_ENTRIES      32768
  86 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  87
  88 /*
  89  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  90  */
  91 #define IORING_FILE_TABLE_SHIFT 9
  92 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  93 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
  94 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
  95
  96 struct io_uring {
  97         u32 head ____cacheline_aligned_in_smp;
  98         u32 tail ____cacheline_aligned_in_smp;
  99 };
 100
 101 /*
 102  * This data is shared with the application through the mmap at offsets
 103  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 104  *
 105  * The offsets to the member fields are published through struct
 106  * io_sqring_offsets when calling io_uring_setup.
 107  */
 108 struct io_rings {
 109         /*
 110          * Head and tail offsets into the ring; the offsets need to be
 111          * masked to get valid indices.
 112          *
 113          * The kernel controls head of the sq ring and the tail of the cq ring,
 114          * and the application controls tail of the sq ring and the head of the
 115          * cq ring.
 116          */
 117         struct io_uring         sq, cq;
 118         /*
 119          * Bitmasks to apply to head and tail offsets (constant, equals
 120          * ring_entries - 1)
 121          */
 122         u32                     sq_ring_mask, cq_ring_mask;
 123         /* Ring sizes (constant, power of 2) */
 124         u32                     sq_ring_entries, cq_ring_entries;
 125         /*
 126          * Number of invalid entries dropped by the kernel due to
 127          * invalid index stored in array
 128          *
 129          * Written by the kernel, shouldn't be modified by the
 130          * application (i.e. get number of "new events" by comparing to
 131          * cached value).
 132          *
 133          * After a new SQ head value was read by the application this
 134          * counter includes all submissions that were dropped reaching
 135          * the new SQ head (and possibly more).
 136          */
 137         u32                     sq_dropped;
 138         /*
 139          * Runtime flags
 140          *
 141          * Written by the kernel, shouldn't be modified by the
 142          * application.
 143          *
 144          * The application needs a full memory barrier before checking
 145          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 146          */
 147         u32                     sq_flags;
 148         /*
 149          * Number of completion events lost because the queue was full;
 150          * this should be avoided by the application by making sure
 151          * there are not more requests pending than there is space in
 152          * the completion queue.
 153          *
 154          * Written by the kernel, shouldn't be modified by the
 155          * application (i.e. get number of "new events" by comparing to
 156          * cached value).
 157          *
 158          * As completion events come in out of order this counter is not
 159          * ordered with any other data.
 160          */
 161         u32                     cq_overflow;
 162         /*
 163          * Ring buffer of completion events.
 164          *
 165          * The kernel writes completion events fresh every time they are
 166          * produced, so the application is allowed to modify pending
 167          * entries.
 168          */
 169         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 170 };
 171
 172 struct io_mapped_ubuf {
 173         u64             ubuf;
 174         size_t          len;
 175         struct          bio_vec *bvec;
 176         unsigned int    nr_bvecs;
 177 };
 178
 179 struct fixed_file_table {
 180         struct file             **files;
 181 };
 182
 183 enum {
 184         FFD_F_ATOMIC,
 185 };
 186
 187 struct fixed_file_data {
 188         struct fixed_file_table         *table;
 189         struct io_ring_ctx              *ctx;
 190
 191         struct percpu_ref               refs;
 192         struct llist_head               put_llist;
 193         unsigned long                   state;
 194         struct work_struct              ref_work;
 195         struct completion               done;
 196 };
 197
 198 struct io_ring_ctx {
 199         struct {
 200                 struct percpu_ref       refs;
 201         } ____cacheline_aligned_in_smp;
 202
 203         struct {
 204                 unsigned int            flags;
 205                 bool                    compat;
 206                 bool                    account_mem;
 207                 bool                    cq_overflow_flushed;
 208                 bool                    drain_next;
 209
 210                 /*
 211                  * Ring buffer of indices into array of io_uring_sqe, which is
 212                  * mmapped by the application using the IORING_OFF_SQES offset.
 213                  *
 214                  * This indirection could e.g. be used to assign fixed
 215                  * io_uring_sqe entries to operations and only submit them to
 216                  * the queue when needed.
 217                  *
 218                  * The kernel modifies neither the indices array nor the entries
 219                  * array.
 220                  */
 221                 u32                     *sq_array;
 222                 unsigned                cached_sq_head;
 223                 unsigned                sq_entries;
 224                 unsigned                sq_mask;
 225                 unsigned                sq_thread_idle;
 226                 unsigned                cached_sq_dropped;
 227                 atomic_t                cached_cq_overflow;
 228                 unsigned long           sq_check_overflow;
 229
 230                 struct list_head        defer_list;
 231                 struct list_head        timeout_list;
 232                 struct list_head        cq_overflow_list;
 233
 234                 wait_queue_head_t       inflight_wait;
 235                 struct io_uring_sqe     *sq_sqes;
 236         } ____cacheline_aligned_in_smp;
 237
 238         struct io_rings *rings;
 239
 240         /* IO offload */
 241         struct io_wq            *io_wq;
 242         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 243         struct mm_struct        *sqo_mm;
 244         wait_queue_head_t       sqo_wait;
 245
 246         /*
 247          * If used, fixed file set. Writers must ensure that ->refs is dead,
 248          * readers must ensure that ->refs is alive as long as the file* is
 249          * used. Only updated through io_uring_register(2).
 250          */
 251         struct fixed_file_data  *file_data;
 252         unsigned                nr_user_files;
 253
 254         /* if used, fixed mapped user buffers */
 255         unsigned                nr_user_bufs;
 256         struct io_mapped_ubuf   *user_bufs;
 257
 258         struct user_struct      *user;
 259
 260         const struct cred       *creds;
 261
 262         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 263         struct completion       *completions;
 264
 265         /* if all else fails... */
 266         struct io_kiocb         *fallback_req;
 267
 268 #if defined(CONFIG_UNIX)
 269         struct socket           *ring_sock;
 270 #endif
 271
 272         struct {
 273                 unsigned                cached_cq_tail;
 274                 unsigned                cq_entries;
 275                 unsigned                cq_mask;
 276                 atomic_t                cq_timeouts;
 277                 unsigned long           cq_check_overflow;
 278                 struct wait_queue_head  cq_wait;
 279                 struct fasync_struct    *cq_fasync;
 280                 struct eventfd_ctx      *cq_ev_fd;
 281         } ____cacheline_aligned_in_smp;
 282
 283         struct {
 284                 struct mutex            uring_lock;
 285                 wait_queue_head_t       wait;
 286         } ____cacheline_aligned_in_smp;
 287
 288         struct {
 289                 spinlock_t              completion_lock;
 290                 struct llist_head       poll_llist;
 291
 292                 /*
 293                  * ->poll_list is protected by the ctx->uring_lock for
 294                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 295                  * For SQPOLL, only the single threaded io_sq_thread() will
 296                  * manipulate the list, hence no extra locking is needed there.
 297                  */
 298                 struct list_head        poll_list;
 299                 struct hlist_head       *cancel_hash;
 300                 unsigned                cancel_hash_bits;
 301                 bool                    poll_multi_file;
 302
 303                 spinlock_t              inflight_lock;
 304                 struct list_head        inflight_list;
 305         } ____cacheline_aligned_in_smp;
 306 };
 307
 308 /*
 309  * First field must be the file pointer in all the
 310  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 311  */
 312 struct io_poll_iocb {
 313         struct file                     *file;
 314         union {
 315                 struct wait_queue_head  *head;
 316                 u64                     addr;
 317         };
 318         __poll_t                        events;
 319         bool                            done;
 320         bool                            canceled;
 321         struct wait_queue_entry         wait;
 322 };
 323
 324 struct io_close {
 325         struct file                     *file;
 326         struct file                     *put_file;
 327         int                             fd;
 328 };
 329
 330 struct io_timeout_data {
 331         struct io_kiocb                 *req;
 332         struct hrtimer                  timer;
 333         struct timespec64               ts;
 334         enum hrtimer_mode               mode;
 335         u32                             seq_offset;
 336 };
 337
 338 struct io_accept {
 339         struct file                     *file;
 340         struct sockaddr __user          *addr;
 341         int __user                      *addr_len;
 342         int                             flags;
 343 };
 344
 345 struct io_sync {
 346         struct file                     *file;
 347         loff_t                          len;
 348         loff_t                          off;
 349         int                             flags;
 350         int                             mode;
 351 };
 352
 353 struct io_cancel {
 354         struct file                     *file;
 355         u64                             addr;
 356 };
 357
 358 struct io_timeout {
 359         struct file                     *file;
 360         u64                             addr;
 361         int                             flags;
 362         unsigned                        count;
 363 };
 364
 365 struct io_rw {
 366         /* NOTE: kiocb has the file as the first member, so don't do it here */
 367         struct kiocb                    kiocb;
 368         u64                             addr;
 369         u64                             len;
 370 };
 371
 372 struct io_connect {
 373         struct file                     *file;
 374         struct sockaddr __user          *addr;
 375         int                             addr_len;
 376 };
 377
 378 struct io_sr_msg {
 379         struct file                     *file;
 380         struct user_msghdr __user       *msg;
 381         int                             msg_flags;
 382 };
 383
 384 struct io_open {
 385         struct file                     *file;
 386         int                             dfd;
 387         union {
 388                 umode_t                 mode;
 389                 unsigned                mask;
 390         };
 391         const char __user               *fname;
 392         struct filename                 *filename;
 393         struct statx __user             *buffer;
 394         int                             flags;
 395 };
 396
 397 struct io_files_update {
 398         struct file                     *file;
 399         u64                             arg;
 400         u32                             nr_args;
 401         u32                             offset;
 402 };
 403
 404 struct io_fadvise {
 405         struct file                     *file;
 406         u64                             offset;
 407         u32                             len;
 408         u32                             advice;
 409 };
 410
 411 struct io_madvise {
 412         struct file                     *file;
 413         u64                             addr;
 414         u32                             len;
 415         u32                             advice;
 416 };
 417
 418 struct io_async_connect {
 419         struct sockaddr_storage         address;
 420 };
 421
 422 struct io_async_msghdr {
 423         struct iovec                    fast_iov[UIO_FASTIOV];
 424         struct iovec                    *iov;
 425         struct sockaddr __user          *uaddr;
 426         struct msghdr                   msg;
 427 };
 428
 429 struct io_async_rw {
 430         struct iovec                    fast_iov[UIO_FASTIOV];
 431         struct iovec                    *iov;
 432         ssize_t                         nr_segs;
 433         ssize_t                         size;
 434 };
 435
 436 struct io_async_open {
 437         struct filename                 *filename;
 438 };
 439
 440 struct io_async_ctx {
 441         union {
 442                 struct io_async_rw      rw;
 443                 struct io_async_msghdr  msg;
 444                 struct io_async_connect connect;
 445                 struct io_timeout_data  timeout;
 446                 struct io_async_open    open;
 447         };
 448 };
 449
 450 /*
 451  * NOTE! Each of the iocb union members has the file pointer
 452  * as the first entry in their struct definition. So you can
 453  * access the file pointer through any of the sub-structs,
 454  * or directly as just 'ki_filp' in this struct.
 455  */
 456 struct io_kiocb {
 457         union {
 458                 struct file             *file;
 459                 struct io_rw            rw;
 460                 struct io_poll_iocb     poll;
 461                 struct io_accept        accept;
 462                 struct io_sync          sync;
 463                 struct io_cancel        cancel;
 464                 struct io_timeout       timeout;
 465                 struct io_connect       connect;
 466                 struct io_sr_msg        sr_msg;
 467                 struct io_open          open;
 468                 struct io_close         close;
 469                 struct io_files_update  files_update;
 470                 struct io_fadvise       fadvise;
 471                 struct io_madvise       madvise;
 472         };
 473
 474         struct io_async_ctx             *io;
 475         union {
 476                 /*
 477                  * ring_file is only used in the submission path, and
 478                  * llist_node is only used for poll deferred completions
 479                  */
 480                 struct file             *ring_file;
 481                 struct llist_node       llist_node;
 482         };
 483         int                             ring_fd;
 484         bool                            has_user;
 485         bool                            in_async;
 486         bool                            needs_fixed_file;
 487         u8                              opcode;
 488
 489         struct io_ring_ctx      *ctx;
 490         union {
 491                 struct list_head        list;
 492                 struct hlist_node       hash_node;
 493         };
 494         struct list_head        link_list;
 495         unsigned int            flags;
 496         refcount_t              refs;
 497 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 498 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 499 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 500 #define REQ_F_LINK_NEXT         8       /* already grabbed next link */
 501 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 502 #define REQ_F_IO_DRAINED        32      /* drain done */
 503 #define REQ_F_LINK              64      /* linked sqes */
 504 #define REQ_F_LINK_TIMEOUT      128     /* has linked timeout */
 505 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 506 #define REQ_F_DRAIN_LINK        512     /* link should be fully drained */
 507 #define REQ_F_TIMEOUT           1024    /* timeout request */
 508 #define REQ_F_ISREG             2048    /* regular file */
 509 #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
 510 #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
 511 #define REQ_F_INFLIGHT          16384   /* on inflight list */
 512 #define REQ_F_COMP_LOCKED       32768   /* completion under lock */
 513 #define REQ_F_HARDLINK          65536   /* doesn't sever on completion < 0 */
 514 #define REQ_F_FORCE_ASYNC       131072  /* IOSQE_ASYNC */
 515 #define REQ_F_CUR_POS           262144  /* read/write uses file position */
 516         u64                     user_data;
 517         u32                     result;
 518         u32                     sequence;
 519
 520         struct list_head        inflight_entry;
 521
 522         struct io_wq_work       work;
 523 };
 524
 525 #define IO_PLUG_THRESHOLD               2
 526 #define IO_IOPOLL_BATCH                 8
 527
 528 struct io_submit_state {
 529         struct blk_plug         plug;
 530
 531         /*
 532          * io_kiocb alloc cache
 533          */
 534         void                    *reqs[IO_IOPOLL_BATCH];
 535         unsigned                int free_reqs;
 536         unsigned                int cur_req;
 537
 538         /*
 539          * File reference cache
 540          */
 541         struct file             *file;
 542         unsigned int            fd;
 543         unsigned int            has_refs;
 544         unsigned int            used_refs;
 545         unsigned int            ios_left;
 546 };
 547
 548 struct io_op_def {
 549         /* needs req->io allocated for deferral/async */
 550         unsigned                async_ctx : 1;
 551         /* needs current->mm setup, does mm access */
 552         unsigned                needs_mm : 1;
 553         /* needs req->file assigned */
 554         unsigned                needs_file : 1;
 555         /* needs req->file assigned IFF fd is >= 0 */
 556         unsigned                fd_non_neg : 1;
 557         /* hash wq insertion if file is a regular file */
 558         unsigned                hash_reg_file : 1;
 559         /* unbound wq insertion if file is a non-regular file */
 560         unsigned                unbound_nonreg_file : 1;
 561 };
 562
 563 static const struct io_op_def io_op_defs[] = {
 564         {
 565                 /* IORING_OP_NOP */
 566         },
 567         {
 568                 /* IORING_OP_READV */
 569                 .async_ctx              = 1,
 570                 .needs_mm               = 1,
 571                 .needs_file             = 1,
 572                 .unbound_nonreg_file    = 1,
 573         },
 574         {
 575                 /* IORING_OP_WRITEV */
 576                 .async_ctx              = 1,
 577                 .needs_mm               = 1,
 578                 .needs_file             = 1,
 579                 .hash_reg_file          = 1,
 580                 .unbound_nonreg_file    = 1,
 581         },
 582         {
 583                 /* IORING_OP_FSYNC */
 584                 .needs_file             = 1,
 585         },
 586         {
 587                 /* IORING_OP_READ_FIXED */
 588                 .needs_file             = 1,
 589                 .unbound_nonreg_file    = 1,
 590         },
 591         {
 592                 /* IORING_OP_WRITE_FIXED */
 593                 .needs_file             = 1,
 594                 .hash_reg_file          = 1,
 595                 .unbound_nonreg_file    = 1,
 596         },
 597         {
 598                 /* IORING_OP_POLL_ADD */
 599                 .needs_file             = 1,
 600                 .unbound_nonreg_file    = 1,
 601         },
 602         {
 603                 /* IORING_OP_POLL_REMOVE */
 604         },
 605         {
 606                 /* IORING_OP_SYNC_FILE_RANGE */
 607                 .needs_file             = 1,
 608         },
 609         {
 610                 /* IORING_OP_SENDMSG */
 611                 .async_ctx              = 1,
 612                 .needs_mm               = 1,
 613                 .needs_file             = 1,
 614                 .unbound_nonreg_file    = 1,
 615         },
 616         {
 617                 /* IORING_OP_RECVMSG */
 618                 .async_ctx              = 1,
 619                 .needs_mm               = 1,
 620                 .needs_file             = 1,
 621                 .unbound_nonreg_file    = 1,
 622         },
 623         {
 624                 /* IORING_OP_TIMEOUT */
 625                 .async_ctx              = 1,
 626                 .needs_mm               = 1,
 627         },
 628         {
 629                 /* IORING_OP_TIMEOUT_REMOVE */
 630         },
 631         {
 632                 /* IORING_OP_ACCEPT */
 633                 .needs_mm               = 1,
 634                 .needs_file             = 1,
 635                 .unbound_nonreg_file    = 1,
 636         },
 637         {
 638                 /* IORING_OP_ASYNC_CANCEL */
 639         },
 640         {
 641                 /* IORING_OP_LINK_TIMEOUT */
 642                 .async_ctx              = 1,
 643                 .needs_mm               = 1,
 644         },
 645         {
 646                 /* IORING_OP_CONNECT */
 647                 .async_ctx              = 1,
 648                 .needs_mm               = 1,
 649                 .needs_file             = 1,
 650                 .unbound_nonreg_file    = 1,
 651         },
 652         {
 653                 /* IORING_OP_FALLOCATE */
 654                 .needs_file             = 1,
 655         },
 656         {
 657                 /* IORING_OP_OPENAT */
 658                 .needs_file             = 1,
 659                 .fd_non_neg             = 1,
 660         },
 661         {
 662                 /* IORING_OP_CLOSE */
 663                 .needs_file             = 1,
 664         },
 665         {
 666                 /* IORING_OP_FILES_UPDATE */
 667                 .needs_mm               = 1,
 668         },
 669         {
 670                 /* IORING_OP_STATX */
 671                 .needs_mm               = 1,
 672                 .needs_file             = 1,
 673                 .fd_non_neg             = 1,
 674         },
 675         {
 676                 /* IORING_OP_READ */
 677                 .needs_mm               = 1,
 678                 .needs_file             = 1,
 679                 .unbound_nonreg_file    = 1,
 680         },
 681         {
 682                 /* IORING_OP_WRITE */
 683                 .needs_mm               = 1,
 684                 .needs_file             = 1,
 685                 .unbound_nonreg_file    = 1,
 686         },
 687         {
 688                 /* IORING_OP_FADVISE */
 689                 .needs_file             = 1,
 690         },
 691         {
 692                 /* IORING_OP_MADVISE */
 693                 .needs_mm               = 1,
 694         },
 695 };
 696
 697 static void io_wq_submit_work(struct io_wq_work **workptr);
 698 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 699 static void io_put_req(struct io_kiocb *req);
 700 static void __io_double_put_req(struct io_kiocb *req);
 701 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 702 static void io_queue_linked_timeout(struct io_kiocb *req);
 703 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 704                                  struct io_uring_files_update *ip,
 705                                  unsigned nr_args);
 706
 707 static struct kmem_cache *req_cachep;
 708
 709 static const struct file_operations io_uring_fops;
 710
 711 struct sock *io_uring_get_socket(struct file *file)
 712 {
 713 #if defined(CONFIG_UNIX)
 714         if (file->f_op == &io_uring_fops) {
 715                 struct io_ring_ctx *ctx = file->private_data;
 716
 717                 return ctx->ring_sock->sk;
 718         }
 719 #endif
 720         return NULL;
 721 }
 722 EXPORT_SYMBOL(io_uring_get_socket);
 723
 724 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 725 {
 726         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 727
 728         complete(&ctx->completions[0]);
 729 }
 730
 731 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 732 {
 733         struct io_ring_ctx *ctx;
 734         int hash_bits;
 735
 736         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 737         if (!ctx)
 738                 return NULL;
 739
 740         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 741         if (!ctx->fallback_req)
 742                 goto err;
 743
 744         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 745         if (!ctx->completions)
 746                 goto err;
 747
 748         /*
 749          * Use 5 bits less than the max cq entries, that should give us around
 750          * 32 entries per hash list if totally full and uniformly spread.
 751          */
 752         hash_bits = ilog2(p->cq_entries);
 753         hash_bits -= 5;
 754         if (hash_bits <= 0)
 755                 hash_bits = 1;
 756         ctx->cancel_hash_bits = hash_bits;
 757         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 758                                         GFP_KERNEL);
 759         if (!ctx->cancel_hash)
 760                 goto err;
 761         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 762
 763         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 764                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 765                 goto err;
 766
 767         ctx->flags = p->flags;
 768         init_waitqueue_head(&ctx->cq_wait);
 769         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 770         init_completion(&ctx->completions[0]);
 771         init_completion(&ctx->completions[1]);
 772         mutex_init(&ctx->uring_lock);
 773         init_waitqueue_head(&ctx->wait);
 774         spin_lock_init(&ctx->completion_lock);
 775         init_llist_head(&ctx->poll_llist);
 776         INIT_LIST_HEAD(&ctx->poll_list);
 777         INIT_LIST_HEAD(&ctx->defer_list);
 778         INIT_LIST_HEAD(&ctx->timeout_list);
 779         init_waitqueue_head(&ctx->inflight_wait);
 780         spin_lock_init(&ctx->inflight_lock);
 781         INIT_LIST_HEAD(&ctx->inflight_list);
 782         return ctx;
 783 err:
 784         if (ctx->fallback_req)
 785                 kmem_cache_free(req_cachep, ctx->fallback_req);
 786         kfree(ctx->completions);
 787         kfree(ctx->cancel_hash);
 788         kfree(ctx);
 789         return NULL;
 790 }
 791
 792 static inline bool __req_need_defer(struct io_kiocb *req)
 793 {
 794         struct io_ring_ctx *ctx = req->ctx;
 795
 796         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 797                                         + atomic_read(&ctx->cached_cq_overflow);
 798 }
 799
 800 static inline bool req_need_defer(struct io_kiocb *req)
 801 {
 802         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
 803                 return __req_need_defer(req);
 804
 805         return false;
 806 }
 807
 808 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 809 {
 810         struct io_kiocb *req;
 811
 812         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 813         if (req && !req_need_defer(req)) {
 814                 list_del_init(&req->list);
 815                 return req;
 816         }
 817
 818         return NULL;
 819 }
 820
 821 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 822 {
 823         struct io_kiocb *req;
 824
 825         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 826         if (req) {
 827                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 828                         return NULL;
 829                 if (!__req_need_defer(req)) {
 830                         list_del_init(&req->list);
 831                         return req;
 832                 }
 833         }
 834
 835         return NULL;
 836 }
 837
 838 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 839 {
 840         struct io_rings *rings = ctx->rings;
 841
 842         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 843                 /* order cqe stores with ring update */
 844                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 845
 846                 if (wq_has_sleeper(&ctx->cq_wait)) {
 847                         wake_up_interruptible(&ctx->cq_wait);
 848                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 849                 }
 850         }
 851 }
 852
 853 static inline bool io_prep_async_work(struct io_kiocb *req,
 854                                       struct io_kiocb **link)
 855 {
 856         const struct io_op_def *def = &io_op_defs[req->opcode];
 857         bool do_hashed = false;
 858
 859         if (req->flags & REQ_F_ISREG) {
 860                 if (def->hash_reg_file)
 861                         do_hashed = true;
 862         } else {
 863                 if (def->unbound_nonreg_file)
 864                         req->work.flags |= IO_WQ_WORK_UNBOUND;
 865         }
 866         if (def->needs_mm)
 867                 req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 868
 869         *link = io_prep_linked_timeout(req);
 870         return do_hashed;
 871 }
 872
 873 static inline void io_queue_async_work(struct io_kiocb *req)
 874 {
 875         struct io_ring_ctx *ctx = req->ctx;
 876         struct io_kiocb *link;
 877         bool do_hashed;
 878
 879         do_hashed = io_prep_async_work(req, &link);
 880
 881         trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 882                                         req->flags);
 883         if (!do_hashed) {
 884                 io_wq_enqueue(ctx->io_wq, &req->work);
 885         } else {
 886                 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 887                                         file_inode(req->file));
 888         }
 889
 890         if (link)
 891                 io_queue_linked_timeout(link);
 892 }
 893
 894 static void io_kill_timeout(struct io_kiocb *req)
 895 {
 896         int ret;
 897
 898         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 899         if (ret != -1) {
 900                 atomic_inc(&req->ctx->cq_timeouts);
 901                 list_del_init(&req->list);
 902                 io_cqring_fill_event(req, 0);
 903                 io_put_req(req);
 904         }
 905 }
 906
 907 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 908 {
 909         struct io_kiocb *req, *tmp;
 910
 911         spin_lock_irq(&ctx->completion_lock);
 912         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 913                 io_kill_timeout(req);
 914         spin_unlock_irq(&ctx->completion_lock);
 915 }
 916
 917 static void io_commit_cqring(struct io_ring_ctx *ctx)
 918 {
 919         struct io_kiocb *req;
 920
 921         while ((req = io_get_timeout_req(ctx)) != NULL)
 922                 io_kill_timeout(req);
 923
 924         __io_commit_cqring(ctx);
 925
 926         while ((req = io_get_deferred_req(ctx)) != NULL) {
 927                 req->flags |= REQ_F_IO_DRAINED;
 928                 io_queue_async_work(req);
 929         }
 930 }
 931
 932 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 933 {
 934         struct io_rings *rings = ctx->rings;
 935         unsigned tail;
 936
 937         tail = ctx->cached_cq_tail;
 938         /*
 939          * writes to the cq entry need to come after reading head; the
 940          * control dependency is enough as we're using WRITE_ONCE to
 941          * fill the cq entry
 942          */
 943         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 944                 return NULL;
 945
 946         ctx->cached_cq_tail++;
 947         return &rings->cqes[tail & ctx->cq_mask];
 948 }
 949
 950 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 951 {
 952         if (waitqueue_active(&ctx->wait))
 953                 wake_up(&ctx->wait);
 954         if (waitqueue_active(&ctx->sqo_wait))
 955                 wake_up(&ctx->sqo_wait);
 956         if (ctx->cq_ev_fd)
 957                 eventfd_signal(ctx->cq_ev_fd, 1);
 958 }
 959
 960 /* Returns true if there are no backlogged entries after the flush */
 961 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 962 {
 963         struct io_rings *rings = ctx->rings;
 964         struct io_uring_cqe *cqe;
 965         struct io_kiocb *req;
 966         unsigned long flags;
 967         LIST_HEAD(list);
 968
 969         if (!force) {
 970                 if (list_empty_careful(&ctx->cq_overflow_list))
 971                         return true;
 972                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 973                     rings->cq_ring_entries))
 974                         return false;
 975         }
 976
 977         spin_lock_irqsave(&ctx->completion_lock, flags);
 978
 979         /* if force is set, the ring is going away. always drop after that */
 980         if (force)
 981                 ctx->cq_overflow_flushed = true;
 982
 983         cqe = NULL;
 984         while (!list_empty(&ctx->cq_overflow_list)) {
 985                 cqe = io_get_cqring(ctx);
 986                 if (!cqe && !force)
 987                         break;
 988
 989                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
 990                                                 list);
 991                 list_move(&req->list, &list);
 992                 if (cqe) {
 993                         WRITE_ONCE(cqe->user_data, req->user_data);
 994                         WRITE_ONCE(cqe->res, req->result);
 995                         WRITE_ONCE(cqe->flags, 0);
 996                 } else {
 997                         WRITE_ONCE(ctx->rings->cq_overflow,
 998                                 atomic_inc_return(&ctx->cached_cq_overflow));
 999                 }
1000         }
1001
1002         io_commit_cqring(ctx);
1003         if (cqe) {
1004                 clear_bit(0, &ctx->sq_check_overflow);
1005                 clear_bit(0, &ctx->cq_check_overflow);
1006         }
1007         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1008         io_cqring_ev_posted(ctx);
1009
1010         while (!list_empty(&list)) {
1011                 req = list_first_entry(&list, struct io_kiocb, list);
1012                 list_del(&req->list);
1013                 io_put_req(req);
1014         }
1015
1016         return cqe != NULL;
1017 }
1018
1019 static void io_cqring_fill_event(struct io_kiocb *req, long res)
1020 {
1021         struct io_ring_ctx *ctx = req->ctx;
1022         struct io_uring_cqe *cqe;
1023
1024         trace_io_uring_complete(ctx, req->user_data, res);
1025
1026         /*
1027          * If we can't get a cq entry, userspace overflowed the
1028          * submission (by quite a lot). Increment the overflow count in
1029          * the ring.
1030          */
1031         cqe = io_get_cqring(ctx);
1032         if (likely(cqe)) {
1033                 WRITE_ONCE(cqe->user_data, req->user_data);
1034                 WRITE_ONCE(cqe->res, res);
1035                 WRITE_ONCE(cqe->flags, 0);
1036         } else if (ctx->cq_overflow_flushed) {
1037                 WRITE_ONCE(ctx->rings->cq_overflow,
1038                                 atomic_inc_return(&ctx->cached_cq_overflow));
1039         } else {
1040                 if (list_empty(&ctx->cq_overflow_list)) {
1041                         set_bit(0, &ctx->sq_check_overflow);
1042                         set_bit(0, &ctx->cq_check_overflow);
1043                 }
1044                 refcount_inc(&req->refs);
1045                 req->result = res;
1046                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1047         }
1048 }
1049
1050 static void io_cqring_add_event(struct io_kiocb *req, long res)
1051 {
1052         struct io_ring_ctx *ctx = req->ctx;
1053         unsigned long flags;
1054
1055         spin_lock_irqsave(&ctx->completion_lock, flags);
1056         io_cqring_fill_event(req, res);
1057         io_commit_cqring(ctx);
1058         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1059
1060         io_cqring_ev_posted(ctx);
1061 }
1062
1063 static inline bool io_is_fallback_req(struct io_kiocb *req)
1064 {
1065         return req == (struct io_kiocb *)
1066                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1067 }
1068
1069 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1070 {
1071         struct io_kiocb *req;
1072
1073         req = ctx->fallback_req;
1074         if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1075                 return req;
1076
1077         return NULL;
1078 }
1079
1080 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1081                                    struct io_submit_state *state)
1082 {
1083         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1084         struct io_kiocb *req;
1085
1086         if (!state) {
1087                 req = kmem_cache_alloc(req_cachep, gfp);
1088                 if (unlikely(!req))
1089                         goto fallback;
1090         } else if (!state->free_reqs) {
1091                 size_t sz;
1092                 int ret;
1093
1094                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1095                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1096
1097                 /*
1098                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1099                  * retry single alloc to be on the safe side.
1100                  */
1101                 if (unlikely(ret <= 0)) {
1102                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1103                         if (!state->reqs[0])
1104                                 goto fallback;
1105                         ret = 1;
1106                 }
1107                 state->free_reqs = ret - 1;
1108                 state->cur_req = 1;
1109                 req = state->reqs[0];
1110         } else {
1111                 req = state->reqs[state->cur_req];
1112                 state->free_reqs--;
1113                 state->cur_req++;
1114         }
1115
1116 got_it:
1117         req->io = NULL;
1118         req->ring_file = NULL;
1119         req->file = NULL;
1120         req->ctx = ctx;
1121         req->flags = 0;
1122         /* one is dropped after submission, the other at completion */
1123         refcount_set(&req->refs, 2);
1124         req->result = 0;
1125         INIT_IO_WORK(&req->work, io_wq_submit_work);
1126         return req;
1127 fallback:
1128         req = io_get_fallback_req(ctx);
1129         if (req)
1130                 goto got_it;
1131         percpu_ref_put(&ctx->refs);
1132         return NULL;
1133 }
1134
1135 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
1136 {
1137         if (*nr) {
1138                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
1139                 percpu_ref_put_many(&ctx->refs, *nr);
1140                 percpu_ref_put_many(&ctx->file_data->refs, *nr);
1141                 *nr = 0;
1142         }
1143 }
1144
1145 static void __io_req_do_free(struct io_kiocb *req)
1146 {
1147         if (likely(!io_is_fallback_req(req)))
1148                 kmem_cache_free(req_cachep, req);
1149         else
1150                 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1151 }
1152
1153 static void __io_free_req(struct io_kiocb *req)
1154 {
1155         struct io_ring_ctx *ctx = req->ctx;
1156
1157         if (req->io)
1158                 kfree(req->io);
1159         if (req->file) {
1160                 if (req->flags & REQ_F_FIXED_FILE)
1161                         percpu_ref_put(&ctx->file_data->refs);
1162                 else
1163                         fput(req->file);
1164         }
1165         if (req->flags & REQ_F_INFLIGHT) {
1166                 unsigned long flags;
1167
1168                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1169                 list_del(&req->inflight_entry);
1170                 if (waitqueue_active(&ctx->inflight_wait))
1171                         wake_up(&ctx->inflight_wait);
1172                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1173         }
1174
1175         percpu_ref_put(&req->ctx->refs);
1176         __io_req_do_free(req);
1177 }
1178
1179 static bool io_link_cancel_timeout(struct io_kiocb *req)
1180 {
1181         struct io_ring_ctx *ctx = req->ctx;
1182         int ret;
1183
1184         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1185         if (ret != -1) {
1186                 io_cqring_fill_event(req, -ECANCELED);
1187                 io_commit_cqring(ctx);
1188                 req->flags &= ~REQ_F_LINK;
1189                 io_put_req(req);
1190                 return true;
1191         }
1192
1193         return false;
1194 }
1195
1196 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1197 {
1198         struct io_ring_ctx *ctx = req->ctx;
1199         bool wake_ev = false;
1200
1201         /* Already got next link */
1202         if (req->flags & REQ_F_LINK_NEXT)
1203                 return;
1204
1205         /*
1206          * The list should never be empty when we are called here. But could
1207          * potentially happen if the chain is messed up, check to be on the
1208          * safe side.
1209          */
1210         while (!list_empty(&req->link_list)) {
1211                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1212                                                 struct io_kiocb, link_list);
1213
1214                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1215                              (nxt->flags & REQ_F_TIMEOUT))) {
1216                         list_del_init(&nxt->link_list);
1217                         wake_ev |= io_link_cancel_timeout(nxt);
1218                         req->flags &= ~REQ_F_LINK_TIMEOUT;
1219                         continue;
1220                 }
1221
1222                 list_del_init(&req->link_list);
1223                 if (!list_empty(&nxt->link_list))
1224                         nxt->flags |= REQ_F_LINK;
1225                 *nxtptr = nxt;
1226                 break;
1227         }
1228
1229         req->flags |= REQ_F_LINK_NEXT;
1230         if (wake_ev)
1231                 io_cqring_ev_posted(ctx);
1232 }
1233
1234 /*
1235  * Called if REQ_F_LINK is set, and we fail the head request
1236  */
1237 static void io_fail_links(struct io_kiocb *req)
1238 {
1239         struct io_ring_ctx *ctx = req->ctx;
1240         unsigned long flags;
1241
1242         spin_lock_irqsave(&ctx->completion_lock, flags);
1243
1244         while (!list_empty(&req->link_list)) {
1245                 struct io_kiocb *link = list_first_entry(&req->link_list,
1246                                                 struct io_kiocb, link_list);
1247
1248                 list_del_init(&link->link_list);
1249                 trace_io_uring_fail_link(req, link);
1250
1251                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1252                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1253                         io_link_cancel_timeout(link);
1254                 } else {
1255                         io_cqring_fill_event(link, -ECANCELED);
1256                         __io_double_put_req(link);
1257                 }
1258                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1259         }
1260
1261         io_commit_cqring(ctx);
1262         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1263         io_cqring_ev_posted(ctx);
1264 }
1265
1266 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1267 {
1268         if (likely(!(req->flags & REQ_F_LINK)))
1269                 return;
1270
1271         /*
1272          * If LINK is set, we have dependent requests in this chain. If we
1273          * didn't fail this request, queue the first one up, moving any other
1274          * dependencies to the next request. In case of failure, fail the rest
1275          * of the chain.
1276          */
1277         if (req->flags & REQ_F_FAIL_LINK) {
1278                 io_fail_links(req);
1279         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1280                         REQ_F_LINK_TIMEOUT) {
1281                 struct io_ring_ctx *ctx = req->ctx;
1282                 unsigned long flags;
1283
1284                 /*
1285                  * If this is a timeout link, we could be racing with the
1286                  * timeout timer. Grab the completion lock for this case to
1287                  * protect against that.
1288                  */
1289                 spin_lock_irqsave(&ctx->completion_lock, flags);
1290                 io_req_link_next(req, nxt);
1291                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1292         } else {
1293                 io_req_link_next(req, nxt);
1294         }
1295 }
1296
1297 static void io_free_req(struct io_kiocb *req)
1298 {
1299         struct io_kiocb *nxt = NULL;
1300
1301         io_req_find_next(req, &nxt);
1302         __io_free_req(req);
1303
1304         if (nxt)
1305                 io_queue_async_work(nxt);
1306 }
1307
1308 /*
1309  * Drop reference to request, return next in chain (if there is one) if this
1310  * was the last reference to this request.
1311  */
1312 __attribute__((nonnull))
1313 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1314 {
1315         io_req_find_next(req, nxtptr);
1316
1317         if (refcount_dec_and_test(&req->refs))
1318                 __io_free_req(req);
1319 }
1320
1321 static void io_put_req(struct io_kiocb *req)
1322 {
1323         if (refcount_dec_and_test(&req->refs))
1324                 io_free_req(req);
1325 }
1326
1327 /*
1328  * Must only be used if we don't need to care about links, usually from
1329  * within the completion handling itself.
1330  */
1331 static void __io_double_put_req(struct io_kiocb *req)
1332 {
1333         /* drop both submit and complete references */
1334         if (refcount_sub_and_test(2, &req->refs))
1335                 __io_free_req(req);
1336 }
1337
1338 static void io_double_put_req(struct io_kiocb *req)
1339 {
1340         /* drop both submit and complete references */
1341         if (refcount_sub_and_test(2, &req->refs))
1342                 io_free_req(req);
1343 }
1344
1345 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1346 {
1347         struct io_rings *rings = ctx->rings;
1348
1349         if (test_bit(0, &ctx->cq_check_overflow)) {
1350                 /*
1351                  * noflush == true is from the waitqueue handler, just ensure
1352                  * we wake up the task, and the next invocation will flush the
1353                  * entries. We cannot safely to it from here.
1354                  */
1355                 if (noflush && !list_empty(&ctx->cq_overflow_list))
1356                         return -1U;
1357
1358                 io_cqring_overflow_flush(ctx, false);
1359         }
1360
1361         /* See comment at the top of this file */
1362         smp_rmb();
1363         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1364 }
1365
1366 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1367 {
1368         struct io_rings *rings = ctx->rings;
1369
1370         /* make sure SQ entry isn't read before tail */
1371         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1372 }
1373
1374 static inline bool io_req_multi_free(struct io_kiocb *req)
1375 {
1376         /*
1377          * If we're not using fixed files, we have to pair the completion part
1378          * with the file put. Use regular completions for those, only batch
1379          * free for fixed file and non-linked commands.
1380          */
1381         if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE)
1382             && !io_is_fallback_req(req) && !req->io)
1383                 return true;
1384
1385         return false;
1386 }
1387
1388 /*
1389  * Find and free completed poll iocbs
1390  */
1391 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1392                                struct list_head *done)
1393 {
1394         void *reqs[IO_IOPOLL_BATCH];
1395         struct io_kiocb *req;
1396         int to_free;
1397
1398         to_free = 0;
1399         while (!list_empty(done)) {
1400                 req = list_first_entry(done, struct io_kiocb, list);
1401                 list_del(&req->list);
1402
1403                 io_cqring_fill_event(req, req->result);
1404                 (*nr_events)++;
1405
1406                 if (refcount_dec_and_test(&req->refs)) {
1407                         if (io_req_multi_free(req)) {
1408                                 reqs[to_free++] = req;
1409                                 if (to_free == ARRAY_SIZE(reqs))
1410                                         io_free_req_many(ctx, reqs, &to_free);
1411                         } else {
1412                                 io_free_req(req);
1413                         }
1414                 }
1415         }
1416
1417         io_commit_cqring(ctx);
1418         io_free_req_many(ctx, reqs, &to_free);
1419 }
1420
1421 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1422                         long min)
1423 {
1424         struct io_kiocb *req, *tmp;
1425         LIST_HEAD(done);
1426         bool spin;
1427         int ret;
1428
1429         /*
1430          * Only spin for completions if we don't have multiple devices hanging
1431          * off our complete list, and we're under the requested amount.
1432          */
1433         spin = !ctx->poll_multi_file && *nr_events < min;
1434
1435         ret = 0;
1436         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1437                 struct kiocb *kiocb = &req->rw.kiocb;
1438
1439                 /*
1440                  * Move completed entries to our local list. If we find a
1441                  * request that requires polling, break out and complete
1442                  * the done list first, if we have entries there.
1443                  */
1444                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1445                         list_move_tail(&req->list, &done);
1446                         continue;
1447                 }
1448                 if (!list_empty(&done))
1449                         break;
1450
1451                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1452                 if (ret < 0)
1453                         break;
1454
1455                 if (ret && spin)
1456                         spin = false;
1457                 ret = 0;
1458         }
1459
1460         if (!list_empty(&done))
1461                 io_iopoll_complete(ctx, nr_events, &done);
1462
1463         return ret;
1464 }
1465
1466 /*
1467  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1468  * non-spinning poll check - we'll still enter the driver poll loop, but only
1469  * as a non-spinning completion check.
1470  */
1471 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1472                                 long min)
1473 {
1474         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1475                 int ret;
1476
1477                 ret = io_do_iopoll(ctx, nr_events, min);
1478                 if (ret < 0)
1479                         return ret;
1480                 if (!min || *nr_events >= min)
1481                         return 0;
1482         }
1483
1484         return 1;
1485 }
1486
1487 /*
1488  * We can't just wait for polled events to come to us, we have to actively
1489  * find and complete them.
1490  */
1491 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1492 {
1493         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1494                 return;
1495
1496         mutex_lock(&ctx->uring_lock);
1497         while (!list_empty(&ctx->poll_list)) {
1498                 unsigned int nr_events = 0;
1499
1500                 io_iopoll_getevents(ctx, &nr_events, 1);
1501
1502                 /*
1503                  * Ensure we allow local-to-the-cpu processing to take place,
1504                  * in this case we need to ensure that we reap all events.
1505                  */
1506                 cond_resched();
1507         }
1508         mutex_unlock(&ctx->uring_lock);
1509 }
1510
1511 static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1512                             long min)
1513 {
1514         int iters = 0, ret = 0;
1515
1516         do {
1517                 int tmin = 0;
1518
1519                 /*
1520                  * Don't enter poll loop if we already have events pending.
1521                  * If we do, we can potentially be spinning for commands that
1522                  * already triggered a CQE (eg in error).
1523                  */
1524                 if (io_cqring_events(ctx, false))
1525                         break;
1526
1527                 /*
1528                  * If a submit got punted to a workqueue, we can have the
1529                  * application entering polling for a command before it gets
1530                  * issued. That app will hold the uring_lock for the duration
1531                  * of the poll right here, so we need to take a breather every
1532                  * now and then to ensure that the issue has a chance to add
1533                  * the poll to the issued list. Otherwise we can spin here
1534                  * forever, while the workqueue is stuck trying to acquire the
1535                  * very same mutex.
1536                  */
1537                 if (!(++iters & 7)) {
1538                         mutex_unlock(&ctx->uring_lock);
1539                         mutex_lock(&ctx->uring_lock);
1540                 }
1541
1542                 if (*nr_events < min)
1543                         tmin = min - *nr_events;
1544
1545                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1546                 if (ret <= 0)
1547                         break;
1548                 ret = 0;
1549         } while (min && !*nr_events && !need_resched());
1550
1551         return ret;
1552 }
1553
1554 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1555                            long min)
1556 {
1557         int ret;
1558
1559         /*
1560          * We disallow the app entering submit/complete with polling, but we
1561          * still need to lock the ring to prevent racing with polled issue
1562          * that got punted to a workqueue.
1563          */
1564         mutex_lock(&ctx->uring_lock);
1565         ret = __io_iopoll_check(ctx, nr_events, min);
1566         mutex_unlock(&ctx->uring_lock);
1567         return ret;
1568 }
1569
1570 static void kiocb_end_write(struct io_kiocb *req)
1571 {
1572         /*
1573          * Tell lockdep we inherited freeze protection from submission
1574          * thread.
1575          */
1576         if (req->flags & REQ_F_ISREG) {
1577                 struct inode *inode = file_inode(req->file);
1578
1579                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1580         }
1581         file_end_write(req->file);
1582 }
1583
1584 static inline void req_set_fail_links(struct io_kiocb *req)
1585 {
1586         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1587                 req->flags |= REQ_F_FAIL_LINK;
1588 }
1589
1590 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1591 {
1592         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1593
1594         if (kiocb->ki_flags & IOCB_WRITE)
1595                 kiocb_end_write(req);
1596
1597         if (res != req->result)
1598                 req_set_fail_links(req);
1599         io_cqring_add_event(req, res);
1600 }
1601
1602 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1603 {
1604         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1605
1606         io_complete_rw_common(kiocb, res);
1607         io_put_req(req);
1608 }
1609
1610 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1611 {
1612         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1613         struct io_kiocb *nxt = NULL;
1614
1615         io_complete_rw_common(kiocb, res);
1616         io_put_req_find_next(req, &nxt);
1617
1618         return nxt;
1619 }
1620
1621 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1622 {
1623         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1624
1625         if (kiocb->ki_flags & IOCB_WRITE)
1626                 kiocb_end_write(req);
1627
1628         if (res != req->result)
1629                 req_set_fail_links(req);
1630         req->result = res;
1631         if (res != -EAGAIN)
1632                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1633 }
1634
1635 /*
1636  * After the iocb has been issued, it's safe to be found on the poll list.
1637  * Adding the kiocb to the list AFTER submission ensures that we don't
1638  * find it from a io_iopoll_getevents() thread before the issuer is done
1639  * accessing the kiocb cookie.
1640  */
1641 static void io_iopoll_req_issued(struct io_kiocb *req)
1642 {
1643         struct io_ring_ctx *ctx = req->ctx;
1644
1645         /*
1646          * Track whether we have multiple files in our lists. This will impact
1647          * how we do polling eventually, not spinning if we're on potentially
1648          * different devices.
1649          */
1650         if (list_empty(&ctx->poll_list)) {
1651                 ctx->poll_multi_file = false;
1652         } else if (!ctx->poll_multi_file) {
1653                 struct io_kiocb *list_req;
1654
1655                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1656                                                 list);
1657                 if (list_req->file != req->file)
1658                         ctx->poll_multi_file = true;
1659         }
1660
1661         /*
1662          * For fast devices, IO may have already completed. If it has, add
1663          * it to the front so we find it first.
1664          */
1665         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1666                 list_add(&req->list, &ctx->poll_list);
1667         else
1668                 list_add_tail(&req->list, &ctx->poll_list);
1669 }
1670
1671 static void io_file_put(struct io_submit_state *state)
1672 {
1673         if (state->file) {
1674                 int diff = state->has_refs - state->used_refs;
1675
1676                 if (diff)
1677                         fput_many(state->file, diff);
1678                 state->file = NULL;
1679         }
1680 }
1681
1682 /*
1683  * Get as many references to a file as we have IOs left in this submission,
1684  * assuming most submissions are for one file, or at least that each file
1685  * has more than one submission.
1686  */
1687 static struct file *io_file_get(struct io_submit_state *state, int fd)
1688 {
1689         if (!state)
1690                 return fget(fd);
1691
1692         if (state->file) {
1693                 if (state->fd == fd) {
1694                         state->used_refs++;
1695                         state->ios_left--;
1696                         return state->file;
1697                 }
1698                 io_file_put(state);
1699         }
1700         state->file = fget_many(fd, state->ios_left);
1701         if (!state->file)
1702                 return NULL;
1703
1704         state->fd = fd;
1705         state->has_refs = state->ios_left;
1706         state->used_refs = 1;
1707         state->ios_left--;
1708         return state->file;
1709 }
1710
1711 /*
1712  * If we tracked the file through the SCM inflight mechanism, we could support
1713  * any file. For now, just ensure that anything potentially problematic is done
1714  * inline.
1715  */
1716 static bool io_file_supports_async(struct file *file)
1717 {
1718         umode_t mode = file_inode(file)->i_mode;
1719
1720         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
1721                 return true;
1722         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1723                 return true;
1724
1725         return false;
1726 }
1727
1728 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1729                       bool force_nonblock)
1730 {
1731         struct io_ring_ctx *ctx = req->ctx;
1732         struct kiocb *kiocb = &req->rw.kiocb;
1733         unsigned ioprio;
1734         int ret;
1735
1736         if (!req->file)
1737                 return -EBADF;
1738
1739         if (S_ISREG(file_inode(req->file)->i_mode))
1740                 req->flags |= REQ_F_ISREG;
1741
1742         kiocb->ki_pos = READ_ONCE(sqe->off);
1743         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1744                 req->flags |= REQ_F_CUR_POS;
1745                 kiocb->ki_pos = req->file->f_pos;
1746         }
1747         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1748         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1749
1750         ioprio = READ_ONCE(sqe->ioprio);
1751         if (ioprio) {
1752                 ret = ioprio_check_cap(ioprio);
1753                 if (ret)
1754                         return ret;
1755
1756                 kiocb->ki_ioprio = ioprio;
1757         } else
1758                 kiocb->ki_ioprio = get_current_ioprio();
1759
1760         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1761         if (unlikely(ret))
1762                 return ret;
1763
1764         /* don't allow async punt if RWF_NOWAIT was requested */
1765         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1766             (req->file->f_flags & O_NONBLOCK))
1767                 req->flags |= REQ_F_NOWAIT;
1768
1769         if (force_nonblock)
1770                 kiocb->ki_flags |= IOCB_NOWAIT;
1771
1772         if (ctx->flags & IORING_SETUP_IOPOLL) {
1773                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1774                     !kiocb->ki_filp->f_op->iopoll)
1775                         return -EOPNOTSUPP;
1776
1777                 kiocb->ki_flags |= IOCB_HIPRI;
1778                 kiocb->ki_complete = io_complete_rw_iopoll;
1779                 req->result = 0;
1780         } else {
1781                 if (kiocb->ki_flags & IOCB_HIPRI)
1782                         return -EINVAL;
1783                 kiocb->ki_complete = io_complete_rw;
1784         }
1785
1786         req->rw.addr = READ_ONCE(sqe->addr);
1787         req->rw.len = READ_ONCE(sqe->len);
1788         /* we own ->private, reuse it for the buffer index */
1789         req->rw.kiocb.private = (void *) (unsigned long)
1790                                         READ_ONCE(sqe->buf_index);
1791         return 0;
1792 }
1793
1794 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1795 {
1796         switch (ret) {
1797         case -EIOCBQUEUED:
1798                 break;
1799         case -ERESTARTSYS:
1800         case -ERESTARTNOINTR:
1801         case -ERESTARTNOHAND:
1802         case -ERESTART_RESTARTBLOCK:
1803                 /*
1804                  * We can't just restart the syscall, since previously
1805                  * submitted sqes may already be in progress. Just fail this
1806                  * IO with EINTR.
1807                  */
1808                 ret = -EINTR;
1809                 /* fall through */
1810         default:
1811                 kiocb->ki_complete(kiocb, ret, 0);
1812         }
1813 }
1814
1815 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1816                        bool in_async)
1817 {
1818         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1819
1820         if (req->flags & REQ_F_CUR_POS)
1821                 req->file->f_pos = kiocb->ki_pos;
1822         if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
1823                 *nxt = __io_complete_rw(kiocb, ret);
1824         else
1825                 io_rw_done(kiocb, ret);
1826 }
1827
1828 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
1829                                struct iov_iter *iter)
1830 {
1831         struct io_ring_ctx *ctx = req->ctx;
1832         size_t len = req->rw.len;
1833         struct io_mapped_ubuf *imu;
1834         unsigned index, buf_index;
1835         size_t offset;
1836         u64 buf_addr;
1837
1838         /* attempt to use fixed buffers without having provided iovecs */
1839         if (unlikely(!ctx->user_bufs))
1840                 return -EFAULT;
1841
1842         buf_index = (unsigned long) req->rw.kiocb.private;
1843         if (unlikely(buf_index >= ctx->nr_user_bufs))
1844                 return -EFAULT;
1845
1846         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1847         imu = &ctx->user_bufs[index];
1848         buf_addr = req->rw.addr;
1849
1850         /* overflow */
1851         if (buf_addr + len < buf_addr)
1852                 return -EFAULT;
1853         /* not inside the mapped region */
1854         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1855                 return -EFAULT;
1856
1857         /*
1858          * May not be a start of buffer, set size appropriately
1859          * and advance us to the beginning.
1860          */
1861         offset = buf_addr - imu->ubuf;
1862         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1863
1864         if (offset) {
1865                 /*
1866                  * Don't use iov_iter_advance() here, as it's really slow for
1867                  * using the latter parts of a big fixed buffer - it iterates
1868                  * over each segment manually. We can cheat a bit here, because
1869                  * we know that:
1870                  *
1871                  * 1) it's a BVEC iter, we set it up
1872                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1873                  *    first and last bvec
1874                  *
1875                  * So just find our index, and adjust the iterator afterwards.
1876                  * If the offset is within the first bvec (or the whole first
1877                  * bvec, just use iov_iter_advance(). This makes it easier
1878                  * since we can just skip the first segment, which may not
1879                  * be PAGE_SIZE aligned.
1880                  */
1881                 const struct bio_vec *bvec = imu->bvec;
1882
1883                 if (offset <= bvec->bv_len) {
1884                         iov_iter_advance(iter, offset);
1885                 } else {
1886                         unsigned long seg_skip;
1887
1888                         /* skip first vec */
1889                         offset -= bvec->bv_len;
1890                         seg_skip = 1 + (offset >> PAGE_SHIFT);
1891
1892                         iter->bvec = bvec + seg_skip;
1893                         iter->nr_segs -= seg_skip;
1894                         iter->count -= bvec->bv_len + offset;
1895                         iter->iov_offset = offset & ~PAGE_MASK;
1896                 }
1897         }
1898
1899         return len;
1900 }
1901
1902 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
1903                                struct iovec **iovec, struct iov_iter *iter)
1904 {
1905         void __user *buf = u64_to_user_ptr(req->rw.addr);
1906         size_t sqe_len = req->rw.len;
1907         u8 opcode;
1908
1909         opcode = req->opcode;
1910         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
1911                 *iovec = NULL;
1912                 return io_import_fixed(req, rw, iter);
1913         }
1914
1915         /* buffer index only valid with fixed read/write */
1916         if (req->rw.kiocb.private)
1917                 return -EINVAL;
1918
1919         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
1920                 ssize_t ret;
1921                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
1922                 *iovec = NULL;
1923                 return ret;
1924         }
1925
1926         if (req->io) {
1927                 struct io_async_rw *iorw = &req->io->rw;
1928
1929                 *iovec = iorw->iov;
1930                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
1931                 if (iorw->iov == iorw->fast_iov)
1932                         *iovec = NULL;
1933                 return iorw->size;
1934         }
1935
1936         if (!req->has_user)
1937                 return -EFAULT;
1938
1939 #ifdef CONFIG_COMPAT
1940         if (req->ctx->compat)
1941                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1942                                                 iovec, iter);
1943 #endif
1944
1945         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1946 }
1947
1948 /*
1949  * For files that don't have ->read_iter() and ->write_iter(), handle them
1950  * by looping over ->read() or ->write() manually.
1951  */
1952 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1953                            struct iov_iter *iter)
1954 {
1955         ssize_t ret = 0;
1956
1957         /*
1958          * Don't support polled IO through this interface, and we can't
1959          * support non-blocking either. For the latter, this just causes
1960          * the kiocb to be handled from an async context.
1961          */
1962         if (kiocb->ki_flags & IOCB_HIPRI)
1963                 return -EOPNOTSUPP;
1964         if (kiocb->ki_flags & IOCB_NOWAIT)
1965                 return -EAGAIN;
1966
1967         while (iov_iter_count(iter)) {
1968                 struct iovec iovec;
1969                 ssize_t nr;
1970
1971                 if (!iov_iter_is_bvec(iter)) {
1972                         iovec = iov_iter_iovec(iter);
1973                 } else {
1974                         /* fixed buffers import bvec */
1975                         iovec.iov_base = kmap(iter->bvec->bv_page)
1976                                                 + iter->iov_offset;
1977                         iovec.iov_len = min(iter->count,
1978                                         iter->bvec->bv_len - iter->iov_offset);
1979                 }
1980
1981                 if (rw == READ) {
1982                         nr = file->f_op->read(file, iovec.iov_base,
1983                                               iovec.iov_len, &kiocb->ki_pos);
1984                 } else {
1985                         nr = file->f_op->write(file, iovec.iov_base,
1986                                                iovec.iov_len, &kiocb->ki_pos);
1987                 }
1988
1989                 if (iov_iter_is_bvec(iter))
1990                         kunmap(iter->bvec->bv_page);
1991
1992                 if (nr < 0) {
1993                         if (!ret)
1994                                 ret = nr;
1995                         break;
1996                 }
1997                 ret += nr;
1998                 if (nr != iovec.iov_len)
1999                         break;
2000                 iov_iter_advance(iter, nr);
2001         }
2002
2003         return ret;
2004 }
2005
2006 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2007                           struct iovec *iovec, struct iovec *fast_iov,
2008                           struct iov_iter *iter)
2009 {
2010         req->io->rw.nr_segs = iter->nr_segs;
2011         req->io->rw.size = io_size;
2012         req->io->rw.iov = iovec;
2013         if (!req->io->rw.iov) {
2014                 req->io->rw.iov = req->io->rw.fast_iov;
2015                 memcpy(req->io->rw.iov, fast_iov,
2016                         sizeof(struct iovec) * iter->nr_segs);
2017         }
2018 }
2019
2020 static int io_alloc_async_ctx(struct io_kiocb *req)
2021 {
2022         if (!io_op_defs[req->opcode].async_ctx)
2023                 return 0;
2024         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2025         return req->io == NULL;
2026 }
2027
2028 static void io_rw_async(struct io_wq_work **workptr)
2029 {
2030         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2031         struct iovec *iov = NULL;
2032
2033         if (req->io->rw.iov != req->io->rw.fast_iov)
2034                 iov = req->io->rw.iov;
2035         io_wq_submit_work(workptr);
2036         kfree(iov);
2037 }
2038
2039 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2040                              struct iovec *iovec, struct iovec *fast_iov,
2041                              struct iov_iter *iter)
2042 {
2043         if (req->opcode == IORING_OP_READ_FIXED ||
2044             req->opcode == IORING_OP_WRITE_FIXED)
2045                 return 0;
2046         if (!req->io && io_alloc_async_ctx(req))
2047                 return -ENOMEM;
2048
2049         io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2050         req->work.func = io_rw_async;
2051         return 0;
2052 }
2053
2054 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2055                         bool force_nonblock)
2056 {
2057         struct io_async_ctx *io;
2058         struct iov_iter iter;
2059         ssize_t ret;
2060
2061         ret = io_prep_rw(req, sqe, force_nonblock);
2062         if (ret)
2063                 return ret;
2064
2065         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2066                 return -EBADF;
2067
2068         if (!req->io)
2069                 return 0;
2070
2071         io = req->io;
2072         io->rw.iov = io->rw.fast_iov;
2073         req->io = NULL;
2074         ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2075         req->io = io;
2076         if (ret < 0)
2077                 return ret;
2078
2079         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2080         return 0;
2081 }
2082
2083 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
2084                    bool force_nonblock)
2085 {
2086         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2087         struct kiocb *kiocb = &req->rw.kiocb;
2088         struct iov_iter iter;
2089         size_t iov_count;
2090         ssize_t io_size, ret;
2091
2092         ret = io_import_iovec(READ, req, &iovec, &iter);
2093         if (ret < 0)
2094                 return ret;
2095
2096         /* Ensure we clear previously set non-block flag */
2097         if (!force_nonblock)
2098                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2099
2100         req->result = 0;
2101         io_size = ret;
2102         if (req->flags & REQ_F_LINK)
2103                 req->result = io_size;
2104
2105         /*
2106          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2107          * we know to async punt it even if it was opened O_NONBLOCK
2108          */
2109         if (force_nonblock && !io_file_supports_async(req->file)) {
2110                 req->flags |= REQ_F_MUST_PUNT;
2111                 goto copy_iov;
2112         }
2113
2114         iov_count = iov_iter_count(&iter);
2115         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2116         if (!ret) {
2117                 ssize_t ret2;
2118
2119                 if (req->file->f_op->read_iter)
2120                         ret2 = call_read_iter(req->file, kiocb, &iter);
2121                 else
2122                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
2123
2124                 /* Catch -EAGAIN return for forced non-blocking submission */
2125                 if (!force_nonblock || ret2 != -EAGAIN) {
2126                         kiocb_done(kiocb, ret2, nxt, req->in_async);
2127                 } else {
2128 copy_iov:
2129                         ret = io_setup_async_rw(req, io_size, iovec,
2130                                                 inline_vecs, &iter);
2131                         if (ret)
2132                                 goto out_free;
2133                         return -EAGAIN;
2134                 }
2135         }
2136 out_free:
2137         if (!io_wq_current_is_worker())
2138                 kfree(iovec);
2139         return ret;
2140 }
2141
2142 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2143                          bool force_nonblock)
2144 {
2145         struct io_async_ctx *io;
2146         struct iov_iter iter;
2147         ssize_t ret;
2148
2149         ret = io_prep_rw(req, sqe, force_nonblock);
2150         if (ret)
2151                 return ret;
2152
2153         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2154                 return -EBADF;
2155
2156         if (!req->io)
2157                 return 0;
2158
2159         io = req->io;
2160         io->rw.iov = io->rw.fast_iov;
2161         req->io = NULL;
2162         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2163         req->io = io;
2164         if (ret < 0)
2165                 return ret;
2166
2167         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2168         return 0;
2169 }
2170
2171 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
2172                     bool force_nonblock)
2173 {
2174         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2175         struct kiocb *kiocb = &req->rw.kiocb;
2176         struct iov_iter iter;
2177         size_t iov_count;
2178         ssize_t ret, io_size;
2179
2180         ret = io_import_iovec(WRITE, req, &iovec, &iter);
2181         if (ret < 0)
2182                 return ret;
2183
2184         /* Ensure we clear previously set non-block flag */
2185         if (!force_nonblock)
2186                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2187
2188         req->result = 0;
2189         io_size = ret;
2190         if (req->flags & REQ_F_LINK)
2191                 req->result = io_size;
2192
2193         /*
2194          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2195          * we know to async punt it even if it was opened O_NONBLOCK
2196          */
2197         if (force_nonblock && !io_file_supports_async(req->file)) {
2198                 req->flags |= REQ_F_MUST_PUNT;
2199                 goto copy_iov;
2200         }
2201
2202         /* file path doesn't support NOWAIT for non-direct_IO */
2203         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2204             (req->flags & REQ_F_ISREG))
2205                 goto copy_iov;
2206
2207         iov_count = iov_iter_count(&iter);
2208         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2209         if (!ret) {
2210                 ssize_t ret2;
2211
2212                 /*
2213                  * Open-code file_start_write here to grab freeze protection,
2214                  * which will be released by another thread in
2215                  * io_complete_rw().  Fool lockdep by telling it the lock got
2216                  * released so that it doesn't complain about the held lock when
2217                  * we return to userspace.
2218                  */
2219                 if (req->flags & REQ_F_ISREG) {
2220                         __sb_start_write(file_inode(req->file)->i_sb,
2221                                                 SB_FREEZE_WRITE, true);
2222                         __sb_writers_release(file_inode(req->file)->i_sb,
2223                                                 SB_FREEZE_WRITE);
2224                 }
2225                 kiocb->ki_flags |= IOCB_WRITE;
2226
2227                 if (req->file->f_op->write_iter)
2228                         ret2 = call_write_iter(req->file, kiocb, &iter);
2229                 else
2230                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
2231                 if (!force_nonblock || ret2 != -EAGAIN) {
2232                         kiocb_done(kiocb, ret2, nxt, req->in_async);
2233                 } else {
2234 copy_iov:
2235                         ret = io_setup_async_rw(req, io_size, iovec,
2236                                                 inline_vecs, &iter);
2237                         if (ret)
2238                                 goto out_free;
2239                         return -EAGAIN;
2240                 }
2241         }
2242 out_free:
2243         if (!io_wq_current_is_worker())
2244                 kfree(iovec);
2245         return ret;
2246 }
2247
2248 /*
2249  * IORING_OP_NOP just posts a completion event, nothing else.
2250  */
2251 static int io_nop(struct io_kiocb *req)
2252 {
2253         struct io_ring_ctx *ctx = req->ctx;
2254
2255         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2256                 return -EINVAL;
2257
2258         io_cqring_add_event(req, 0);
2259         io_put_req(req);
2260         return 0;
2261 }
2262
2263 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2264 {
2265         struct io_ring_ctx *ctx = req->ctx;
2266
2267         if (!req->file)
2268                 return -EBADF;
2269
2270         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2271                 return -EINVAL;
2272         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2273                 return -EINVAL;
2274
2275         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2276         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2277                 return -EINVAL;
2278
2279         req->sync.off = READ_ONCE(sqe->off);
2280         req->sync.len = READ_ONCE(sqe->len);
2281         return 0;
2282 }
2283
2284 static bool io_req_cancelled(struct io_kiocb *req)
2285 {
2286         if (req->work.flags & IO_WQ_WORK_CANCEL) {
2287                 req_set_fail_links(req);
2288                 io_cqring_add_event(req, -ECANCELED);
2289                 io_put_req(req);
2290                 return true;
2291         }
2292
2293         return false;
2294 }
2295
2296 static void io_link_work_cb(struct io_wq_work **workptr)
2297 {
2298         struct io_wq_work *work = *workptr;
2299         struct io_kiocb *link = work->data;
2300
2301         io_queue_linked_timeout(link);
2302         work->func = io_wq_submit_work;
2303 }
2304
2305 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
2306 {
2307         struct io_kiocb *link;
2308
2309         io_prep_async_work(nxt, &link);
2310         *workptr = &nxt->work;
2311         if (link) {
2312                 nxt->work.flags |= IO_WQ_WORK_CB;
2313                 nxt->work.func = io_link_work_cb;
2314                 nxt->work.data = link;
2315         }
2316 }
2317
2318 static void io_fsync_finish(struct io_wq_work **workptr)
2319 {
2320         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2321         loff_t end = req->sync.off + req->sync.len;
2322         struct io_kiocb *nxt = NULL;
2323         int ret;
2324
2325         if (io_req_cancelled(req))
2326                 return;
2327
2328         ret = vfs_fsync_range(req->file, req->sync.off,
2329                                 end > 0 ? end : LLONG_MAX,
2330                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2331         if (ret < 0)
2332                 req_set_fail_links(req);
2333         io_cqring_add_event(req, ret);
2334         io_put_req_find_next(req, &nxt);
2335         if (nxt)
2336                 io_wq_assign_next(workptr, nxt);
2337 }
2338
2339 static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2340                     bool force_nonblock)
2341 {
2342         struct io_wq_work *work, *old_work;
2343
2344         /* fsync always requires a blocking context */
2345         if (force_nonblock) {
2346                 io_put_req(req);
2347                 req->work.func = io_fsync_finish;
2348                 return -EAGAIN;
2349         }
2350
2351         work = old_work = &req->work;
2352         io_fsync_finish(&work);
2353         if (work && work != old_work)
2354                 *nxt = container_of(work, struct io_kiocb, work);
2355         return 0;
2356 }
2357
2358 static void io_fallocate_finish(struct io_wq_work **workptr)
2359 {
2360         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2361         struct io_kiocb *nxt = NULL;
2362         int ret;
2363
2364         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2365                                 req->sync.len);
2366         if (ret < 0)
2367                 req_set_fail_links(req);
2368         io_cqring_add_event(req, ret);
2369         io_put_req_find_next(req, &nxt);
2370         if (nxt)
2371                 io_wq_assign_next(workptr, nxt);
2372 }
2373
2374 static int io_fallocate_prep(struct io_kiocb *req,
2375                              const struct io_uring_sqe *sqe)
2376 {
2377         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2378                 return -EINVAL;
2379
2380         req->sync.off = READ_ONCE(sqe->off);
2381         req->sync.len = READ_ONCE(sqe->addr);
2382         req->sync.mode = READ_ONCE(sqe->len);
2383         return 0;
2384 }
2385
2386 static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
2387                         bool force_nonblock)
2388 {
2389         struct io_wq_work *work, *old_work;
2390
2391         /* fallocate always requiring blocking context */
2392         if (force_nonblock) {
2393                 io_put_req(req);
2394                 req->work.func = io_fallocate_finish;
2395                 return -EAGAIN;
2396         }
2397
2398         work = old_work = &req->work;
2399         io_fallocate_finish(&work);
2400         if (work && work != old_work)
2401                 *nxt = container_of(work, struct io_kiocb, work);
2402
2403         return 0;
2404 }
2405
2406 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2407 {
2408         int ret;
2409
2410         if (sqe->ioprio || sqe->buf_index)
2411                 return -EINVAL;
2412
2413         req->open.dfd = READ_ONCE(sqe->fd);
2414         req->open.mode = READ_ONCE(sqe->len);
2415         req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2416         req->open.flags = READ_ONCE(sqe->open_flags);
2417
2418         req->open.filename = getname(req->open.fname);
2419         if (IS_ERR(req->open.filename)) {
2420                 ret = PTR_ERR(req->open.filename);
2421                 req->open.filename = NULL;
2422                 return ret;
2423         }
2424
2425         return 0;
2426 }
2427
2428 static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
2429                      bool force_nonblock)
2430 {
2431         struct open_flags op;
2432         struct open_how how;
2433         struct file *file;
2434         int ret;
2435
2436         if (force_nonblock) {
2437                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2438                 return -EAGAIN;
2439         }
2440
2441         how = build_open_how(req->open.flags, req->open.mode);
2442         ret = build_open_flags(&how, &op);
2443         if (ret)
2444                 goto err;
2445
2446         ret = get_unused_fd_flags(how.flags);
2447         if (ret < 0)
2448                 goto err;
2449
2450         file = do_filp_open(req->open.dfd, req->open.filename, &op);
2451         if (IS_ERR(file)) {
2452                 put_unused_fd(ret);
2453                 ret = PTR_ERR(file);
2454         } else {
2455                 fsnotify_open(file);
2456                 fd_install(ret, file);
2457         }
2458 err:
2459         putname(req->open.filename);
2460         if (ret < 0)
2461                 req_set_fail_links(req);
2462         io_cqring_add_event(req, ret);
2463         io_put_req_find_next(req, nxt);
2464         return 0;
2465 }
2466
2467 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2468 {
2469 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2470         if (sqe->ioprio || sqe->buf_index || sqe->off)
2471                 return -EINVAL;
2472
2473         req->madvise.addr = READ_ONCE(sqe->addr);
2474         req->madvise.len = READ_ONCE(sqe->len);
2475         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2476         return 0;
2477 #else
2478         return -EOPNOTSUPP;
2479 #endif
2480 }
2481
2482 static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
2483                       bool force_nonblock)
2484 {
2485 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2486         struct io_madvise *ma = &req->madvise;
2487         int ret;
2488
2489         if (force_nonblock)
2490                 return -EAGAIN;
2491
2492         ret = do_madvise(ma->addr, ma->len, ma->advice);
2493         if (ret < 0)
2494                 req_set_fail_links(req);
2495         io_cqring_add_event(req, ret);
2496         io_put_req_find_next(req, nxt);
2497         return 0;
2498 #else
2499         return -EOPNOTSUPP;
2500 #endif
2501 }
2502
2503 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2504 {
2505         if (sqe->ioprio || sqe->buf_index || sqe->addr)
2506                 return -EINVAL;
2507
2508         req->fadvise.offset = READ_ONCE(sqe->off);
2509         req->fadvise.len = READ_ONCE(sqe->len);
2510         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
2511         return 0;
2512 }
2513
2514 static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
2515                       bool force_nonblock)
2516 {
2517         struct io_fadvise *fa = &req->fadvise;
2518         int ret;
2519
2520         /* DONTNEED may block, others _should_ not */
2521         if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
2522                 return -EAGAIN;
2523
2524         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
2525         if (ret < 0)
2526                 req_set_fail_links(req);
2527         io_cqring_add_event(req, ret);
2528         io_put_req_find_next(req, nxt);
2529         return 0;
2530 }
2531
2532 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2533 {
2534         unsigned lookup_flags;
2535         int ret;
2536
2537         if (sqe->ioprio || sqe->buf_index)
2538                 return -EINVAL;
2539
2540         req->open.dfd = READ_ONCE(sqe->fd);
2541         req->open.mask = READ_ONCE(sqe->len);
2542         req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2543         req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2544         req->open.flags = READ_ONCE(sqe->statx_flags);
2545
2546         if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags))
2547                 return -EINVAL;
2548
2549         req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL);
2550         if (IS_ERR(req->open.filename)) {
2551                 ret = PTR_ERR(req->open.filename);
2552                 req->open.filename = NULL;
2553                 return ret;
2554         }
2555
2556         return 0;
2557 }
2558
2559 static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
2560                     bool force_nonblock)
2561 {
2562         struct io_open *ctx = &req->open;
2563         unsigned lookup_flags;
2564         struct path path;
2565         struct kstat stat;
2566         int ret;
2567
2568         if (force_nonblock)
2569                 return -EAGAIN;
2570
2571         if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags))
2572                 return -EINVAL;
2573
2574 retry:
2575         /* filename_lookup() drops it, keep a reference */
2576         ctx->filename->refcnt++;
2577
2578         ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
2579                                 NULL);
2580         if (ret)
2581                 goto err;
2582
2583         ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags);
2584         path_put(&path);
2585         if (retry_estale(ret, lookup_flags)) {
2586                 lookup_flags |= LOOKUP_REVAL;
2587                 goto retry;
2588         }
2589         if (!ret)
2590                 ret = cp_statx(&stat, ctx->buffer);
2591 err:
2592         putname(ctx->filename);
2593         if (ret < 0)
2594                 req_set_fail_links(req);
2595         io_cqring_add_event(req, ret);
2596         io_put_req_find_next(req, nxt);
2597         return 0;
2598 }
2599
2600 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2601 {
2602         /*
2603          * If we queue this for async, it must not be cancellable. That would
2604          * leave the 'file' in an undeterminate state.
2605          */
2606         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
2607
2608         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
2609             sqe->rw_flags || sqe->buf_index)
2610                 return -EINVAL;
2611         if (sqe->flags & IOSQE_FIXED_FILE)
2612                 return -EINVAL;
2613
2614         req->close.fd = READ_ONCE(sqe->fd);
2615         if (req->file->f_op == &io_uring_fops ||
2616             req->close.fd == req->ring_fd)
2617                 return -EBADF;
2618
2619         return 0;
2620 }
2621
2622 static void io_close_finish(struct io_wq_work **workptr)
2623 {
2624         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2625         struct io_kiocb *nxt = NULL;
2626
2627         /* Invoked with files, we need to do the close */
2628         if (req->work.files) {
2629                 int ret;
2630
2631                 ret = filp_close(req->close.put_file, req->work.files);
2632                 if (ret < 0) {
2633                         req_set_fail_links(req);
2634                 }
2635                 io_cqring_add_event(req, ret);
2636         }
2637
2638         fput(req->close.put_file);
2639
2640         /* we bypassed the re-issue, drop the submission reference */
2641         io_put_req(req);
2642         io_put_req_find_next(req, &nxt);
2643         if (nxt)
2644                 io_wq_assign_next(workptr, nxt);
2645 }
2646
2647 static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
2648                     bool force_nonblock)
2649 {
2650         int ret;
2651
2652         req->close.put_file = NULL;
2653         ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
2654         if (ret < 0)
2655                 return ret;
2656
2657         /* if the file has a flush method, be safe and punt to async */
2658         if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) {
2659                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2660                 goto eagain;
2661         }
2662
2663         /*
2664          * No ->flush(), safely close from here and just punt the
2665          * fput() to async context.
2666          */
2667         ret = filp_close(req->close.put_file, current->files);
2668
2669         if (ret < 0)
2670                 req_set_fail_links(req);
2671         io_cqring_add_event(req, ret);
2672
2673         if (io_wq_current_is_worker()) {
2674                 struct io_wq_work *old_work, *work;
2675
2676                 old_work = work = &req->work;
2677                 io_close_finish(&work);
2678                 if (work && work != old_work)
2679                         *nxt = container_of(work, struct io_kiocb, work);
2680                 return 0;
2681         }
2682
2683 eagain:
2684         req->work.func = io_close_finish;
2685         return -EAGAIN;
2686 }
2687
2688 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2689 {
2690         struct io_ring_ctx *ctx = req->ctx;
2691
2692         if (!req->file)
2693                 return -EBADF;
2694
2695         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2696                 return -EINVAL;
2697         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2698                 return -EINVAL;
2699
2700         req->sync.off = READ_ONCE(sqe->off);
2701         req->sync.len = READ_ONCE(sqe->len);
2702         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
2703         return 0;
2704 }
2705
2706 static void io_sync_file_range_finish(struct io_wq_work **workptr)
2707 {
2708         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2709         struct io_kiocb *nxt = NULL;
2710         int ret;
2711
2712         if (io_req_cancelled(req))
2713                 return;
2714
2715         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
2716                                 req->sync.flags);
2717         if (ret < 0)
2718                 req_set_fail_links(req);
2719         io_cqring_add_event(req, ret);
2720         io_put_req_find_next(req, &nxt);
2721         if (nxt)
2722                 io_wq_assign_next(workptr, nxt);
2723 }
2724
2725 static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
2726                               bool force_nonblock)
2727 {
2728         struct io_wq_work *work, *old_work;
2729
2730         /* sync_file_range always requires a blocking context */
2731         if (force_nonblock) {
2732                 io_put_req(req);
2733                 req->work.func = io_sync_file_range_finish;
2734                 return -EAGAIN;
2735         }
2736
2737         work = old_work = &req->work;
2738         io_sync_file_range_finish(&work);
2739         if (work && work != old_work)
2740                 *nxt = container_of(work, struct io_kiocb, work);
2741         return 0;
2742 }
2743
2744 #if defined(CONFIG_NET)
2745 static void io_sendrecv_async(struct io_wq_work **workptr)
2746 {
2747         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2748         struct iovec *iov = NULL;
2749
2750         if (req->io->rw.iov != req->io->rw.fast_iov)
2751                 iov = req->io->msg.iov;
2752         io_wq_submit_work(workptr);
2753         kfree(iov);
2754 }
2755 #endif
2756
2757 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2758 {
2759 #if defined(CONFIG_NET)
2760         struct io_sr_msg *sr = &req->sr_msg;
2761         struct io_async_ctx *io = req->io;
2762
2763         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2764         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2765
2766         if (!io)
2767                 return 0;
2768
2769         io->msg.iov = io->msg.fast_iov;
2770         return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2771                                         &io->msg.iov);
2772 #else
2773         return -EOPNOTSUPP;
2774 #endif
2775 }
2776
2777 static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2778                       bool force_nonblock)
2779 {
2780 #if defined(CONFIG_NET)
2781         struct io_async_msghdr *kmsg = NULL;
2782         struct socket *sock;
2783         int ret;
2784
2785         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2786                 return -EINVAL;
2787
2788         sock = sock_from_file(req->file, &ret);
2789         if (sock) {
2790                 struct io_async_ctx io;
2791                 struct sockaddr_storage addr;
2792                 unsigned flags;
2793
2794                 if (req->io) {
2795                         kmsg = &req->io->msg;
2796                         kmsg->msg.msg_name = &addr;
2797                         /* if iov is set, it's allocated already */
2798                         if (!kmsg->iov)
2799                                 kmsg->iov = kmsg->fast_iov;
2800                         kmsg->msg.msg_iter.iov = kmsg->iov;
2801                 } else {
2802                         struct io_sr_msg *sr = &req->sr_msg;
2803
2804                         kmsg = &io.msg;
2805                         kmsg->msg.msg_name = &addr;
2806
2807                         io.msg.iov = io.msg.fast_iov;
2808                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
2809                                         sr->msg_flags, &io.msg.iov);
2810                         if (ret)
2811                                 return ret;
2812                 }
2813
2814                 flags = req->sr_msg.msg_flags;
2815                 if (flags & MSG_DONTWAIT)
2816                         req->flags |= REQ_F_NOWAIT;
2817                 else if (force_nonblock)
2818                         flags |= MSG_DONTWAIT;
2819
2820                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
2821                 if (force_nonblock && ret == -EAGAIN) {
2822                         if (req->io)
2823                                 return -EAGAIN;
2824                         if (io_alloc_async_ctx(req))
2825                                 return -ENOMEM;
2826                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2827                         req->work.func = io_sendrecv_async;
2828                         return -EAGAIN;
2829                 }
2830                 if (ret == -ERESTARTSYS)
2831                         ret = -EINTR;
2832         }
2833
2834         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2835                 kfree(kmsg->iov);
2836         io_cqring_add_event(req, ret);
2837         if (ret < 0)
2838                 req_set_fail_links(req);
2839         io_put_req_find_next(req, nxt);
2840         return 0;
2841 #else
2842         return -EOPNOTSUPP;
2843 #endif
2844 }
2845
2846 static int io_recvmsg_prep(struct io_kiocb *req,
2847                            const struct io_uring_sqe *sqe)
2848 {
2849 #if defined(CONFIG_NET)
2850         struct io_sr_msg *sr = &req->sr_msg;
2851         struct io_async_ctx *io = req->io;
2852
2853         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2854         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2855
2856         if (!io)
2857                 return 0;
2858
2859         io->msg.iov = io->msg.fast_iov;
2860         return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2861                                         &io->msg.uaddr, &io->msg.iov);
2862 #else
2863         return -EOPNOTSUPP;
2864 #endif
2865 }
2866
2867 static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2868                       bool force_nonblock)
2869 {
2870 #if defined(CONFIG_NET)
2871         struct io_async_msghdr *kmsg = NULL;
2872         struct socket *sock;
2873         int ret;
2874
2875         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2876                 return -EINVAL;
2877
2878         sock = sock_from_file(req->file, &ret);
2879         if (sock) {
2880                 struct io_async_ctx io;
2881                 struct sockaddr_storage addr;
2882                 unsigned flags;
2883
2884                 if (req->io) {
2885                         kmsg = &req->io->msg;
2886                         kmsg->msg.msg_name = &addr;
2887                         /* if iov is set, it's allocated already */
2888                         if (!kmsg->iov)
2889                                 kmsg->iov = kmsg->fast_iov;
2890                         kmsg->msg.msg_iter.iov = kmsg->iov;
2891                 } else {
2892                         struct io_sr_msg *sr = &req->sr_msg;
2893
2894                         kmsg = &io.msg;
2895                         kmsg->msg.msg_name = &addr;
2896
2897                         io.msg.iov = io.msg.fast_iov;
2898                         ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
2899                                         sr->msg_flags, &io.msg.uaddr,
2900                                         &io.msg.iov);
2901                         if (ret)
2902                                 return ret;
2903                 }
2904
2905                 flags = req->sr_msg.msg_flags;
2906                 if (flags & MSG_DONTWAIT)
2907                         req->flags |= REQ_F_NOWAIT;
2908                 else if (force_nonblock)
2909                         flags |= MSG_DONTWAIT;
2910
2911                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
2912                                                 kmsg->uaddr, flags);
2913                 if (force_nonblock && ret == -EAGAIN) {
2914                         if (req->io)
2915                                 return -EAGAIN;
2916                         if (io_alloc_async_ctx(req))
2917                                 return -ENOMEM;
2918                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2919                         req->work.func = io_sendrecv_async;
2920                         return -EAGAIN;
2921                 }
2922                 if (ret == -ERESTARTSYS)
2923                         ret = -EINTR;
2924         }
2925
2926         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2927                 kfree(kmsg->iov);
2928         io_cqring_add_event(req, ret);
2929         if (ret < 0)
2930                 req_set_fail_links(req);
2931         io_put_req_find_next(req, nxt);
2932         return 0;
2933 #else
2934         return -EOPNOTSUPP;
2935 #endif
2936 }
2937
2938 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2939 {
2940 #if defined(CONFIG_NET)
2941         struct io_accept *accept = &req->accept;
2942
2943         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2944                 return -EINVAL;
2945         if (sqe->ioprio || sqe->len || sqe->buf_index)
2946                 return -EINVAL;
2947
2948         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2949         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2950         accept->flags = READ_ONCE(sqe->accept_flags);
2951         return 0;
2952 #else
2953         return -EOPNOTSUPP;
2954 #endif
2955 }
2956
2957 #if defined(CONFIG_NET)
2958 static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2959                        bool force_nonblock)
2960 {
2961         struct io_accept *accept = &req->accept;
2962         unsigned file_flags;
2963         int ret;
2964
2965         file_flags = force_nonblock ? O_NONBLOCK : 0;
2966         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
2967                                         accept->addr_len, accept->flags);
2968         if (ret == -EAGAIN && force_nonblock)
2969                 return -EAGAIN;
2970         if (ret == -ERESTARTSYS)
2971                 ret = -EINTR;
2972         if (ret < 0)
2973                 req_set_fail_links(req);
2974         io_cqring_add_event(req, ret);
2975         io_put_req_find_next(req, nxt);
2976         return 0;
2977 }
2978
2979 static void io_accept_finish(struct io_wq_work **workptr)
2980 {
2981         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2982         struct io_kiocb *nxt = NULL;
2983
2984         if (io_req_cancelled(req))
2985                 return;
2986         __io_accept(req, &nxt, false);
2987         if (nxt)
2988                 io_wq_assign_next(workptr, nxt);
2989 }
2990 #endif
2991
2992 static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2993                      bool force_nonblock)
2994 {
2995 #if defined(CONFIG_NET)
2996         int ret;
2997
2998         ret = __io_accept(req, nxt, force_nonblock);
2999         if (ret == -EAGAIN && force_nonblock) {
3000                 req->work.func = io_accept_finish;
3001                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
3002                 io_put_req(req);
3003                 return -EAGAIN;
3004         }
3005         return 0;
3006 #else
3007         return -EOPNOTSUPP;
3008 #endif
3009 }
3010
3011 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3012 {
3013 #if defined(CONFIG_NET)
3014         struct io_connect *conn = &req->connect;
3015         struct io_async_ctx *io = req->io;
3016
3017         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3018                 return -EINVAL;
3019         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3020                 return -EINVAL;
3021
3022         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3023         conn->addr_len =  READ_ONCE(sqe->addr2);
3024
3025         if (!io)
3026                 return 0;
3027
3028         return move_addr_to_kernel(conn->addr, conn->addr_len,
3029                                         &io->connect.address);
3030 #else
3031         return -EOPNOTSUPP;
3032 #endif
3033 }
3034
3035 static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
3036                       bool force_nonblock)
3037 {
3038 #if defined(CONFIG_NET)
3039         struct io_async_ctx __io, *io;
3040         unsigned file_flags;
3041         int ret;
3042
3043         if (req->io) {
3044                 io = req->io;
3045         } else {
3046                 ret = move_addr_to_kernel(req->connect.addr,
3047                                                 req->connect.addr_len,
3048                                                 &__io.connect.address);
3049                 if (ret)
3050                         goto out;
3051                 io = &__io;
3052         }
3053
3054         file_flags = force_nonblock ? O_NONBLOCK : 0;
3055
3056         ret = __sys_connect_file(req->file, &io->connect.address,
3057                                         req->connect.addr_len, file_flags);
3058         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
3059                 if (req->io)
3060                         return -EAGAIN;
3061                 if (io_alloc_async_ctx(req)) {
3062                         ret = -ENOMEM;
3063                         goto out;
3064                 }
3065                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
3066                 return -EAGAIN;
3067         }
3068         if (ret == -ERESTARTSYS)
3069                 ret = -EINTR;
3070 out:
3071         if (ret < 0)
3072                 req_set_fail_links(req);
3073         io_cqring_add_event(req, ret);
3074         io_put_req_find_next(req, nxt);
3075         return 0;
3076 #else
3077         return -EOPNOTSUPP;
3078 #endif
3079 }
3080
3081 static void io_poll_remove_one(struct io_kiocb *req)
3082 {
3083         struct io_poll_iocb *poll = &req->poll;
3084
3085         spin_lock(&poll->head->lock);
3086         WRITE_ONCE(poll->canceled, true);
3087         if (!list_empty(&poll->wait.entry)) {
3088                 list_del_init(&poll->wait.entry);
3089                 io_queue_async_work(req);
3090         }
3091         spin_unlock(&poll->head->lock);
3092         hash_del(&req->hash_node);
3093 }
3094
3095 static void io_poll_remove_all(struct io_ring_ctx *ctx)
3096 {
3097         struct hlist_node *tmp;
3098         struct io_kiocb *req;
3099         int i;
3100
3101         spin_lock_irq(&ctx->completion_lock);
3102         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3103                 struct hlist_head *list;
3104
3105                 list = &ctx->cancel_hash[i];
3106                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3107                         io_poll_remove_one(req);
3108         }
3109         spin_unlock_irq(&ctx->completion_lock);
3110 }
3111
3112 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3113 {
3114         struct hlist_head *list;
3115         struct io_kiocb *req;
3116
3117         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3118         hlist_for_each_entry(req, list, hash_node) {
3119                 if (sqe_addr == req->user_data) {
3120                         io_poll_remove_one(req);
3121                         return 0;
3122                 }
3123         }
3124
3125         return -ENOENT;
3126 }
3127
3128 static int io_poll_remove_prep(struct io_kiocb *req,
3129                                const struct io_uring_sqe *sqe)
3130 {
3131         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3132                 return -EINVAL;
3133         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3134             sqe->poll_events)
3135                 return -EINVAL;
3136
3137         req->poll.addr = READ_ONCE(sqe->addr);
3138         return 0;
3139 }
3140
3141 /*
3142  * Find a running poll command that matches one specified in sqe->addr,
3143  * and remove it if found.
3144  */
3145 static int io_poll_remove(struct io_kiocb *req)
3146 {
3147         struct io_ring_ctx *ctx = req->ctx;
3148         u64 addr;
3149         int ret;
3150
3151         addr = req->poll.addr;
3152         spin_lock_irq(&ctx->completion_lock);
3153         ret = io_poll_cancel(ctx, addr);
3154         spin_unlock_irq(&ctx->completion_lock);
3155
3156         io_cqring_add_event(req, ret);
3157         if (ret < 0)
3158                 req_set_fail_links(req);
3159         io_put_req(req);
3160         return 0;
3161 }
3162
3163 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
3164 {
3165         struct io_ring_ctx *ctx = req->ctx;
3166
3167         req->poll.done = true;
3168         if (error)
3169                 io_cqring_fill_event(req, error);
3170         else
3171                 io_cqring_fill_event(req, mangle_poll(mask));
3172         io_commit_cqring(ctx);
3173 }
3174
3175 static void io_poll_complete_work(struct io_wq_work **workptr)
3176 {
3177         struct io_wq_work *work = *workptr;
3178         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3179         struct io_poll_iocb *poll = &req->poll;
3180         struct poll_table_struct pt = { ._key = poll->events };
3181         struct io_ring_ctx *ctx = req->ctx;
3182         struct io_kiocb *nxt = NULL;
3183         __poll_t mask = 0;
3184         int ret = 0;
3185
3186         if (work->flags & IO_WQ_WORK_CANCEL) {
3187                 WRITE_ONCE(poll->canceled, true);
3188                 ret = -ECANCELED;
3189         } else if (READ_ONCE(poll->canceled)) {
3190                 ret = -ECANCELED;
3191         }
3192
3193         if (ret != -ECANCELED)
3194                 mask = vfs_poll(poll->file, &pt) & poll->events;
3195
3196         /*
3197          * Note that ->ki_cancel callers also delete iocb from active_reqs after
3198          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
3199          * synchronize with them.  In the cancellation case the list_del_init
3200          * itself is not actually needed, but harmless so we keep it in to
3201          * avoid further branches in the fast path.
3202          */
3203         spin_lock_irq(&ctx->completion_lock);
3204         if (!mask && ret != -ECANCELED) {
3205                 add_wait_queue(poll->head, &poll->wait);
3206                 spin_unlock_irq(&ctx->completion_lock);
3207                 return;
3208         }
3209         hash_del(&req->hash_node);
3210         io_poll_complete(req, mask, ret);
3211         spin_unlock_irq(&ctx->completion_lock);
3212
3213         io_cqring_ev_posted(ctx);
3214
3215         if (ret < 0)
3216                 req_set_fail_links(req);
3217         io_put_req_find_next(req, &nxt);
3218         if (nxt)
3219                 io_wq_assign_next(workptr, nxt);
3220 }
3221
3222 static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
3223 {
3224         void *reqs[IO_IOPOLL_BATCH];
3225         struct io_kiocb *req, *tmp;
3226         int to_free = 0;
3227
3228         spin_lock_irq(&ctx->completion_lock);
3229         llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
3230                 hash_del(&req->hash_node);
3231                 io_poll_complete(req, req->result, 0);
3232
3233                 if (refcount_dec_and_test(&req->refs)) {
3234                         if (io_req_multi_free(req)) {
3235                                 reqs[to_free++] = req;
3236                                 if (to_free == ARRAY_SIZE(reqs))
3237                                         io_free_req_many(ctx, reqs, &to_free);
3238                         } else {
3239                                 req->flags |= REQ_F_COMP_LOCKED;
3240                                 io_free_req(req);
3241                         }
3242                 }
3243         }
3244         spin_unlock_irq(&ctx->completion_lock);
3245
3246         io_cqring_ev_posted(ctx);
3247         io_free_req_many(ctx, reqs, &to_free);
3248 }
3249
3250 static void io_poll_flush(struct io_wq_work **workptr)
3251 {
3252         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3253         struct llist_node *nodes;
3254
3255         nodes = llist_del_all(&req->ctx->poll_llist);
3256         if (nodes)
3257                 __io_poll_flush(req->ctx, nodes);
3258 }
3259
3260 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3261                         void *key)
3262 {
3263         struct io_poll_iocb *poll = wait->private;
3264         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
3265         struct io_ring_ctx *ctx = req->ctx;
3266         __poll_t mask = key_to_poll(key);
3267
3268         /* for instances that support it check for an event match first: */
3269         if (mask && !(mask & poll->events))
3270                 return 0;
3271
3272         list_del_init(&poll->wait.entry);
3273
3274         /*
3275          * Run completion inline if we can. We're using trylock here because
3276          * we are violating the completion_lock -> poll wq lock ordering.
3277          * If we have a link timeout we're going to need the completion_lock
3278          * for finalizing the request, mark us as having grabbed that already.
3279          */
3280         if (mask) {
3281                 unsigned long flags;
3282
3283                 if (llist_empty(&ctx->poll_llist) &&
3284                     spin_trylock_irqsave(&ctx->completion_lock, flags)) {
3285                         hash_del(&req->hash_node);
3286                         io_poll_complete(req, mask, 0);
3287                         req->flags |= REQ_F_COMP_LOCKED;
3288                         io_put_req(req);
3289                         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3290
3291                         io_cqring_ev_posted(ctx);
3292                         req = NULL;
3293                 } else {
3294                         req->result = mask;
3295                         req->llist_node.next = NULL;
3296                         /* if the list wasn't empty, we're done */
3297                         if (!llist_add(&req->llist_node, &ctx->poll_llist))
3298                                 req = NULL;
3299                         else
3300                                 req->work.func = io_poll_flush;
3301                 }
3302         }
3303         if (req)
3304                 io_queue_async_work(req);
3305
3306         return 1;
3307 }
3308
3309 struct io_poll_table {
3310         struct poll_table_struct pt;
3311         struct io_kiocb *req;
3312         int error;
3313 };
3314
3315 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
3316                                struct poll_table_struct *p)
3317 {
3318         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3319
3320         if (unlikely(pt->req->poll.head)) {
3321                 pt->error = -EINVAL;
3322                 return;
3323         }
3324
3325         pt->error = 0;
3326         pt->req->poll.head = head;
3327         add_wait_queue(head, &pt->req->poll.wait);
3328 }
3329
3330 static void io_poll_req_insert(struct io_kiocb *req)
3331 {
3332         struct io_ring_ctx *ctx = req->ctx;
3333         struct hlist_head *list;
3334
3335         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3336         hlist_add_head(&req->hash_node, list);
3337 }
3338
3339 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3340 {
3341         struct io_poll_iocb *poll = &req->poll;
3342         u16 events;
3343
3344         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3345                 return -EINVAL;
3346         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
3347                 return -EINVAL;
3348         if (!poll->file)
3349                 return -EBADF;
3350
3351         events = READ_ONCE(sqe->poll_events);
3352         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
3353         return 0;
3354 }
3355
3356 static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
3357 {
3358         struct io_poll_iocb *poll = &req->poll;
3359         struct io_ring_ctx *ctx = req->ctx;
3360         struct io_poll_table ipt;
3361         bool cancel = false;
3362         __poll_t mask;
3363
3364         INIT_IO_WORK(&req->work, io_poll_complete_work);
3365         INIT_HLIST_NODE(&req->hash_node);
3366
3367         poll->head = NULL;
3368         poll->done = false;
3369         poll->canceled = false;
3370
3371         ipt.pt._qproc = io_poll_queue_proc;
3372         ipt.pt._key = poll->events;
3373         ipt.req = req;
3374         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
3375
3376         /* initialized the list so that we can do list_empty checks */
3377         INIT_LIST_HEAD(&poll->wait.entry);
3378         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
3379         poll->wait.private = poll;
3380
3381         INIT_LIST_HEAD(&req->list);
3382
3383         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
3384
3385         spin_lock_irq(&ctx->completion_lock);
3386         if (likely(poll->head)) {
3387                 spin_lock(&poll->head->lock);
3388                 if (unlikely(list_empty(&poll->wait.entry))) {
3389                         if (ipt.error)
3390                                 cancel = true;
3391                         ipt.error = 0;
3392                         mask = 0;
3393                 }
3394                 if (mask || ipt.error)
3395                         list_del_init(&poll->wait.entry);
3396                 else if (cancel)
3397                         WRITE_ONCE(poll->canceled, true);
3398                 else if (!poll->done) /* actually waiting for an event */
3399                         io_poll_req_insert(req);
3400                 spin_unlock(&poll->head->lock);
3401         }
3402         if (mask) { /* no async, we'd stolen it */
3403                 ipt.error = 0;
3404                 io_poll_complete(req, mask, 0);
3405         }
3406         spin_unlock_irq(&ctx->completion_lock);
3407
3408         if (mask) {
3409                 io_cqring_ev_posted(ctx);
3410                 io_put_req_find_next(req, nxt);
3411         }
3412         return ipt.error;
3413 }
3414
3415 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
3416 {
3417         struct io_timeout_data *data = container_of(timer,
3418                                                 struct io_timeout_data, timer);
3419         struct io_kiocb *req = data->req;
3420         struct io_ring_ctx *ctx = req->ctx;
3421         unsigned long flags;
3422
3423         atomic_inc(&ctx->cq_timeouts);
3424
3425         spin_lock_irqsave(&ctx->completion_lock, flags);
3426         /*
3427          * We could be racing with timeout deletion. If the list is empty,
3428          * then timeout lookup already found it and will be handling it.
3429          */
3430         if (!list_empty(&req->list)) {
3431                 struct io_kiocb *prev;
3432
3433                 /*
3434                  * Adjust the reqs sequence before the current one because it
3435                  * will consume a slot in the cq_ring and the cq_tail
3436                  * pointer will be increased, otherwise other timeout reqs may
3437                  * return in advance without waiting for enough wait_nr.
3438                  */
3439                 prev = req;
3440                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
3441                         prev->sequence++;
3442                 list_del_init(&req->list);
3443         }
3444
3445         io_cqring_fill_event(req, -ETIME);
3446         io_commit_cqring(ctx);
3447         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3448
3449         io_cqring_ev_posted(ctx);
3450         req_set_fail_links(req);
3451         io_put_req(req);
3452         return HRTIMER_NORESTART;
3453 }
3454
3455 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
3456 {
3457         struct io_kiocb *req;
3458         int ret = -ENOENT;
3459
3460         list_for_each_entry(req, &ctx->timeout_list, list) {
3461                 if (user_data == req->user_data) {
3462                         list_del_init(&req->list);
3463                         ret = 0;
3464                         break;
3465                 }
3466         }
3467
3468         if (ret == -ENOENT)
3469                 return ret;
3470
3471         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
3472         if (ret == -1)
3473                 return -EALREADY;
3474
3475         req_set_fail_links(req);
3476         io_cqring_fill_event(req, -ECANCELED);
3477         io_put_req(req);
3478         return 0;
3479 }
3480
3481 static int io_timeout_remove_prep(struct io_kiocb *req,
3482                                   const struct io_uring_sqe *sqe)
3483 {
3484         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3485                 return -EINVAL;
3486         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
3487                 return -EINVAL;
3488
3489         req->timeout.addr = READ_ONCE(sqe->addr);
3490         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
3491         if (req->timeout.flags)
3492                 return -EINVAL;
3493
3494         return 0;
3495 }
3496
3497 /*
3498  * Remove or update an existing timeout command
3499  */
3500 static int io_timeout_remove(struct io_kiocb *req)
3501 {
3502         struct io_ring_ctx *ctx = req->ctx;
3503         int ret;
3504
3505         spin_lock_irq(&ctx->completion_lock);
3506         ret = io_timeout_cancel(ctx, req->timeout.addr);
3507
3508         io_cqring_fill_event(req, ret);
3509         io_commit_cqring(ctx);
3510         spin_unlock_irq(&ctx->completion_lock);
3511         io_cqring_ev_posted(ctx);
3512         if (ret < 0)
3513                 req_set_fail_links(req);
3514         io_put_req(req);
3515         return 0;
3516 }
3517
3518 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3519                            bool is_timeout_link)
3520 {
3521         struct io_timeout_data *data;
3522         unsigned flags;
3523
3524         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3525                 return -EINVAL;
3526         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
3527                 return -EINVAL;
3528         if (sqe->off && is_timeout_link)
3529                 return -EINVAL;
3530         flags = READ_ONCE(sqe->timeout_flags);
3531         if (flags & ~IORING_TIMEOUT_ABS)
3532                 return -EINVAL;
3533
3534         req->timeout.count = READ_ONCE(sqe->off);
3535
3536         if (!req->io && io_alloc_async_ctx(req))
3537                 return -ENOMEM;
3538
3539         data = &req->io->timeout;
3540         data->req = req;
3541         req->flags |= REQ_F_TIMEOUT;
3542
3543         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
3544                 return -EFAULT;
3545
3546         if (flags & IORING_TIMEOUT_ABS)
3547                 data->mode = HRTIMER_MODE_ABS;
3548         else
3549                 data->mode = HRTIMER_MODE_REL;
3550
3551         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
3552         return 0;
3553 }
3554
3555 static int io_timeout(struct io_kiocb *req)
3556 {
3557         unsigned count;
3558         struct io_ring_ctx *ctx = req->ctx;
3559         struct io_timeout_data *data;
3560         struct list_head *entry;
3561         unsigned span = 0;
3562
3563         data = &req->io->timeout;
3564
3565         /*
3566          * sqe->off holds how many events that need to occur for this
3567          * timeout event to be satisfied. If it isn't set, then this is
3568          * a pure timeout request, sequence isn't used.
3569          */
3570         count = req->timeout.count;
3571         if (!count) {
3572                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
3573                 spin_lock_irq(&ctx->completion_lock);
3574                 entry = ctx->timeout_list.prev;
3575                 goto add;
3576         }
3577
3578         req->sequence = ctx->cached_sq_head + count - 1;
3579         data->seq_offset = count;
3580
3581         /*
3582          * Insertion sort, ensuring the first entry in the list is always
3583          * the one we need first.
3584          */
3585         spin_lock_irq(&ctx->completion_lock);
3586         list_for_each_prev(entry, &ctx->timeout_list) {
3587                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
3588                 unsigned nxt_sq_head;
3589                 long long tmp, tmp_nxt;
3590                 u32 nxt_offset = nxt->io->timeout.seq_offset;
3591
3592                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
3593                         continue;
3594
3595                 /*
3596                  * Since cached_sq_head + count - 1 can overflow, use type long
3597                  * long to store it.
3598                  */
3599                 tmp = (long long)ctx->cached_sq_head + count - 1;
3600                 nxt_sq_head = nxt->sequence - nxt_offset + 1;
3601                 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
3602
3603                 /*
3604                  * cached_sq_head may overflow, and it will never overflow twice
3605                  * once there is some timeout req still be valid.
3606                  */
3607                 if (ctx->cached_sq_head < nxt_sq_head)
3608                         tmp += UINT_MAX;
3609
3610                 if (tmp > tmp_nxt)
3611                         break;
3612
3613                 /*
3614                  * Sequence of reqs after the insert one and itself should
3615                  * be adjusted because each timeout req consumes a slot.
3616                  */
3617                 span++;
3618                 nxt->sequence++;
3619         }
3620         req->sequence -= span;
3621 add:
3622         list_add(&req->list, entry);
3623         data->timer.function = io_timeout_fn;
3624         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
3625         spin_unlock_irq(&ctx->completion_lock);
3626         return 0;
3627 }
3628
3629 static bool io_cancel_cb(struct io_wq_work *work, void *data)
3630 {
3631         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3632
3633         return req->user_data == (unsigned long) data;
3634 }
3635
3636 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
3637 {
3638         enum io_wq_cancel cancel_ret;
3639         int ret = 0;
3640
3641         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
3642         switch (cancel_ret) {
3643         case IO_WQ_CANCEL_OK:
3644                 ret = 0;
3645                 break;
3646         case IO_WQ_CANCEL_RUNNING:
3647                 ret = -EALREADY;
3648                 break;
3649         case IO_WQ_CANCEL_NOTFOUND:
3650                 ret = -ENOENT;
3651                 break;
3652         }
3653
3654         return ret;
3655 }
3656
3657 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
3658                                      struct io_kiocb *req, __u64 sqe_addr,
3659                                      struct io_kiocb **nxt, int success_ret)
3660 {
3661         unsigned long flags;
3662         int ret;
3663
3664         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3665         if (ret != -ENOENT) {
3666                 spin_lock_irqsave(&ctx->completion_lock, flags);
3667                 goto done;
3668         }
3669
3670         spin_lock_irqsave(&ctx->completion_lock, flags);
3671         ret = io_timeout_cancel(ctx, sqe_addr);
3672         if (ret != -ENOENT)
3673                 goto done;
3674         ret = io_poll_cancel(ctx, sqe_addr);
3675 done:
3676         if (!ret)
3677                 ret = success_ret;
3678         io_cqring_fill_event(req, ret);
3679         io_commit_cqring(ctx);
3680         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3681         io_cqring_ev_posted(ctx);
3682
3683         if (ret < 0)
3684                 req_set_fail_links(req);
3685         io_put_req_find_next(req, nxt);
3686 }
3687
3688 static int io_async_cancel_prep(struct io_kiocb *req,
3689                                 const struct io_uring_sqe *sqe)
3690 {
3691         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3692                 return -EINVAL;
3693         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
3694             sqe->cancel_flags)
3695                 return -EINVAL;
3696
3697         req->cancel.addr = READ_ONCE(sqe->addr);
3698         return 0;
3699 }
3700
3701 static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
3702 {
3703         struct io_ring_ctx *ctx = req->ctx;
3704
3705         io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
3706         return 0;
3707 }
3708
3709 static int io_files_update_prep(struct io_kiocb *req,
3710                                 const struct io_uring_sqe *sqe)
3711 {
3712         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
3713                 return -EINVAL;
3714
3715         req->files_update.offset = READ_ONCE(sqe->off);
3716         req->files_update.nr_args = READ_ONCE(sqe->len);
3717         if (!req->files_update.nr_args)
3718                 return -EINVAL;
3719         req->files_update.arg = READ_ONCE(sqe->addr);
3720         return 0;
3721 }
3722
3723 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
3724 {
3725         struct io_ring_ctx *ctx = req->ctx;
3726         struct io_uring_files_update up;
3727         int ret;
3728
3729         if (force_nonblock) {
3730                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
3731                 return -EAGAIN;
3732         }
3733
3734         up.offset = req->files_update.offset;
3735         up.fds = req->files_update.arg;
3736
3737         mutex_lock(&ctx->uring_lock);
3738         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
3739         mutex_unlock(&ctx->uring_lock);
3740
3741         if (ret < 0)
3742                 req_set_fail_links(req);
3743         io_cqring_add_event(req, ret);
3744         io_put_req(req);
3745         return 0;
3746 }
3747
3748 static int io_req_defer_prep(struct io_kiocb *req,
3749                              const struct io_uring_sqe *sqe)
3750 {
3751         ssize_t ret = 0;
3752
3753         switch (req->opcode) {
3754         case IORING_OP_NOP:
3755                 break;
3756         case IORING_OP_READV:
3757         case IORING_OP_READ_FIXED:
3758         case IORING_OP_READ:
3759                 ret = io_read_prep(req, sqe, true);
3760                 break;
3761         case IORING_OP_WRITEV:
3762         case IORING_OP_WRITE_FIXED:
3763         case IORING_OP_WRITE:
3764                 ret = io_write_prep(req, sqe, true);
3765                 break;
3766         case IORING_OP_POLL_ADD:
3767                 ret = io_poll_add_prep(req, sqe);
3768                 break;
3769         case IORING_OP_POLL_REMOVE:
3770                 ret = io_poll_remove_prep(req, sqe);
3771                 break;
3772         case IORING_OP_FSYNC:
3773                 ret = io_prep_fsync(req, sqe);
3774                 break;
3775         case IORING_OP_SYNC_FILE_RANGE:
3776                 ret = io_prep_sfr(req, sqe);
3777                 break;
3778         case IORING_OP_SENDMSG:
3779                 ret = io_sendmsg_prep(req, sqe);
3780                 break;
3781         case IORING_OP_RECVMSG:
3782                 ret = io_recvmsg_prep(req, sqe);
3783                 break;
3784         case IORING_OP_CONNECT:
3785                 ret = io_connect_prep(req, sqe);
3786                 break;
3787         case IORING_OP_TIMEOUT:
3788                 ret = io_timeout_prep(req, sqe, false);
3789                 break;
3790         case IORING_OP_TIMEOUT_REMOVE:
3791                 ret = io_timeout_remove_prep(req, sqe);
3792                 break;
3793         case IORING_OP_ASYNC_CANCEL:
3794                 ret = io_async_cancel_prep(req, sqe);
3795                 break;
3796         case IORING_OP_LINK_TIMEOUT:
3797                 ret = io_timeout_prep(req, sqe, true);
3798                 break;
3799         case IORING_OP_ACCEPT:
3800                 ret = io_accept_prep(req, sqe);
3801                 break;
3802         case IORING_OP_FALLOCATE:
3803                 ret = io_fallocate_prep(req, sqe);
3804                 break;
3805         case IORING_OP_OPENAT:
3806                 ret = io_openat_prep(req, sqe);
3807                 break;
3808         case IORING_OP_CLOSE:
3809                 ret = io_close_prep(req, sqe);
3810                 break;
3811         case IORING_OP_FILES_UPDATE:
3812                 ret = io_files_update_prep(req, sqe);
3813                 break;
3814         case IORING_OP_STATX:
3815                 ret = io_statx_prep(req, sqe);
3816                 break;
3817         case IORING_OP_FADVISE:
3818                 ret = io_fadvise_prep(req, sqe);
3819                 break;
3820         case IORING_OP_MADVISE:
3821                 ret = io_madvise_prep(req, sqe);
3822                 break;
3823         default:
3824                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
3825                                 req->opcode);
3826                 ret = -EINVAL;
3827                 break;
3828         }
3829
3830         return ret;
3831 }
3832
3833 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3834 {
3835         struct io_ring_ctx *ctx = req->ctx;
3836         int ret;
3837
3838         /* Still need defer if there is pending req in defer list. */
3839         if (!req_need_defer(req) && list_empty(&ctx->defer_list))
3840                 return 0;
3841
3842         if (!req->io && io_alloc_async_ctx(req))
3843                 return -EAGAIN;
3844
3845         ret = io_req_defer_prep(req, sqe);
3846         if (ret < 0)
3847                 return ret;
3848
3849         spin_lock_irq(&ctx->completion_lock);
3850         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
3851                 spin_unlock_irq(&ctx->completion_lock);
3852                 return 0;
3853         }
3854
3855         trace_io_uring_defer(ctx, req, req->user_data);
3856         list_add_tail(&req->list, &ctx->defer_list);
3857         spin_unlock_irq(&ctx->completion_lock);
3858         return -EIOCBQUEUED;
3859 }
3860
3861 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3862                         struct io_kiocb **nxt, bool force_nonblock)
3863 {
3864         struct io_ring_ctx *ctx = req->ctx;
3865         int ret;
3866
3867         switch (req->opcode) {
3868         case IORING_OP_NOP:
3869                 ret = io_nop(req);
3870                 break;
3871         case IORING_OP_READV:
3872         case IORING_OP_READ_FIXED:
3873         case IORING_OP_READ:
3874                 if (sqe) {
3875                         ret = io_read_prep(req, sqe, force_nonblock);
3876                         if (ret < 0)
3877                                 break;
3878                 }
3879                 ret = io_read(req, nxt, force_nonblock);
3880                 break;
3881         case IORING_OP_WRITEV:
3882         case IORING_OP_WRITE_FIXED:
3883         case IORING_OP_WRITE:
3884                 if (sqe) {
3885                         ret = io_write_prep(req, sqe, force_nonblock);
3886                         if (ret < 0)
3887                                 break;
3888                 }
3889                 ret = io_write(req, nxt, force_nonblock);
3890                 break;
3891         case IORING_OP_FSYNC:
3892                 if (sqe) {
3893                         ret = io_prep_fsync(req, sqe);
3894                         if (ret < 0)
3895                                 break;
3896                 }
3897                 ret = io_fsync(req, nxt, force_nonblock);
3898                 break;
3899         case IORING_OP_POLL_ADD:
3900                 if (sqe) {
3901                         ret = io_poll_add_prep(req, sqe);
3902                         if (ret)
3903                                 break;
3904                 }
3905                 ret = io_poll_add(req, nxt);
3906                 break;
3907         case IORING_OP_POLL_REMOVE:
3908                 if (sqe) {
3909                         ret = io_poll_remove_prep(req, sqe);
3910                         if (ret < 0)
3911                                 break;
3912                 }
3913                 ret = io_poll_remove(req);
3914                 break;
3915         case IORING_OP_SYNC_FILE_RANGE:
3916                 if (sqe) {
3917                         ret = io_prep_sfr(req, sqe);
3918                         if (ret < 0)
3919                                 break;
3920                 }
3921                 ret = io_sync_file_range(req, nxt, force_nonblock);
3922                 break;
3923         case IORING_OP_SENDMSG:
3924                 if (sqe) {
3925                         ret = io_sendmsg_prep(req, sqe);
3926                         if (ret < 0)
3927                                 break;
3928                 }
3929                 ret = io_sendmsg(req, nxt, force_nonblock);
3930                 break;
3931         case IORING_OP_RECVMSG:
3932                 if (sqe) {
3933                         ret = io_recvmsg_prep(req, sqe);
3934                         if (ret)
3935                                 break;
3936                 }
3937                 ret = io_recvmsg(req, nxt, force_nonblock);
3938                 break;
3939         case IORING_OP_TIMEOUT:
3940                 if (sqe) {
3941                         ret = io_timeout_prep(req, sqe, false);
3942                         if (ret)
3943                                 break;
3944                 }
3945                 ret = io_timeout(req);
3946                 break;
3947         case IORING_OP_TIMEOUT_REMOVE:
3948                 if (sqe) {
3949                         ret = io_timeout_remove_prep(req, sqe);
3950                         if (ret)
3951                                 break;
3952                 }
3953                 ret = io_timeout_remove(req);
3954                 break;
3955         case IORING_OP_ACCEPT:
3956                 if (sqe) {
3957                         ret = io_accept_prep(req, sqe);
3958                         if (ret)
3959                                 break;
3960                 }
3961                 ret = io_accept(req, nxt, force_nonblock);
3962                 break;
3963         case IORING_OP_CONNECT:
3964                 if (sqe) {
3965                         ret = io_connect_prep(req, sqe);
3966                         if (ret)
3967                                 break;
3968                 }
3969                 ret = io_connect(req, nxt, force_nonblock);
3970                 break;
3971         case IORING_OP_ASYNC_CANCEL:
3972                 if (sqe) {
3973                         ret = io_async_cancel_prep(req, sqe);
3974                         if (ret)
3975                                 break;
3976                 }
3977                 ret = io_async_cancel(req, nxt);
3978                 break;
3979         case IORING_OP_FALLOCATE:
3980                 if (sqe) {
3981                         ret = io_fallocate_prep(req, sqe);
3982                         if (ret)
3983                                 break;
3984                 }
3985                 ret = io_fallocate(req, nxt, force_nonblock);
3986                 break;
3987         case IORING_OP_OPENAT:
3988                 if (sqe) {
3989                         ret = io_openat_prep(req, sqe);
3990                         if (ret)
3991                                 break;
3992                 }
3993                 ret = io_openat(req, nxt, force_nonblock);
3994                 break;
3995         case IORING_OP_CLOSE:
3996                 if (sqe) {
3997                         ret = io_close_prep(req, sqe);
3998                         if (ret)
3999                                 break;
4000                 }
4001                 ret = io_close(req, nxt, force_nonblock);
4002                 break;
4003         case IORING_OP_FILES_UPDATE:
4004                 if (sqe) {
4005                         ret = io_files_update_prep(req, sqe);
4006                         if (ret)
4007                                 break;
4008                 }
4009                 ret = io_files_update(req, force_nonblock);
4010                 break;
4011         case IORING_OP_STATX:
4012                 if (sqe) {
4013                         ret = io_statx_prep(req, sqe);
4014                         if (ret)
4015                                 break;
4016                 }
4017                 ret = io_statx(req, nxt, force_nonblock);
4018                 break;
4019         case IORING_OP_FADVISE:
4020                 if (sqe) {
4021                         ret = io_fadvise_prep(req, sqe);
4022                         if (ret)
4023                                 break;
4024                 }
4025                 ret = io_fadvise(req, nxt, force_nonblock);
4026                 break;
4027         case IORING_OP_MADVISE:
4028                 if (sqe) {
4029                         ret = io_madvise_prep(req, sqe);
4030                         if (ret)
4031                                 break;
4032                 }
4033                 ret = io_madvise(req, nxt, force_nonblock);
4034                 break;
4035         default:
4036                 ret = -EINVAL;
4037                 break;
4038         }
4039
4040         if (ret)
4041                 return ret;
4042
4043         if (ctx->flags & IORING_SETUP_IOPOLL) {
4044                 const bool in_async = io_wq_current_is_worker();
4045
4046                 if (req->result == -EAGAIN)
4047                         return -EAGAIN;
4048
4049                 /* workqueue context doesn't hold uring_lock, grab it now */
4050                 if (in_async)
4051                         mutex_lock(&ctx->uring_lock);
4052
4053                 io_iopoll_req_issued(req);
4054
4055                 if (in_async)
4056                         mutex_unlock(&ctx->uring_lock);
4057         }
4058
4059         return 0;
4060 }
4061
4062 static void io_wq_submit_work(struct io_wq_work **workptr)
4063 {
4064         struct io_wq_work *work = *workptr;
4065         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4066         struct io_kiocb *nxt = NULL;
4067         int ret = 0;
4068
4069         /* if NO_CANCEL is set, we must still run the work */
4070         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4071                                 IO_WQ_WORK_CANCEL) {
4072                 ret = -ECANCELED;
4073         }
4074
4075         if (!ret) {
4076                 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
4077                 req->in_async = true;
4078                 do {
4079                         ret = io_issue_sqe(req, NULL, &nxt, false);
4080                         /*
4081                          * We can get EAGAIN for polled IO even though we're
4082                          * forcing a sync submission from here, since we can't
4083                          * wait for request slots on the block side.
4084                          */
4085                         if (ret != -EAGAIN)
4086                                 break;
4087                         cond_resched();
4088                 } while (1);
4089         }
4090
4091         /* drop submission reference */
4092         io_put_req(req);
4093
4094         if (ret) {
4095                 req_set_fail_links(req);
4096                 io_cqring_add_event(req, ret);
4097                 io_put_req(req);
4098         }
4099
4100         /* if a dependent link is ready, pass it back */
4101         if (!ret && nxt)
4102                 io_wq_assign_next(workptr, nxt);
4103 }
4104
4105 static int io_req_needs_file(struct io_kiocb *req, int fd)
4106 {
4107         if (!io_op_defs[req->opcode].needs_file)
4108                 return 0;
4109         if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
4110                 return 0;
4111         return 1;
4112 }
4113
4114 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4115                                               int index)
4116 {
4117         struct fixed_file_table *table;
4118
4119         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
4120         return table->files[index & IORING_FILE_TABLE_MASK];;
4121 }
4122
4123 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4124                            const struct io_uring_sqe *sqe)
4125 {
4126         struct io_ring_ctx *ctx = req->ctx;
4127         unsigned flags;
4128         int fd;
4129
4130         flags = READ_ONCE(sqe->flags);
4131         fd = READ_ONCE(sqe->fd);
4132
4133         if (flags & IOSQE_IO_DRAIN)
4134                 req->flags |= REQ_F_IO_DRAIN;
4135
4136         if (!io_req_needs_file(req, fd))
4137                 return 0;
4138
4139         if (flags & IOSQE_FIXED_FILE) {
4140                 if (unlikely(!ctx->file_data ||
4141                     (unsigned) fd >= ctx->nr_user_files))
4142                         return -EBADF;
4143                 fd = array_index_nospec(fd, ctx->nr_user_files);
4144                 req->file = io_file_from_index(ctx, fd);
4145                 if (!req->file)
4146                         return -EBADF;
4147                 req->flags |= REQ_F_FIXED_FILE;
4148                 percpu_ref_get(&ctx->file_data->refs);
4149         } else {
4150                 if (req->needs_fixed_file)
4151                         return -EBADF;
4152                 trace_io_uring_file_get(ctx, fd);
4153                 req->file = io_file_get(state, fd);
4154                 if (unlikely(!req->file))
4155                         return -EBADF;
4156         }
4157
4158         return 0;
4159 }
4160
4161 static int io_grab_files(struct io_kiocb *req)
4162 {
4163         int ret = -EBADF;
4164         struct io_ring_ctx *ctx = req->ctx;
4165
4166         if (!req->ring_file)
4167                 return -EBADF;
4168
4169         rcu_read_lock();
4170         spin_lock_irq(&ctx->inflight_lock);
4171         /*
4172          * We use the f_ops->flush() handler to ensure that we can flush
4173          * out work accessing these files if the fd is closed. Check if
4174          * the fd has changed since we started down this path, and disallow
4175          * this operation if it has.
4176          */
4177         if (fcheck(req->ring_fd) == req->ring_file) {
4178                 list_add(&req->inflight_entry, &ctx->inflight_list);
4179                 req->flags |= REQ_F_INFLIGHT;
4180                 req->work.files = current->files;
4181                 ret = 0;
4182         }
4183         spin_unlock_irq(&ctx->inflight_lock);
4184         rcu_read_unlock();
4185
4186         return ret;
4187 }
4188
4189 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
4190 {
4191         struct io_timeout_data *data = container_of(timer,
4192                                                 struct io_timeout_data, timer);
4193         struct io_kiocb *req = data->req;
4194         struct io_ring_ctx *ctx = req->ctx;
4195         struct io_kiocb *prev = NULL;
4196         unsigned long flags;
4197
4198         spin_lock_irqsave(&ctx->completion_lock, flags);
4199
4200         /*
4201          * We don't expect the list to be empty, that will only happen if we
4202          * race with the completion of the linked work.
4203          */
4204         if (!list_empty(&req->link_list)) {
4205                 prev = list_entry(req->link_list.prev, struct io_kiocb,
4206                                   link_list);
4207                 if (refcount_inc_not_zero(&prev->refs)) {
4208                         list_del_init(&req->link_list);
4209                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
4210                 } else
4211                         prev = NULL;
4212         }
4213
4214         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4215
4216         if (prev) {
4217                 req_set_fail_links(prev);
4218                 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
4219                                                 -ETIME);
4220                 io_put_req(prev);
4221         } else {
4222                 io_cqring_add_event(req, -ETIME);
4223                 io_put_req(req);
4224         }
4225         return HRTIMER_NORESTART;
4226 }
4227
4228 static void io_queue_linked_timeout(struct io_kiocb *req)
4229 {
4230         struct io_ring_ctx *ctx = req->ctx;
4231
4232         /*
4233          * If the list is now empty, then our linked request finished before
4234          * we got a chance to setup the timer
4235          */
4236         spin_lock_irq(&ctx->completion_lock);
4237         if (!list_empty(&req->link_list)) {
4238                 struct io_timeout_data *data = &req->io->timeout;
4239
4240                 data->timer.function = io_link_timeout_fn;
4241                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
4242                                 data->mode);
4243         }
4244         spin_unlock_irq(&ctx->completion_lock);
4245
4246         /* drop submission reference */
4247         io_put_req(req);
4248 }
4249
4250 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
4251 {
4252         struct io_kiocb *nxt;
4253
4254         if (!(req->flags & REQ_F_LINK))
4255                 return NULL;
4256
4257         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
4258                                         link_list);
4259         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
4260                 return NULL;
4261
4262         req->flags |= REQ_F_LINK_TIMEOUT;
4263         return nxt;
4264 }
4265
4266 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4267 {
4268         struct io_kiocb *linked_timeout;
4269         struct io_kiocb *nxt = NULL;
4270         int ret;
4271
4272 again:
4273         linked_timeout = io_prep_linked_timeout(req);
4274
4275         ret = io_issue_sqe(req, sqe, &nxt, true);
4276
4277         /*
4278          * We async punt it if the file wasn't marked NOWAIT, or if the file
4279          * doesn't support non-blocking read/write attempts
4280          */
4281         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
4282             (req->flags & REQ_F_MUST_PUNT))) {
4283                 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
4284                         ret = io_grab_files(req);
4285                         if (ret)
4286                                 goto err;
4287                 }
4288
4289                 /*
4290                  * Queued up for async execution, worker will release
4291                  * submit reference when the iocb is actually submitted.
4292                  */
4293                 io_queue_async_work(req);
4294                 goto done_req;
4295         }
4296
4297 err:
4298         /* drop submission reference */
4299         io_put_req(req);
4300
4301         if (linked_timeout) {
4302                 if (!ret)
4303                         io_queue_linked_timeout(linked_timeout);
4304                 else
4305                         io_put_req(linked_timeout);
4306         }
4307
4308         /* and drop final reference, if we failed */
4309         if (ret) {
4310                 io_cqring_add_event(req, ret);
4311                 req_set_fail_links(req);
4312                 io_put_req(req);
4313         }
4314 done_req:
4315         if (nxt) {
4316                 req = nxt;
4317                 nxt = NULL;
4318                 goto again;
4319         }
4320 }
4321
4322 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4323 {
4324         int ret;
4325
4326         if (unlikely(req->ctx->drain_next)) {
4327                 req->flags |= REQ_F_IO_DRAIN;
4328                 req->ctx->drain_next = false;
4329         }
4330         req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
4331
4332         ret = io_req_defer(req, sqe);
4333         if (ret) {
4334                 if (ret != -EIOCBQUEUED) {
4335                         io_cqring_add_event(req, ret);
4336                         req_set_fail_links(req);
4337                         io_double_put_req(req);
4338                 }
4339         } else if ((req->flags & REQ_F_FORCE_ASYNC) &&
4340                    !io_wq_current_is_worker()) {
4341                 /*
4342                  * Never try inline submit of IOSQE_ASYNC is set, go straight
4343                  * to async execution.
4344                  */
4345                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
4346                 io_queue_async_work(req);
4347         } else {
4348                 __io_queue_sqe(req, sqe);
4349         }
4350 }
4351
4352 static inline void io_queue_link_head(struct io_kiocb *req)
4353 {
4354         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
4355                 io_cqring_add_event(req, -ECANCELED);
4356                 io_double_put_req(req);
4357         } else
4358                 io_queue_sqe(req, NULL);
4359 }
4360
4361 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
4362                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
4363
4364 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4365                           struct io_submit_state *state, struct io_kiocb **link)
4366 {
4367         struct io_ring_ctx *ctx = req->ctx;
4368         unsigned int sqe_flags;
4369         int ret;
4370
4371         sqe_flags = READ_ONCE(sqe->flags);
4372
4373         /* enforce forwards compatibility on users */
4374         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
4375                 ret = -EINVAL;
4376                 goto err_req;
4377         }
4378         if (sqe_flags & IOSQE_ASYNC)
4379                 req->flags |= REQ_F_FORCE_ASYNC;
4380
4381         ret = io_req_set_file(state, req, sqe);
4382         if (unlikely(ret)) {
4383 err_req:
4384                 io_cqring_add_event(req, ret);
4385                 io_double_put_req(req);
4386                 return false;
4387         }
4388
4389         /*
4390          * If we already have a head request, queue this one for async
4391          * submittal once the head completes. If we don't have a head but
4392          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
4393          * submitted sync once the chain is complete. If none of those
4394          * conditions are true (normal request), then just queue it.
4395          */
4396         if (*link) {
4397                 struct io_kiocb *head = *link;
4398
4399                 if (sqe_flags & IOSQE_IO_DRAIN)
4400                         head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
4401
4402                 if (sqe_flags & IOSQE_IO_HARDLINK)
4403                         req->flags |= REQ_F_HARDLINK;
4404
4405                 if (io_alloc_async_ctx(req)) {
4406                         ret = -EAGAIN;
4407                         goto err_req;
4408                 }
4409
4410                 ret = io_req_defer_prep(req, sqe);
4411                 if (ret) {
4412                         /* fail even hard links since we don't submit */
4413                         head->flags |= REQ_F_FAIL_LINK;
4414                         goto err_req;
4415                 }
4416                 trace_io_uring_link(ctx, req, head);
4417                 list_add_tail(&req->link_list, &head->link_list);
4418
4419                 /* last request of a link, enqueue the link */
4420                 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
4421                         io_queue_link_head(head);
4422                         *link = NULL;
4423                 }
4424         } else if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
4425                 req->flags |= REQ_F_LINK;
4426                 if (sqe_flags & IOSQE_IO_HARDLINK)
4427                         req->flags |= REQ_F_HARDLINK;
4428
4429                 INIT_LIST_HEAD(&req->link_list);
4430                 ret = io_req_defer_prep(req, sqe);
4431                 if (ret)
4432                         req->flags |= REQ_F_FAIL_LINK;
4433                 *link = req;
4434         } else {
4435                 io_queue_sqe(req, sqe);
4436         }
4437
4438         return true;
4439 }
4440
4441 /*
4442  * Batched submission is done, ensure local IO is flushed out.
4443  */
4444 static void io_submit_state_end(struct io_submit_state *state)
4445 {
4446         blk_finish_plug(&state->plug);
4447         io_file_put(state);
4448         if (state->free_reqs)
4449                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
4450                                         &state->reqs[state->cur_req]);
4451 }
4452
4453 /*
4454  * Start submission side cache.
4455  */
4456 static void io_submit_state_start(struct io_submit_state *state,
4457                                   unsigned int max_ios)
4458 {
4459         blk_start_plug(&state->plug);
4460         state->free_reqs = 0;
4461         state->file = NULL;
4462         state->ios_left = max_ios;
4463 }
4464
4465 static void io_commit_sqring(struct io_ring_ctx *ctx)
4466 {
4467         struct io_rings *rings = ctx->rings;
4468
4469         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
4470                 /*
4471                  * Ensure any loads from the SQEs are done at this point,
4472                  * since once we write the new head, the application could
4473                  * write new data to them.
4474                  */
4475                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
4476         }
4477 }
4478
4479 /*
4480  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
4481  * that is mapped by userspace. This means that care needs to be taken to
4482  * ensure that reads are stable, as we cannot rely on userspace always
4483  * being a good citizen. If members of the sqe are validated and then later
4484  * used, it's important that those reads are done through READ_ONCE() to
4485  * prevent a re-load down the line.
4486  */
4487 static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
4488                           const struct io_uring_sqe **sqe_ptr)
4489 {
4490         struct io_rings *rings = ctx->rings;
4491         u32 *sq_array = ctx->sq_array;
4492         unsigned head;
4493
4494         /*
4495          * The cached sq head (or cq tail) serves two purposes:
4496          *
4497          * 1) allows us to batch the cost of updating the user visible
4498          *    head updates.
4499          * 2) allows the kernel side to track the head on its own, even
4500          *    though the application is the one updating it.
4501          */
4502         head = ctx->cached_sq_head;
4503         /* make sure SQ entry isn't read before tail */
4504         if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
4505                 return false;
4506
4507         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
4508         if (likely(head < ctx->sq_entries)) {
4509                 /*
4510                  * All io need record the previous position, if LINK vs DARIN,
4511                  * it can be used to mark the position of the first IO in the
4512                  * link list.
4513                  */
4514                 req->sequence = ctx->cached_sq_head;
4515                 *sqe_ptr = &ctx->sq_sqes[head];
4516                 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
4517                 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
4518                 ctx->cached_sq_head++;
4519                 return true;
4520         }
4521
4522         /* drop invalid entries */
4523         ctx->cached_sq_head++;
4524         ctx->cached_sq_dropped++;
4525         WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
4526         return false;
4527 }
4528
4529 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
4530                           struct file *ring_file, int ring_fd,
4531                           struct mm_struct **mm, bool async)
4532 {
4533         struct io_submit_state state, *statep = NULL;
4534         struct io_kiocb *link = NULL;
4535         int i, submitted = 0;
4536         bool mm_fault = false;
4537
4538         /* if we have a backlog and couldn't flush it all, return BUSY */
4539         if (test_bit(0, &ctx->sq_check_overflow)) {
4540                 if (!list_empty(&ctx->cq_overflow_list) &&
4541                     !io_cqring_overflow_flush(ctx, false))
4542                         return -EBUSY;
4543         }
4544
4545         if (!percpu_ref_tryget_many(&ctx->refs, nr))
4546                 return -EAGAIN;
4547
4548         if (nr > IO_PLUG_THRESHOLD) {
4549                 io_submit_state_start(&state, nr);
4550                 statep = &state;
4551         }
4552
4553         for (i = 0; i < nr; i++) {
4554                 const struct io_uring_sqe *sqe;
4555                 struct io_kiocb *req;
4556
4557                 req = io_get_req(ctx, statep);
4558                 if (unlikely(!req)) {
4559                         if (!submitted)
4560                                 submitted = -EAGAIN;
4561                         break;
4562                 }
4563                 if (!io_get_sqring(ctx, req, &sqe)) {
4564                         __io_req_do_free(req);
4565                         break;
4566                 }
4567
4568                 /* will complete beyond this point, count as submitted */
4569                 submitted++;
4570
4571                 if (unlikely(req->opcode >= IORING_OP_LAST)) {
4572                         io_cqring_add_event(req, -EINVAL);
4573                         io_double_put_req(req);
4574                         break;
4575                 }
4576
4577                 if (io_op_defs[req->opcode].needs_mm && !*mm) {
4578                         mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
4579                         if (!mm_fault) {
4580                                 use_mm(ctx->sqo_mm);
4581                                 *mm = ctx->sqo_mm;
4582                         }
4583                 }
4584
4585                 req->ring_file = ring_file;
4586                 req->ring_fd = ring_fd;
4587                 req->has_user = *mm != NULL;
4588                 req->in_async = async;
4589                 req->needs_fixed_file = async;
4590                 trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
4591                 if (!io_submit_sqe(req, sqe, statep, &link))
4592                         break;
4593         }
4594
4595         if (submitted != nr)
4596                 percpu_ref_put_many(&ctx->refs, nr - submitted);
4597         if (link)
4598                 io_queue_link_head(link);
4599         if (statep)
4600                 io_submit_state_end(&state);
4601
4602          /* Commit SQ ring head once we've consumed and submitted all SQEs */
4603         io_commit_sqring(ctx);
4604
4605         return submitted;
4606 }
4607
4608 static int io_sq_thread(void *data)
4609 {
4610         struct io_ring_ctx *ctx = data;
4611         struct mm_struct *cur_mm = NULL;
4612         const struct cred *old_cred;
4613         mm_segment_t old_fs;
4614         DEFINE_WAIT(wait);
4615         unsigned inflight;
4616         unsigned long timeout;
4617         int ret;
4618
4619         complete(&ctx->completions[1]);
4620
4621         old_fs = get_fs();
4622         set_fs(USER_DS);
4623         old_cred = override_creds(ctx->creds);
4624
4625         ret = timeout = inflight = 0;
4626         while (!kthread_should_park()) {
4627                 unsigned int to_submit;
4628
4629                 if (inflight) {
4630                         unsigned nr_events = 0;
4631
4632                         if (ctx->flags & IORING_SETUP_IOPOLL) {
4633                                 /*
4634                                  * inflight is the count of the maximum possible
4635                                  * entries we submitted, but it can be smaller
4636                                  * if we dropped some of them. If we don't have
4637                                  * poll entries available, then we know that we
4638                                  * have nothing left to poll for. Reset the
4639                                  * inflight count to zero in that case.
4640                                  */
4641                                 mutex_lock(&ctx->uring_lock);
4642                                 if (!list_empty(&ctx->poll_list))
4643                                         __io_iopoll_check(ctx, &nr_events, 0);
4644                                 else
4645                                         inflight = 0;
4646                                 mutex_unlock(&ctx->uring_lock);
4647                         } else {
4648                                 /*
4649                                  * Normal IO, just pretend everything completed.
4650                                  * We don't have to poll completions for that.
4651                                  */
4652                                 nr_events = inflight;
4653                         }
4654
4655                         inflight -= nr_events;
4656                         if (!inflight)
4657                                 timeout = jiffies + ctx->sq_thread_idle;
4658                 }
4659
4660                 to_submit = io_sqring_entries(ctx);
4661
4662                 /*
4663                  * If submit got -EBUSY, flag us as needing the application
4664                  * to enter the kernel to reap and flush events.
4665                  */
4666                 if (!to_submit || ret == -EBUSY) {
4667                         /*
4668                          * We're polling. If we're within the defined idle
4669                          * period, then let us spin without work before going
4670                          * to sleep. The exception is if we got EBUSY doing
4671                          * more IO, we should wait for the application to
4672                          * reap events and wake us up.
4673                          */
4674                         if (inflight ||
4675                             (!time_after(jiffies, timeout) && ret != -EBUSY)) {
4676                                 cond_resched();
4677                                 continue;
4678                         }
4679
4680                         /*
4681                          * Drop cur_mm before scheduling, we can't hold it for
4682                          * long periods (or over schedule()). Do this before
4683                          * adding ourselves to the waitqueue, as the unuse/drop
4684                          * may sleep.
4685                          */
4686                         if (cur_mm) {
4687                                 unuse_mm(cur_mm);
4688                                 mmput(cur_mm);
4689                                 cur_mm = NULL;
4690                         }
4691
4692                         prepare_to_wait(&ctx->sqo_wait, &wait,
4693                                                 TASK_INTERRUPTIBLE);
4694
4695                         /* Tell userspace we may need a wakeup call */
4696                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
4697                         /* make sure to read SQ tail after writing flags */
4698                         smp_mb();
4699
4700                         to_submit = io_sqring_entries(ctx);
4701                         if (!to_submit || ret == -EBUSY) {
4702                                 if (kthread_should_park()) {
4703                                         finish_wait(&ctx->sqo_wait, &wait);
4704                                         break;
4705                                 }
4706                                 if (signal_pending(current))
4707                                         flush_signals(current);
4708                                 schedule();
4709                                 finish_wait(&ctx->sqo_wait, &wait);
4710
4711                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
4712                                 continue;
4713                         }
4714                         finish_wait(&ctx->sqo_wait, &wait);
4715
4716                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
4717                 }
4718
4719                 to_submit = min(to_submit, ctx->sq_entries);
4720                 mutex_lock(&ctx->uring_lock);
4721                 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
4722                 mutex_unlock(&ctx->uring_lock);
4723                 if (ret > 0)
4724                         inflight += ret;
4725         }
4726
4727         set_fs(old_fs);
4728         if (cur_mm) {
4729                 unuse_mm(cur_mm);
4730                 mmput(cur_mm);
4731         }
4732         revert_creds(old_cred);
4733
4734         kthread_parkme();
4735
4736         return 0;
4737 }
4738
4739 struct io_wait_queue {
4740         struct wait_queue_entry wq;
4741         struct io_ring_ctx *ctx;
4742         unsigned to_wait;
4743         unsigned nr_timeouts;
4744 };
4745
4746 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
4747 {
4748         struct io_ring_ctx *ctx = iowq->ctx;
4749
4750         /*
4751          * Wake up if we have enough events, or if a timeout occurred since we
4752          * started waiting. For timeouts, we always want to return to userspace,
4753          * regardless of event count.
4754          */
4755         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
4756                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
4757 }
4758
4759 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
4760                             int wake_flags, void *key)
4761 {
4762         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
4763                                                         wq);
4764
4765         /* use noflush == true, as we can't safely rely on locking context */
4766         if (!io_should_wake(iowq, true))
4767                 return -1;
4768
4769         return autoremove_wake_function(curr, mode, wake_flags, key);
4770 }
4771
4772 /*
4773  * Wait until events become available, if we don't already have some. The
4774  * application must reap them itself, as they reside on the shared cq ring.
4775  */
4776 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
4777                           const sigset_t __user *sig, size_t sigsz)
4778 {
4779         struct io_wait_queue iowq = {
4780                 .wq = {
4781                         .private        = current,
4782                         .func           = io_wake_function,
4783                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
4784                 },
4785                 .ctx            = ctx,
4786                 .to_wait        = min_events,
4787         };
4788         struct io_rings *rings = ctx->rings;
4789         int ret = 0;
4790
4791         if (io_cqring_events(ctx, false) >= min_events)
4792                 return 0;
4793
4794         if (sig) {
4795 #ifdef CONFIG_COMPAT
4796                 if (in_compat_syscall())
4797                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
4798                                                       sigsz);
4799                 else
4800 #endif
4801                         ret = set_user_sigmask(sig, sigsz);
4802
4803                 if (ret)
4804                         return ret;
4805         }
4806
4807         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
4808         trace_io_uring_cqring_wait(ctx, min_events);
4809         do {
4810                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
4811                                                 TASK_INTERRUPTIBLE);
4812                 if (io_should_wake(&iowq, false))
4813                         break;
4814                 schedule();
4815                 if (signal_pending(current)) {
4816                         ret = -EINTR;
4817                         break;
4818                 }
4819         } while (1);
4820         finish_wait(&ctx->wait, &iowq.wq);
4821
4822         restore_saved_sigmask_unless(ret == -EINTR);
4823
4824         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
4825 }
4826
4827 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
4828 {
4829 #if defined(CONFIG_UNIX)
4830         if (ctx->ring_sock) {
4831                 struct sock *sock = ctx->ring_sock->sk;
4832                 struct sk_buff *skb;
4833
4834                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
4835                         kfree_skb(skb);
4836         }
4837 #else
4838         int i;
4839
4840         for (i = 0; i < ctx->nr_user_files; i++) {
4841                 struct file *file;
4842
4843                 file = io_file_from_index(ctx, i);
4844                 if (file)
4845                         fput(file);
4846         }
4847 #endif
4848 }
4849
4850 static void io_file_ref_kill(struct percpu_ref *ref)
4851 {
4852         struct fixed_file_data *data;
4853
4854         data = container_of(ref, struct fixed_file_data, refs);
4855         complete(&data->done);
4856 }
4857
4858 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
4859 {
4860         struct fixed_file_data *data = ctx->file_data;
4861         unsigned nr_tables, i;
4862
4863         if (!data)
4864                 return -ENXIO;
4865
4866         /* protect against inflight atomic switch, which drops the ref */
4867         flush_work(&data->ref_work);
4868         percpu_ref_get(&data->refs);
4869         percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
4870         wait_for_completion(&data->done);
4871         percpu_ref_put(&data->refs);
4872         percpu_ref_exit(&data->refs);
4873
4874         __io_sqe_files_unregister(ctx);
4875         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
4876         for (i = 0; i < nr_tables; i++)
4877                 kfree(data->table[i].files);
4878         kfree(data->table);
4879         kfree(data);
4880         ctx->file_data = NULL;
4881         ctx->nr_user_files = 0;
4882         return 0;
4883 }
4884
4885 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
4886 {
4887         if (ctx->sqo_thread) {
4888                 wait_for_completion(&ctx->completions[1]);
4889                 /*
4890                  * The park is a bit of a work-around, without it we get
4891                  * warning spews on shutdown with SQPOLL set and affinity
4892                  * set to a single CPU.
4893                  */
4894                 kthread_park(ctx->sqo_thread);
4895                 kthread_stop(ctx->sqo_thread);
4896                 ctx->sqo_thread = NULL;
4897         }
4898 }
4899
4900 static void io_finish_async(struct io_ring_ctx *ctx)
4901 {
4902         io_sq_thread_stop(ctx);
4903
4904         if (ctx->io_wq) {
4905                 io_wq_destroy(ctx->io_wq);
4906                 ctx->io_wq = NULL;
4907         }
4908 }
4909
4910 #if defined(CONFIG_UNIX)
4911 /*
4912  * Ensure the UNIX gc is aware of our file set, so we are certain that
4913  * the io_uring can be safely unregistered on process exit, even if we have
4914  * loops in the file referencing.
4915  */
4916 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
4917 {
4918         struct sock *sk = ctx->ring_sock->sk;
4919         struct scm_fp_list *fpl;
4920         struct sk_buff *skb;
4921         int i, nr_files;
4922
4923         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
4924                 unsigned long inflight = ctx->user->unix_inflight + nr;
4925
4926                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
4927                         return -EMFILE;
4928         }
4929
4930         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
4931         if (!fpl)
4932                 return -ENOMEM;
4933
4934         skb = alloc_skb(0, GFP_KERNEL);
4935         if (!skb) {
4936                 kfree(fpl);
4937                 return -ENOMEM;
4938         }
4939
4940         skb->sk = sk;
4941
4942         nr_files = 0;
4943         fpl->user = get_uid(ctx->user);
4944         for (i = 0; i < nr; i++) {
4945                 struct file *file = io_file_from_index(ctx, i + offset);
4946
4947                 if (!file)
4948                         continue;
4949                 fpl->fp[nr_files] = get_file(file);
4950                 unix_inflight(fpl->user, fpl->fp[nr_files]);
4951                 nr_files++;
4952         }
4953
4954         if (nr_files) {
4955                 fpl->max = SCM_MAX_FD;
4956                 fpl->count = nr_files;
4957                 UNIXCB(skb).fp = fpl;
4958                 skb->destructor = unix_destruct_scm;
4959                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
4960                 skb_queue_head(&sk->sk_receive_queue, skb);
4961
4962                 for (i = 0; i < nr_files; i++)
4963                         fput(fpl->fp[i]);
4964         } else {
4965                 kfree_skb(skb);
4966                 kfree(fpl);
4967         }
4968
4969         return 0;
4970 }
4971
4972 /*
4973  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
4974  * causes regular reference counting to break down. We rely on the UNIX
4975  * garbage collection to take care of this problem for us.
4976  */
4977 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4978 {
4979         unsigned left, total;
4980         int ret = 0;
4981
4982         total = 0;
4983         left = ctx->nr_user_files;
4984         while (left) {
4985                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
4986
4987                 ret = __io_sqe_files_scm(ctx, this_files, total);
4988                 if (ret)
4989                         break;
4990                 left -= this_files;
4991                 total += this_files;
4992         }
4993
4994         if (!ret)
4995                 return 0;
4996
4997         while (total < ctx->nr_user_files) {
4998                 struct file *file = io_file_from_index(ctx, total);
4999
5000                 if (file)
5001                         fput(file);
5002                 total++;
5003         }
5004
5005         return ret;
5006 }
5007 #else
5008 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5009 {
5010         return 0;
5011 }
5012 #endif
5013
5014 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5015                                     unsigned nr_files)
5016 {
5017         int i;
5018
5019         for (i = 0; i < nr_tables; i++) {
5020                 struct fixed_file_table *table = &ctx->file_data->table[i];
5021                 unsigned this_files;
5022
5023                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5024                 table->files = kcalloc(this_files, sizeof(struct file *),
5025                                         GFP_KERNEL);
5026                 if (!table->files)
5027                         break;
5028                 nr_files -= this_files;
5029         }
5030
5031         if (i == nr_tables)
5032                 return 0;
5033
5034         for (i = 0; i < nr_tables; i++) {
5035                 struct fixed_file_table *table = &ctx->file_data->table[i];
5036                 kfree(table->files);
5037         }
5038         return 1;
5039 }
5040
5041 static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
5042 {
5043 #if defined(CONFIG_UNIX)
5044         struct sock *sock = ctx->ring_sock->sk;
5045         struct sk_buff_head list, *head = &sock->sk_receive_queue;
5046         struct sk_buff *skb;
5047         int i;
5048
5049         __skb_queue_head_init(&list);
5050
5051         /*
5052          * Find the skb that holds this file in its SCM_RIGHTS. When found,
5053          * remove this entry and rearrange the file array.
5054          */
5055         skb = skb_dequeue(head);
5056         while (skb) {
5057                 struct scm_fp_list *fp;
5058
5059                 fp = UNIXCB(skb).fp;
5060                 for (i = 0; i < fp->count; i++) {
5061                         int left;
5062
5063                         if (fp->fp[i] != file)
5064                                 continue;
5065
5066                         unix_notinflight(fp->user, fp->fp[i]);
5067                         left = fp->count - 1 - i;
5068                         if (left) {
5069                                 memmove(&fp->fp[i], &fp->fp[i + 1],
5070                                                 left * sizeof(struct file *));
5071                         }
5072                         fp->count--;
5073                         if (!fp->count) {
5074                                 kfree_skb(skb);
5075                                 skb = NULL;
5076                         } else {
5077                                 __skb_queue_tail(&list, skb);
5078                         }
5079                         fput(file);
5080                         file = NULL;
5081                         break;
5082                 }
5083
5084                 if (!file)
5085                         break;
5086
5087                 __skb_queue_tail(&list, skb);
5088
5089                 skb = skb_dequeue(head);
5090         }
5091
5092         if (skb_peek(&list)) {
5093                 spin_lock_irq(&head->lock);
5094                 while ((skb = __skb_dequeue(&list)) != NULL)
5095                         __skb_queue_tail(head, skb);
5096                 spin_unlock_irq(&head->lock);
5097         }
5098 #else
5099         fput(file);
5100 #endif
5101 }
5102
5103 struct io_file_put {
5104         struct llist_node llist;
5105         struct file *file;
5106         struct completion *done;
5107 };
5108
5109 static void io_ring_file_ref_switch(struct work_struct *work)
5110 {
5111         struct io_file_put *pfile, *tmp;
5112         struct fixed_file_data *data;
5113         struct llist_node *node;
5114
5115         data = container_of(work, struct fixed_file_data, ref_work);
5116
5117         while ((node = llist_del_all(&data->put_llist)) != NULL) {
5118                 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5119                         io_ring_file_put(data->ctx, pfile->file);
5120                         if (pfile->done)
5121                                 complete(pfile->done);
5122                         else
5123                                 kfree(pfile);
5124                 }
5125         }
5126
5127         percpu_ref_get(&data->refs);
5128         percpu_ref_switch_to_percpu(&data->refs);
5129 }
5130
5131 static void io_file_data_ref_zero(struct percpu_ref *ref)
5132 {
5133         struct fixed_file_data *data;
5134
5135         data = container_of(ref, struct fixed_file_data, refs);
5136
5137         /* we can't safely switch from inside this context, punt to wq */
5138         queue_work(system_wq, &data->ref_work);
5139 }
5140
5141 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5142                                  unsigned nr_args)
5143 {
5144         __s32 __user *fds = (__s32 __user *) arg;
5145         unsigned nr_tables;
5146         struct file *file;
5147         int fd, ret = 0;
5148         unsigned i;
5149
5150         if (ctx->file_data)
5151                 return -EBUSY;
5152         if (!nr_args)
5153                 return -EINVAL;
5154         if (nr_args > IORING_MAX_FIXED_FILES)
5155                 return -EMFILE;
5156
5157         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5158         if (!ctx->file_data)
5159                 return -ENOMEM;
5160         ctx->file_data->ctx = ctx;
5161         init_completion(&ctx->file_data->done);
5162
5163         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
5164         ctx->file_data->table = kcalloc(nr_tables,
5165                                         sizeof(struct fixed_file_table),
5166                                         GFP_KERNEL);
5167         if (!ctx->file_data->table) {
5168                 kfree(ctx->file_data);
5169                 ctx->file_data = NULL;
5170                 return -ENOMEM;
5171         }
5172
5173         if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
5174                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
5175                 kfree(ctx->file_data->table);
5176                 kfree(ctx->file_data);
5177                 ctx->file_data = NULL;
5178                 return -ENOMEM;
5179         }
5180         ctx->file_data->put_llist.first = NULL;
5181         INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
5182
5183         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
5184                 percpu_ref_exit(&ctx->file_data->refs);
5185                 kfree(ctx->file_data->table);
5186                 kfree(ctx->file_data);
5187                 ctx->file_data = NULL;
5188                 return -ENOMEM;
5189         }
5190
5191         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
5192                 struct fixed_file_table *table;
5193                 unsigned index;
5194
5195                 ret = -EFAULT;
5196                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
5197                         break;
5198                 /* allow sparse sets */
5199                 if (fd == -1) {
5200                         ret = 0;
5201                         continue;
5202                 }
5203
5204                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
5205                 index = i & IORING_FILE_TABLE_MASK;
5206                 file = fget(fd);
5207
5208                 ret = -EBADF;
5209                 if (!file)
5210                         break;
5211
5212                 /*
5213                  * Don't allow io_uring instances to be registered. If UNIX
5214                  * isn't enabled, then this causes a reference cycle and this
5215                  * instance can never get freed. If UNIX is enabled we'll
5216                  * handle it just fine, but there's still no point in allowing
5217                  * a ring fd as it doesn't support regular read/write anyway.
5218                  */
5219                 if (file->f_op == &io_uring_fops) {
5220                         fput(file);
5221                         break;
5222                 }
5223                 ret = 0;
5224                 table->files[index] = file;
5225         }
5226
5227         if (ret) {
5228                 for (i = 0; i < ctx->nr_user_files; i++) {
5229                         file = io_file_from_index(ctx, i);
5230                         if (file)
5231                                 fput(file);
5232                 }
5233                 for (i = 0; i < nr_tables; i++)
5234                         kfree(ctx->file_data->table[i].files);
5235
5236                 kfree(ctx->file_data->table);
5237                 kfree(ctx->file_data);
5238                 ctx->file_data = NULL;
5239                 ctx->nr_user_files = 0;
5240                 return ret;
5241         }
5242
5243         ret = io_sqe_files_scm(ctx);
5244         if (ret)
5245                 io_sqe_files_unregister(ctx);
5246
5247         return ret;
5248 }
5249
5250 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
5251                                 int index)
5252 {
5253 #if defined(CONFIG_UNIX)
5254         struct sock *sock = ctx->ring_sock->sk;
5255         struct sk_buff_head *head = &sock->sk_receive_queue;
5256         struct sk_buff *skb;
5257
5258         /*
5259          * See if we can merge this file into an existing skb SCM_RIGHTS
5260          * file set. If there's no room, fall back to allocating a new skb
5261          * and filling it in.
5262          */
5263         spin_lock_irq(&head->lock);
5264         skb = skb_peek(head);
5265         if (skb) {
5266                 struct scm_fp_list *fpl = UNIXCB(skb).fp;
5267
5268                 if (fpl->count < SCM_MAX_FD) {
5269                         __skb_unlink(skb, head);
5270                         spin_unlock_irq(&head->lock);
5271                         fpl->fp[fpl->count] = get_file(file);
5272                         unix_inflight(fpl->user, fpl->fp[fpl->count]);
5273                         fpl->count++;
5274                         spin_lock_irq(&head->lock);
5275                         __skb_queue_head(head, skb);
5276                 } else {
5277                         skb = NULL;
5278                 }
5279         }
5280         spin_unlock_irq(&head->lock);
5281
5282         if (skb) {
5283                 fput(file);
5284                 return 0;
5285         }
5286
5287         return __io_sqe_files_scm(ctx, 1, index);
5288 #else
5289         return 0;
5290 #endif
5291 }
5292
5293 static void io_atomic_switch(struct percpu_ref *ref)
5294 {
5295         struct fixed_file_data *data;
5296
5297         data = container_of(ref, struct fixed_file_data, refs);
5298         clear_bit(FFD_F_ATOMIC, &data->state);
5299 }
5300
5301 static bool io_queue_file_removal(struct fixed_file_data *data,
5302                                   struct file *file)
5303 {
5304         struct io_file_put *pfile, pfile_stack;
5305         DECLARE_COMPLETION_ONSTACK(done);
5306
5307         /*
5308          * If we fail allocating the struct we need for doing async reomval
5309          * of this file, just punt to sync and wait for it.
5310          */
5311         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
5312         if (!pfile) {
5313                 pfile = &pfile_stack;
5314                 pfile->done = &done;
5315         }
5316
5317         pfile->file = file;
5318         llist_add(&pfile->llist, &data->put_llist);
5319
5320         if (pfile == &pfile_stack) {
5321                 if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5322                         percpu_ref_put(&data->refs);
5323                         percpu_ref_switch_to_atomic(&data->refs,
5324                                                         io_atomic_switch);
5325                 }
5326                 wait_for_completion(&done);
5327                 flush_work(&data->ref_work);
5328                 return false;
5329         }
5330
5331         return true;
5332 }
5333
5334 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
5335                                  struct io_uring_files_update *up,
5336                                  unsigned nr_args)
5337 {
5338         struct fixed_file_data *data = ctx->file_data;
5339         bool ref_switch = false;
5340         struct file *file;
5341         __s32 __user *fds;
5342         int fd, i, err;
5343         __u32 done;
5344
5345         if (check_add_overflow(up->offset, nr_args, &done))
5346                 return -EOVERFLOW;
5347         if (done > ctx->nr_user_files)
5348                 return -EINVAL;
5349
5350         done = 0;
5351         fds = u64_to_user_ptr(up->fds);
5352         while (nr_args) {
5353                 struct fixed_file_table *table;
5354                 unsigned index;
5355
5356                 err = 0;
5357                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
5358                         err = -EFAULT;
5359                         break;
5360                 }
5361                 i = array_index_nospec(up->offset, ctx->nr_user_files);
5362                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
5363                 index = i & IORING_FILE_TABLE_MASK;
5364                 if (table->files[index]) {
5365                         file = io_file_from_index(ctx, index);
5366                         table->files[index] = NULL;
5367                         if (io_queue_file_removal(data, file))
5368                                 ref_switch = true;
5369                 }
5370                 if (fd != -1) {
5371                         file = fget(fd);
5372                         if (!file) {
5373                                 err = -EBADF;
5374                                 break;
5375                         }
5376                         /*
5377                          * Don't allow io_uring instances to be registered. If
5378                          * UNIX isn't enabled, then this causes a reference
5379                          * cycle and this instance can never get freed. If UNIX
5380                          * is enabled we'll handle it just fine, but there's
5381                          * still no point in allowing a ring fd as it doesn't
5382                          * support regular read/write anyway.
5383                          */
5384                         if (file->f_op == &io_uring_fops) {
5385                                 fput(file);
5386                                 err = -EBADF;
5387                                 break;
5388                         }
5389                         table->files[index] = file;
5390                         err = io_sqe_file_register(ctx, file, i);
5391                         if (err)
5392                                 break;
5393                 }
5394                 nr_args--;
5395                 done++;
5396                 up->offset++;
5397         }
5398
5399         if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
5400                 percpu_ref_put(&data->refs);
5401                 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
5402         }
5403
5404         return done ? done : err;
5405 }
5406 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
5407                                unsigned nr_args)
5408 {
5409         struct io_uring_files_update up;
5410
5411         if (!ctx->file_data)
5412                 return -ENXIO;
5413         if (!nr_args)
5414                 return -EINVAL;
5415         if (copy_from_user(&up, arg, sizeof(up)))
5416                 return -EFAULT;
5417         if (up.resv)
5418                 return -EINVAL;
5419
5420         return __io_sqe_files_update(ctx, &up, nr_args);
5421 }
5422
5423 static void io_put_work(struct io_wq_work *work)
5424 {
5425         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5426
5427         io_put_req(req);
5428 }
5429
5430 static void io_get_work(struct io_wq_work *work)
5431 {
5432         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5433
5434         refcount_inc(&req->refs);
5435 }
5436
5437 static int io_sq_offload_start(struct io_ring_ctx *ctx,
5438                                struct io_uring_params *p)
5439 {
5440         struct io_wq_data data;
5441         unsigned concurrency;
5442         int ret;
5443
5444         init_waitqueue_head(&ctx->sqo_wait);
5445         mmgrab(current->mm);
5446         ctx->sqo_mm = current->mm;
5447
5448         if (ctx->flags & IORING_SETUP_SQPOLL) {
5449                 ret = -EPERM;
5450                 if (!capable(CAP_SYS_ADMIN))
5451                         goto err;
5452
5453                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
5454                 if (!ctx->sq_thread_idle)
5455                         ctx->sq_thread_idle = HZ;
5456
5457                 if (p->flags & IORING_SETUP_SQ_AFF) {
5458                         int cpu = p->sq_thread_cpu;
5459
5460                         ret = -EINVAL;
5461                         if (cpu >= nr_cpu_ids)
5462                                 goto err;
5463                         if (!cpu_online(cpu))
5464                                 goto err;
5465
5466                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
5467                                                         ctx, cpu,
5468                                                         "io_uring-sq");
5469                 } else {
5470                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
5471                                                         "io_uring-sq");
5472                 }
5473                 if (IS_ERR(ctx->sqo_thread)) {
5474                         ret = PTR_ERR(ctx->sqo_thread);
5475                         ctx->sqo_thread = NULL;
5476                         goto err;
5477                 }
5478                 wake_up_process(ctx->sqo_thread);
5479         } else if (p->flags & IORING_SETUP_SQ_AFF) {
5480                 /* Can't have SQ_AFF without SQPOLL */
5481                 ret = -EINVAL;
5482                 goto err;
5483         }
5484
5485         data.mm = ctx->sqo_mm;
5486         data.user = ctx->user;
5487         data.creds = ctx->creds;
5488         data.get_work = io_get_work;
5489         data.put_work = io_put_work;
5490
5491         /* Do QD, or 4 * CPUS, whatever is smallest */
5492         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
5493         ctx->io_wq = io_wq_create(concurrency, &data);
5494         if (IS_ERR(ctx->io_wq)) {
5495                 ret = PTR_ERR(ctx->io_wq);
5496                 ctx->io_wq = NULL;
5497                 goto err;
5498         }
5499
5500         return 0;
5501 err:
5502         io_finish_async(ctx);
5503         mmdrop(ctx->sqo_mm);
5504         ctx->sqo_mm = NULL;
5505         return ret;
5506 }
5507
5508 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
5509 {
5510         atomic_long_sub(nr_pages, &user->locked_vm);
5511 }
5512
5513 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
5514 {
5515         unsigned long page_limit, cur_pages, new_pages;
5516
5517         /* Don't allow more pages than we can safely lock */
5518         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
5519
5520         do {
5521                 cur_pages = atomic_long_read(&user->locked_vm);
5522                 new_pages = cur_pages + nr_pages;
5523                 if (new_pages > page_limit)
5524                         return -ENOMEM;
5525         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
5526                                         new_pages) != cur_pages);
5527
5528         return 0;
5529 }
5530
5531 static void io_mem_free(void *ptr)
5532 {
5533         struct page *page;
5534
5535         if (!ptr)
5536                 return;
5537
5538         page = virt_to_head_page(ptr);
5539         if (put_page_testzero(page))
5540                 free_compound_page(page);
5541 }
5542
5543 static void *io_mem_alloc(size_t size)
5544 {
5545         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
5546                                 __GFP_NORETRY;
5547
5548         return (void *) __get_free_pages(gfp_flags, get_order(size));
5549 }
5550
5551 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
5552                                 size_t *sq_offset)
5553 {
5554         struct io_rings *rings;
5555         size_t off, sq_array_size;
5556
5557         off = struct_size(rings, cqes, cq_entries);
5558         if (off == SIZE_MAX)
5559                 return SIZE_MAX;
5560
5561 #ifdef CONFIG_SMP
5562         off = ALIGN(off, SMP_CACHE_BYTES);
5563         if (off == 0)
5564                 return SIZE_MAX;
5565 #endif
5566
5567         sq_array_size = array_size(sizeof(u32), sq_entries);
5568         if (sq_array_size == SIZE_MAX)
5569                 return SIZE_MAX;
5570
5571         if (check_add_overflow(off, sq_array_size, &off))
5572                 return SIZE_MAX;
5573
5574         if (sq_offset)
5575                 *sq_offset = off;
5576
5577         return off;
5578 }
5579
5580 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
5581 {
5582         size_t pages;
5583
5584         pages = (size_t)1 << get_order(
5585                 rings_size(sq_entries, cq_entries, NULL));
5586         pages += (size_t)1 << get_order(
5587                 array_size(sizeof(struct io_uring_sqe), sq_entries));
5588
5589         return pages;
5590 }
5591
5592 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
5593 {
5594         int i, j;
5595
5596         if (!ctx->user_bufs)
5597                 return -ENXIO;
5598
5599         for (i = 0; i < ctx->nr_user_bufs; i++) {
5600                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
5601
5602                 for (j = 0; j < imu->nr_bvecs; j++)
5603                         put_user_page(imu->bvec[j].bv_page);
5604
5605                 if (ctx->account_mem)
5606                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
5607                 kvfree(imu->bvec);
5608                 imu->nr_bvecs = 0;
5609         }
5610
5611         kfree(ctx->user_bufs);
5612         ctx->user_bufs = NULL;
5613         ctx->nr_user_bufs = 0;
5614         return 0;
5615 }
5616
5617 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
5618                        void __user *arg, unsigned index)
5619 {
5620         struct iovec __user *src;
5621
5622 #ifdef CONFIG_COMPAT
5623         if (ctx->compat) {
5624                 struct compat_iovec __user *ciovs;
5625                 struct compat_iovec ciov;
5626
5627                 ciovs = (struct compat_iovec __user *) arg;
5628                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
5629                         return -EFAULT;
5630
5631                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
5632                 dst->iov_len = ciov.iov_len;
5633                 return 0;
5634         }
5635 #endif
5636         src = (struct iovec __user *) arg;
5637         if (copy_from_user(dst, &src[index], sizeof(*dst)))
5638                 return -EFAULT;
5639         return 0;
5640 }
5641
5642 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
5643                                   unsigned nr_args)
5644 {
5645         struct vm_area_struct **vmas = NULL;
5646         struct page **pages = NULL;
5647         int i, j, got_pages = 0;
5648         int ret = -EINVAL;
5649
5650         if (ctx->user_bufs)
5651                 return -EBUSY;
5652         if (!nr_args || nr_args > UIO_MAXIOV)
5653                 return -EINVAL;
5654
5655         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
5656                                         GFP_KERNEL);
5657         if (!ctx->user_bufs)
5658                 return -ENOMEM;
5659
5660         for (i = 0; i < nr_args; i++) {
5661                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
5662                 unsigned long off, start, end, ubuf;
5663                 int pret, nr_pages;
5664                 struct iovec iov;
5665                 size_t size;
5666
5667                 ret = io_copy_iov(ctx, &iov, arg, i);
5668                 if (ret)
5669                         goto err;
5670
5671                 /*
5672                  * Don't impose further limits on the size and buffer
5673                  * constraints here, we'll -EINVAL later when IO is
5674                  * submitted if they are wrong.
5675                  */
5676                 ret = -EFAULT;
5677                 if (!iov.iov_base || !iov.iov_len)
5678                         goto err;
5679
5680                 /* arbitrary limit, but we need something */
5681                 if (iov.iov_len > SZ_1G)
5682                         goto err;
5683
5684                 ubuf = (unsigned long) iov.iov_base;
5685                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
5686                 start = ubuf >> PAGE_SHIFT;
5687                 nr_pages = end - start;
5688
5689                 if (ctx->account_mem) {
5690                         ret = io_account_mem(ctx->user, nr_pages);
5691                         if (ret)
5692                                 goto err;
5693                 }
5694
5695                 ret = 0;
5696                 if (!pages || nr_pages > got_pages) {
5697                         kfree(vmas);
5698                         kfree(pages);
5699                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
5700                                                 GFP_KERNEL);
5701                         vmas = kvmalloc_array(nr_pages,
5702                                         sizeof(struct vm_area_struct *),
5703                                         GFP_KERNEL);
5704                         if (!pages || !vmas) {
5705                                 ret = -ENOMEM;
5706                                 if (ctx->account_mem)
5707                                         io_unaccount_mem(ctx->user, nr_pages);
5708                                 goto err;
5709                         }
5710                         got_pages = nr_pages;
5711                 }
5712
5713                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
5714                                                 GFP_KERNEL);
5715                 ret = -ENOMEM;
5716                 if (!imu->bvec) {
5717                         if (ctx->account_mem)
5718                                 io_unaccount_mem(ctx->user, nr_pages);
5719                         goto err;
5720                 }
5721
5722                 ret = 0;
5723                 down_read(&current->mm->mmap_sem);
5724                 pret = get_user_pages(ubuf, nr_pages,
5725                                       FOLL_WRITE | FOLL_LONGTERM,
5726                                       pages, vmas);
5727                 if (pret == nr_pages) {
5728                         /* don't support file backed memory */
5729                         for (j = 0; j < nr_pages; j++) {
5730                                 struct vm_area_struct *vma = vmas[j];
5731
5732                                 if (vma->vm_file &&
5733                                     !is_file_hugepages(vma->vm_file)) {
5734                                         ret = -EOPNOTSUPP;
5735                                         break;
5736                                 }
5737                         }
5738                 } else {
5739                         ret = pret < 0 ? pret : -EFAULT;
5740                 }
5741                 up_read(&current->mm->mmap_sem);
5742                 if (ret) {
5743                         /*
5744                          * if we did partial map, or found file backed vmas,
5745                          * release any pages we did get
5746                          */
5747                         if (pret > 0)
5748                                 put_user_pages(pages, pret);
5749                         if (ctx->account_mem)
5750                                 io_unaccount_mem(ctx->user, nr_pages);
5751                         kvfree(imu->bvec);
5752                         goto err;
5753                 }
5754
5755                 off = ubuf & ~PAGE_MASK;
5756                 size = iov.iov_len;
5757                 for (j = 0; j < nr_pages; j++) {
5758                         size_t vec_len;
5759
5760                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
5761                         imu->bvec[j].bv_page = pages[j];
5762                         imu->bvec[j].bv_len = vec_len;
5763                         imu->bvec[j].bv_offset = off;
5764                         off = 0;
5765                         size -= vec_len;
5766                 }
5767                 /* store original address for later verification */
5768                 imu->ubuf = ubuf;
5769                 imu->len = iov.iov_len;
5770                 imu->nr_bvecs = nr_pages;
5771
5772                 ctx->nr_user_bufs++;
5773         }
5774         kvfree(pages);
5775         kvfree(vmas);
5776         return 0;
5777 err:
5778         kvfree(pages);
5779         kvfree(vmas);
5780         io_sqe_buffer_unregister(ctx);
5781         return ret;
5782 }
5783
5784 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
5785 {
5786         __s32 __user *fds = arg;
5787         int fd;
5788
5789         if (ctx->cq_ev_fd)
5790                 return -EBUSY;
5791
5792         if (copy_from_user(&fd, fds, sizeof(*fds)))
5793                 return -EFAULT;
5794
5795         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
5796         if (IS_ERR(ctx->cq_ev_fd)) {
5797                 int ret = PTR_ERR(ctx->cq_ev_fd);
5798                 ctx->cq_ev_fd = NULL;
5799                 return ret;
5800         }
5801
5802         return 0;
5803 }
5804
5805 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
5806 {
5807         if (ctx->cq_ev_fd) {
5808                 eventfd_ctx_put(ctx->cq_ev_fd);
5809                 ctx->cq_ev_fd = NULL;
5810                 return 0;
5811         }
5812
5813         return -ENXIO;
5814 }
5815
5816 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
5817 {
5818         io_finish_async(ctx);
5819         if (ctx->sqo_mm)
5820                 mmdrop(ctx->sqo_mm);
5821
5822         io_iopoll_reap_events(ctx);
5823         io_sqe_buffer_unregister(ctx);
5824         io_sqe_files_unregister(ctx);
5825         io_eventfd_unregister(ctx);
5826
5827 #if defined(CONFIG_UNIX)
5828         if (ctx->ring_sock) {
5829                 ctx->ring_sock->file = NULL; /* so that iput() is called */
5830                 sock_release(ctx->ring_sock);
5831         }
5832 #endif
5833
5834         io_mem_free(ctx->rings);
5835         io_mem_free(ctx->sq_sqes);
5836
5837         percpu_ref_exit(&ctx->refs);
5838         if (ctx->account_mem)
5839                 io_unaccount_mem(ctx->user,
5840                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
5841         free_uid(ctx->user);
5842         put_cred(ctx->creds);
5843         kfree(ctx->completions);
5844         kfree(ctx->cancel_hash);
5845         kmem_cache_free(req_cachep, ctx->fallback_req);
5846         kfree(ctx);
5847 }
5848
5849 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
5850 {
5851         struct io_ring_ctx *ctx = file->private_data;
5852         __poll_t mask = 0;
5853
5854         poll_wait(file, &ctx->cq_wait, wait);
5855         /*
5856          * synchronizes with barrier from wq_has_sleeper call in
5857          * io_commit_cqring
5858          */
5859         smp_rmb();
5860         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
5861             ctx->rings->sq_ring_entries)
5862                 mask |= EPOLLOUT | EPOLLWRNORM;
5863         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
5864                 mask |= EPOLLIN | EPOLLRDNORM;
5865
5866         return mask;
5867 }
5868
5869 static int io_uring_fasync(int fd, struct file *file, int on)
5870 {
5871         struct io_ring_ctx *ctx = file->private_data;
5872
5873         return fasync_helper(fd, file, on, &ctx->cq_fasync);
5874 }
5875
5876 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
5877 {
5878         mutex_lock(&ctx->uring_lock);
5879         percpu_ref_kill(&ctx->refs);
5880         mutex_unlock(&ctx->uring_lock);
5881
5882         io_kill_timeouts(ctx);
5883         io_poll_remove_all(ctx);
5884
5885         if (ctx->io_wq)
5886                 io_wq_cancel_all(ctx->io_wq);
5887
5888         io_iopoll_reap_events(ctx);
5889         /* if we failed setting up the ctx, we might not have any rings */
5890         if (ctx->rings)
5891                 io_cqring_overflow_flush(ctx, true);
5892         wait_for_completion(&ctx->completions[0]);
5893         io_ring_ctx_free(ctx);
5894 }
5895
5896 static int io_uring_release(struct inode *inode, struct file *file)
5897 {
5898         struct io_ring_ctx *ctx = file->private_data;
5899
5900         file->private_data = NULL;
5901         io_ring_ctx_wait_and_kill(ctx);
5902         return 0;
5903 }
5904
5905 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
5906                                   struct files_struct *files)
5907 {
5908         struct io_kiocb *req;
5909         DEFINE_WAIT(wait);
5910
5911         while (!list_empty_careful(&ctx->inflight_list)) {
5912                 struct io_kiocb *cancel_req = NULL;
5913
5914                 spin_lock_irq(&ctx->inflight_lock);
5915                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
5916                         if (req->work.files != files)
5917                                 continue;
5918                         /* req is being completed, ignore */
5919                         if (!refcount_inc_not_zero(&req->refs))
5920                                 continue;
5921                         cancel_req = req;
5922                         break;
5923                 }
5924                 if (cancel_req)
5925                         prepare_to_wait(&ctx->inflight_wait, &wait,
5926                                                 TASK_UNINTERRUPTIBLE);
5927                 spin_unlock_irq(&ctx->inflight_lock);
5928
5929                 /* We need to keep going until we don't find a matching req */
5930                 if (!cancel_req)
5931                         break;
5932
5933                 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
5934                 io_put_req(cancel_req);
5935                 schedule();
5936         }
5937         finish_wait(&ctx->inflight_wait, &wait);
5938 }
5939
5940 static int io_uring_flush(struct file *file, void *data)
5941 {
5942         struct io_ring_ctx *ctx = file->private_data;
5943
5944         io_uring_cancel_files(ctx, data);
5945         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
5946                 io_cqring_overflow_flush(ctx, true);
5947                 io_wq_cancel_all(ctx->io_wq);
5948         }
5949         return 0;
5950 }
5951
5952 static void *io_uring_validate_mmap_request(struct file *file,
5953                                             loff_t pgoff, size_t sz)
5954 {
5955         struct io_ring_ctx *ctx = file->private_data;
5956         loff_t offset = pgoff << PAGE_SHIFT;
5957         struct page *page;
5958         void *ptr;
5959
5960         switch (offset) {
5961         case IORING_OFF_SQ_RING:
5962         case IORING_OFF_CQ_RING:
5963                 ptr = ctx->rings;
5964                 break;
5965         case IORING_OFF_SQES:
5966                 ptr = ctx->sq_sqes;
5967                 break;
5968         default:
5969                 return ERR_PTR(-EINVAL);
5970         }
5971
5972         page = virt_to_head_page(ptr);
5973         if (sz > page_size(page))
5974                 return ERR_PTR(-EINVAL);
5975
5976         return ptr;
5977 }
5978
5979 #ifdef CONFIG_MMU
5980
5981 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5982 {
5983         size_t sz = vma->vm_end - vma->vm_start;
5984         unsigned long pfn;
5985         void *ptr;
5986
5987         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
5988         if (IS_ERR(ptr))
5989                 return PTR_ERR(ptr);
5990
5991         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
5992         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
5993 }
5994
5995 #else /* !CONFIG_MMU */
5996
5997 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5998 {
5999         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6000 }
6001
6002 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6003 {
6004         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6005 }
6006
6007 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6008         unsigned long addr, unsigned long len,
6009         unsigned long pgoff, unsigned long flags)
6010 {
6011         void *ptr;
6012
6013         ptr = io_uring_validate_mmap_request(file, pgoff, len);
6014         if (IS_ERR(ptr))
6015                 return PTR_ERR(ptr);
6016
6017         return (unsigned long) ptr;
6018 }
6019
6020 #endif /* !CONFIG_MMU */
6021
6022 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6023                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6024                 size_t, sigsz)
6025 {
6026         struct io_ring_ctx *ctx;
6027         long ret = -EBADF;
6028         int submitted = 0;
6029         struct fd f;
6030
6031         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
6032                 return -EINVAL;
6033
6034         f = fdget(fd);
6035         if (!f.file)
6036                 return -EBADF;
6037
6038         ret = -EOPNOTSUPP;
6039         if (f.file->f_op != &io_uring_fops)
6040                 goto out_fput;
6041
6042         ret = -ENXIO;
6043         ctx = f.file->private_data;
6044         if (!percpu_ref_tryget(&ctx->refs))
6045                 goto out_fput;
6046
6047         /*
6048          * For SQ polling, the thread will do all submissions and completions.
6049          * Just return the requested submit count, and wake the thread if
6050          * we were asked to.
6051          */
6052         ret = 0;
6053         if (ctx->flags & IORING_SETUP_SQPOLL) {
6054                 if (!list_empty_careful(&ctx->cq_overflow_list))
6055                         io_cqring_overflow_flush(ctx, false);
6056                 if (flags & IORING_ENTER_SQ_WAKEUP)
6057                         wake_up(&ctx->sqo_wait);
6058                 submitted = to_submit;
6059         } else if (to_submit) {
6060                 struct mm_struct *cur_mm;
6061
6062                 if (current->mm != ctx->sqo_mm ||
6063                     current_cred() != ctx->creds) {
6064                         ret = -EPERM;
6065                         goto out;
6066                 }
6067
6068                 to_submit = min(to_submit, ctx->sq_entries);
6069                 mutex_lock(&ctx->uring_lock);
6070                 /* already have mm, so io_submit_sqes() won't try to grab it */
6071                 cur_mm = ctx->sqo_mm;
6072                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
6073                                            &cur_mm, false);
6074                 mutex_unlock(&ctx->uring_lock);
6075
6076                 if (submitted != to_submit)
6077                         goto out;
6078         }
6079         if (flags & IORING_ENTER_GETEVENTS) {
6080                 unsigned nr_events = 0;
6081
6082                 min_complete = min(min_complete, ctx->cq_entries);
6083
6084                 if (ctx->flags & IORING_SETUP_IOPOLL) {
6085                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
6086                 } else {
6087                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
6088                 }
6089         }
6090
6091 out:
6092         percpu_ref_put(&ctx->refs);
6093 out_fput:
6094         fdput(f);
6095         return submitted ? submitted : ret;
6096 }
6097
6098 static const struct file_operations io_uring_fops = {
6099         .release        = io_uring_release,
6100         .flush          = io_uring_flush,
6101         .mmap           = io_uring_mmap,
6102 #ifndef CONFIG_MMU
6103         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
6104         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
6105 #endif
6106         .poll           = io_uring_poll,
6107         .fasync         = io_uring_fasync,
6108 };
6109
6110 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
6111                                   struct io_uring_params *p)
6112 {
6113         struct io_rings *rings;
6114         size_t size, sq_array_offset;
6115
6116         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
6117         if (size == SIZE_MAX)
6118                 return -EOVERFLOW;
6119
6120         rings = io_mem_alloc(size);
6121         if (!rings)
6122                 return -ENOMEM;
6123
6124         ctx->rings = rings;
6125         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
6126         rings->sq_ring_mask = p->sq_entries - 1;
6127         rings->cq_ring_mask = p->cq_entries - 1;
6128         rings->sq_ring_entries = p->sq_entries;
6129         rings->cq_ring_entries = p->cq_entries;
6130         ctx->sq_mask = rings->sq_ring_mask;
6131         ctx->cq_mask = rings->cq_ring_mask;
6132         ctx->sq_entries = rings->sq_ring_entries;
6133         ctx->cq_entries = rings->cq_ring_entries;
6134
6135         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
6136         if (size == SIZE_MAX) {
6137                 io_mem_free(ctx->rings);
6138                 ctx->rings = NULL;
6139                 return -EOVERFLOW;
6140         }
6141
6142         ctx->sq_sqes = io_mem_alloc(size);
6143         if (!ctx->sq_sqes) {
6144                 io_mem_free(ctx->rings);
6145                 ctx->rings = NULL;
6146                 return -ENOMEM;
6147         }
6148
6149         return 0;
6150 }
6151
6152 /*
6153  * Allocate an anonymous fd, this is what constitutes the application
6154  * visible backing of an io_uring instance. The application mmaps this
6155  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
6156  * we have to tie this fd to a socket for file garbage collection purposes.
6157  */
6158 static int io_uring_get_fd(struct io_ring_ctx *ctx)
6159 {
6160         struct file *file;
6161         int ret;
6162
6163 #if defined(CONFIG_UNIX)
6164         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
6165                                 &ctx->ring_sock);
6166         if (ret)
6167                 return ret;
6168 #endif
6169
6170         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
6171         if (ret < 0)
6172                 goto err;
6173
6174         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
6175                                         O_RDWR | O_CLOEXEC);
6176         if (IS_ERR(file)) {
6177                 put_unused_fd(ret);
6178                 ret = PTR_ERR(file);
6179                 goto err;
6180         }
6181
6182 #if defined(CONFIG_UNIX)
6183         ctx->ring_sock->file = file;
6184 #endif
6185         fd_install(ret, file);
6186         return ret;
6187 err:
6188 #if defined(CONFIG_UNIX)
6189         sock_release(ctx->ring_sock);
6190         ctx->ring_sock = NULL;
6191 #endif
6192         return ret;
6193 }
6194
6195 static int io_uring_create(unsigned entries, struct io_uring_params *p)
6196 {
6197         struct user_struct *user = NULL;
6198         struct io_ring_ctx *ctx;
6199         bool account_mem;
6200         int ret;
6201
6202         if (!entries || entries > IORING_MAX_ENTRIES)
6203                 return -EINVAL;
6204
6205         /*
6206          * Use twice as many entries for the CQ ring. It's possible for the
6207          * application to drive a higher depth than the size of the SQ ring,
6208          * since the sqes are only used at submission time. This allows for
6209          * some flexibility in overcommitting a bit. If the application has
6210          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
6211          * of CQ ring entries manually.
6212          */
6213         p->sq_entries = roundup_pow_of_two(entries);
6214         if (p->flags & IORING_SETUP_CQSIZE) {
6215                 /*
6216                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
6217                  * to a power-of-two, if it isn't already. We do NOT impose
6218                  * any cq vs sq ring sizing.
6219                  */
6220                 if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
6221                         return -EINVAL;
6222                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
6223         } else {
6224                 p->cq_entries = 2 * p->sq_entries;
6225         }
6226
6227         user = get_uid(current_user());
6228         account_mem = !capable(CAP_IPC_LOCK);
6229
6230         if (account_mem) {
6231                 ret = io_account_mem(user,
6232                                 ring_pages(p->sq_entries, p->cq_entries));
6233                 if (ret) {
6234                         free_uid(user);
6235                         return ret;
6236                 }
6237         }
6238
6239         ctx = io_ring_ctx_alloc(p);
6240         if (!ctx) {
6241                 if (account_mem)
6242                         io_unaccount_mem(user, ring_pages(p->sq_entries,
6243                                                                 p->cq_entries));
6244                 free_uid(user);
6245                 return -ENOMEM;
6246         }
6247         ctx->compat = in_compat_syscall();
6248         ctx->account_mem = account_mem;
6249         ctx->user = user;
6250         ctx->creds = get_current_cred();
6251
6252         ret = io_allocate_scq_urings(ctx, p);
6253         if (ret)
6254                 goto err;
6255
6256         ret = io_sq_offload_start(ctx, p);
6257         if (ret)
6258                 goto err;
6259
6260         memset(&p->sq_off, 0, sizeof(p->sq_off));
6261         p->sq_off.head = offsetof(struct io_rings, sq.head);
6262         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
6263         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
6264         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
6265         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
6266         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
6267         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
6268
6269         memset(&p->cq_off, 0, sizeof(p->cq_off));
6270         p->cq_off.head = offsetof(struct io_rings, cq.head);
6271         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
6272         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
6273         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
6274         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
6275         p->cq_off.cqes = offsetof(struct io_rings, cqes);
6276
6277         /*
6278          * Install ring fd as the very last thing, so we don't risk someone
6279          * having closed it before we finish setup
6280          */
6281         ret = io_uring_get_fd(ctx);
6282         if (ret < 0)
6283                 goto err;
6284
6285         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
6286                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
6287         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
6288         return ret;
6289 err:
6290         io_ring_ctx_wait_and_kill(ctx);
6291         return ret;
6292 }
6293
6294 /*
6295  * Sets up an aio uring context, and returns the fd. Applications asks for a
6296  * ring size, we return the actual sq/cq ring sizes (among other things) in the
6297  * params structure passed in.
6298  */
6299 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
6300 {
6301         struct io_uring_params p;
6302         long ret;
6303         int i;
6304
6305         if (copy_from_user(&p, params, sizeof(p)))
6306                 return -EFAULT;
6307         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
6308                 if (p.resv[i])
6309                         return -EINVAL;
6310         }
6311
6312         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
6313                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
6314                 return -EINVAL;
6315
6316         ret = io_uring_create(entries, &p);
6317         if (ret < 0)
6318                 return ret;
6319
6320         if (copy_to_user(params, &p, sizeof(p)))
6321                 return -EFAULT;
6322
6323         return ret;
6324 }
6325
6326 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
6327                 struct io_uring_params __user *, params)
6328 {
6329         return io_uring_setup(entries, params);
6330 }
6331
6332 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
6333                                void __user *arg, unsigned nr_args)
6334         __releases(ctx->uring_lock)
6335         __acquires(ctx->uring_lock)
6336 {
6337         int ret;
6338
6339         /*
6340          * We're inside the ring mutex, if the ref is already dying, then
6341          * someone else killed the ctx or is already going through
6342          * io_uring_register().
6343          */
6344         if (percpu_ref_is_dying(&ctx->refs))
6345                 return -ENXIO;
6346
6347         if (opcode != IORING_UNREGISTER_FILES &&
6348             opcode != IORING_REGISTER_FILES_UPDATE) {
6349                 percpu_ref_kill(&ctx->refs);
6350
6351                 /*
6352                  * Drop uring mutex before waiting for references to exit. If
6353                  * another thread is currently inside io_uring_enter() it might
6354                  * need to grab the uring_lock to make progress. If we hold it
6355                  * here across the drain wait, then we can deadlock. It's safe
6356                  * to drop the mutex here, since no new references will come in
6357                  * after we've killed the percpu ref.
6358                  */
6359                 mutex_unlock(&ctx->uring_lock);
6360                 wait_for_completion(&ctx->completions[0]);
6361                 mutex_lock(&ctx->uring_lock);
6362         }
6363
6364         switch (opcode) {
6365         case IORING_REGISTER_BUFFERS:
6366                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
6367                 break;
6368         case IORING_UNREGISTER_BUFFERS:
6369                 ret = -EINVAL;
6370                 if (arg || nr_args)
6371                         break;
6372                 ret = io_sqe_buffer_unregister(ctx);
6373                 break;
6374         case IORING_REGISTER_FILES:
6375                 ret = io_sqe_files_register(ctx, arg, nr_args);
6376                 break;
6377         case IORING_UNREGISTER_FILES:
6378                 ret = -EINVAL;
6379                 if (arg || nr_args)
6380                         break;
6381                 ret = io_sqe_files_unregister(ctx);
6382                 break;
6383         case IORING_REGISTER_FILES_UPDATE:
6384                 ret = io_sqe_files_update(ctx, arg, nr_args);
6385                 break;
6386         case IORING_REGISTER_EVENTFD:
6387                 ret = -EINVAL;
6388                 if (nr_args != 1)
6389                         break;
6390                 ret = io_eventfd_register(ctx, arg);
6391                 break;
6392         case IORING_UNREGISTER_EVENTFD:
6393                 ret = -EINVAL;
6394                 if (arg || nr_args)
6395                         break;
6396                 ret = io_eventfd_unregister(ctx);
6397                 break;
6398         default:
6399                 ret = -EINVAL;
6400                 break;
6401         }
6402
6403
6404         if (opcode != IORING_UNREGISTER_FILES &&
6405             opcode != IORING_REGISTER_FILES_UPDATE) {
6406                 /* bring the ctx back to life */
6407                 reinit_completion(&ctx->completions[0]);
6408                 percpu_ref_reinit(&ctx->refs);
6409         }
6410         return ret;
6411 }
6412
6413 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
6414                 void __user *, arg, unsigned int, nr_args)
6415 {
6416         struct io_ring_ctx *ctx;
6417         long ret = -EBADF;
6418         struct fd f;
6419
6420         f = fdget(fd);
6421         if (!f.file)
6422                 return -EBADF;
6423
6424         ret = -EOPNOTSUPP;
6425         if (f.file->f_op != &io_uring_fops)
6426                 goto out_fput;
6427
6428         ctx = f.file->private_data;
6429
6430         mutex_lock(&ctx->uring_lock);
6431         ret = __io_uring_register(ctx, opcode, arg, nr_args);
6432         mutex_unlock(&ctx->uring_lock);
6433         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
6434                                                         ctx->cq_ev_fd != NULL, ret);
6435 out_fput:
6436         fdput(f);
6437         return ret;
6438 }
6439
6440 static int __init io_uring_init(void)
6441 {
6442         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
6443         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
6444         return 0;
6445 };
6446 __initcall(io_uring_init);