fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/workqueue.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/fs_struct.h>
  75
  76 #include <uapi/linux/io_uring.h>
  77
  78 #include "internal.h"
  79
  80 #define IORING_MAX_ENTRIES      32768
  81 #define IORING_MAX_FIXED_FILES  1024
  82
  83 struct io_uring {
  84         u32 head ____cacheline_aligned_in_smp;
  85         u32 tail ____cacheline_aligned_in_smp;
  86 };
  87
  88 /*
  89  * This data is shared with the application through the mmap at offsets
  90  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  91  *
  92  * The offsets to the member fields are published through struct
  93  * io_sqring_offsets when calling io_uring_setup.
  94  */
  95 struct io_rings {
  96         /*
  97          * Head and tail offsets into the ring; the offsets need to be
  98          * masked to get valid indices.
  99          *
 100          * The kernel controls head of the sq ring and the tail of the cq ring,
 101          * and the application controls tail of the sq ring and the head of the
 102          * cq ring.
 103          */
 104         struct io_uring         sq, cq;
 105         /*
 106          * Bitmasks to apply to head and tail offsets (constant, equals
 107          * ring_entries - 1)
 108          */
 109         u32                     sq_ring_mask, cq_ring_mask;
 110         /* Ring sizes (constant, power of 2) */
 111         u32                     sq_ring_entries, cq_ring_entries;
 112         /*
 113          * Number of invalid entries dropped by the kernel due to
 114          * invalid index stored in array
 115          *
 116          * Written by the kernel, shouldn't be modified by the
 117          * application (i.e. get number of "new events" by comparing to
 118          * cached value).
 119          *
 120          * After a new SQ head value was read by the application this
 121          * counter includes all submissions that were dropped reaching
 122          * the new SQ head (and possibly more).
 123          */
 124         u32                     sq_dropped;
 125         /*
 126          * Runtime flags
 127          *
 128          * Written by the kernel, shouldn't be modified by the
 129          * application.
 130          *
 131          * The application needs a full memory barrier before checking
 132          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 133          */
 134         u32                     sq_flags;
 135         /*
 136          * Number of completion events lost because the queue was full;
 137          * this should be avoided by the application by making sure
 138          * there are not more requests pending thatn there is space in
 139          * the completion queue.
 140          *
 141          * Written by the kernel, shouldn't be modified by the
 142          * application (i.e. get number of "new events" by comparing to
 143          * cached value).
 144          *
 145          * As completion events come in out of order this counter is not
 146          * ordered with any other data.
 147          */
 148         u32                     cq_overflow;
 149         /*
 150          * Ring buffer of completion events.
 151          *
 152          * The kernel writes completion events fresh every time they are
 153          * produced, so the application is allowed to modify pending
 154          * entries.
 155          */
 156         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 157 };
 158
 159 struct io_mapped_ubuf {
 160         u64             ubuf;
 161         size_t          len;
 162         struct          bio_vec *bvec;
 163         unsigned int    nr_bvecs;
 164 };
 165
 166 struct async_list {
 167         spinlock_t              lock;
 168         atomic_t                cnt;
 169         struct list_head        list;
 170
 171         struct file             *file;
 172         off_t                   io_start;
 173         size_t                  io_len;
 174 };
 175
 176 struct io_ring_ctx {
 177         struct {
 178                 struct percpu_ref       refs;
 179         } ____cacheline_aligned_in_smp;
 180
 181         struct {
 182                 unsigned int            flags;
 183                 bool                    compat;
 184                 bool                    account_mem;
 185
 186                 /*
 187                  * Ring buffer of indices into array of io_uring_sqe, which is
 188                  * mmapped by the application using the IORING_OFF_SQES offset.
 189                  *
 190                  * This indirection could e.g. be used to assign fixed
 191                  * io_uring_sqe entries to operations and only submit them to
 192                  * the queue when needed.
 193                  *
 194                  * The kernel modifies neither the indices array nor the entries
 195                  * array.
 196                  */
 197                 u32                     *sq_array;
 198                 unsigned                cached_sq_head;
 199                 unsigned                sq_entries;
 200                 unsigned                sq_mask;
 201                 unsigned                sq_thread_idle;
 202                 unsigned                cached_sq_dropped;
 203                 struct io_uring_sqe     *sq_sqes;
 204
 205                 struct list_head        defer_list;
 206                 struct list_head        timeout_list;
 207         } ____cacheline_aligned_in_smp;
 208
 209         /* IO offload */
 210         struct workqueue_struct *sqo_wq[2];
 211         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 212         struct mm_struct        *sqo_mm;
 213         wait_queue_head_t       sqo_wait;
 214         struct completion       sqo_thread_started;
 215
 216         struct {
 217                 unsigned                cached_cq_tail;
 218                 atomic_t                cached_cq_overflow;
 219                 unsigned                cq_entries;
 220                 unsigned                cq_mask;
 221                 struct wait_queue_head  cq_wait;
 222                 struct fasync_struct    *cq_fasync;
 223                 struct eventfd_ctx      *cq_ev_fd;
 224                 atomic_t                cq_timeouts;
 225         } ____cacheline_aligned_in_smp;
 226
 227         struct io_rings *rings;
 228
 229         /*
 230          * If used, fixed file set. Writers must ensure that ->refs is dead,
 231          * readers must ensure that ->refs is alive as long as the file* is
 232          * used. Only updated through io_uring_register(2).
 233          */
 234         struct file             **user_files;
 235         unsigned                nr_user_files;
 236
 237         /* if used, fixed mapped user buffers */
 238         unsigned                nr_user_bufs;
 239         struct io_mapped_ubuf   *user_bufs;
 240
 241         struct user_struct      *user;
 242
 243         const struct cred       *creds;
 244
 245         struct completion       ctx_done;
 246
 247         struct {
 248                 struct mutex            uring_lock;
 249                 wait_queue_head_t       wait;
 250         } ____cacheline_aligned_in_smp;
 251
 252         struct {
 253                 spinlock_t              completion_lock;
 254                 bool                    poll_multi_file;
 255                 /*
 256                  * ->poll_list is protected by the ctx->uring_lock for
 257                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 258                  * For SQPOLL, only the single threaded io_sq_thread() will
 259                  * manipulate the list, hence no extra locking is needed there.
 260                  */
 261                 struct list_head        poll_list;
 262                 struct list_head        cancel_list;
 263         } ____cacheline_aligned_in_smp;
 264
 265         struct async_list       pending_async[2];
 266
 267 #if defined(CONFIG_UNIX)
 268         struct socket           *ring_sock;
 269 #endif
 270
 271         struct list_head        task_list;
 272         spinlock_t              task_lock;
 273 };
 274
 275 struct sqe_submit {
 276         const struct io_uring_sqe       *sqe;
 277         unsigned short                  index;
 278         u32                             sequence;
 279         bool                            has_user;
 280         bool                            needs_lock;
 281         bool                            needs_fixed_file;
 282         u8                              opcode;
 283 };
 284
 285 /*
 286  * First field must be the file pointer in all the
 287  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 288  */
 289 struct io_poll_iocb {
 290         struct file                     *file;
 291         struct wait_queue_head          *head;
 292         __poll_t                        events;
 293         bool                            done;
 294         bool                            canceled;
 295         struct wait_queue_entry         wait;
 296 };
 297
 298 struct io_timeout {
 299         struct file                     *file;
 300         struct hrtimer                  timer;
 301 };
 302
 303 /*
 304  * NOTE! Each of the iocb union members has the file pointer
 305  * as the first entry in their struct definition. So you can
 306  * access the file pointer through any of the sub-structs,
 307  * or directly as just 'ki_filp' in this struct.
 308  */
 309 struct io_kiocb {
 310         union {
 311                 struct file             *file;
 312                 struct kiocb            rw;
 313                 struct io_poll_iocb     poll;
 314                 struct io_timeout       timeout;
 315         };
 316
 317         struct sqe_submit       submit;
 318
 319         struct io_ring_ctx      *ctx;
 320         struct list_head        list;
 321         struct list_head        link_list;
 322         unsigned int            flags;
 323         refcount_t              refs;
 324 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 325 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 326 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 327 #define REQ_F_SEQ_PREV          8       /* sequential with previous */
 328 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 329 #define REQ_F_IO_DRAINED        32      /* drain done */
 330 #define REQ_F_LINK              64      /* linked sqes */
 331 #define REQ_F_LINK_DONE         128     /* linked sqes done */
 332 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 333 #define REQ_F_SHADOW_DRAIN      512     /* link-drain shadow req */
 334 #define REQ_F_TIMEOUT           1024    /* timeout request */
 335 #define REQ_F_ISREG             2048    /* regular file */
 336 #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
 337 #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
 338 #define REQ_F_CANCEL            16384   /* cancel request */
 339         unsigned long           fsize;
 340         u64                     user_data;
 341         u32                     result;
 342         u32                     sequence;
 343         struct files_struct     *files;
 344
 345         struct fs_struct        *fs;
 346
 347         struct work_struct      work;
 348         struct task_struct      *work_task;
 349         struct list_head        task_list;
 350 };
 351
 352 #define IO_PLUG_THRESHOLD               2
 353 #define IO_IOPOLL_BATCH                 8
 354
 355 struct io_submit_state {
 356         struct blk_plug         plug;
 357
 358         /*
 359          * io_kiocb alloc cache
 360          */
 361         void                    *reqs[IO_IOPOLL_BATCH];
 362         unsigned                int free_reqs;
 363         unsigned                int cur_req;
 364
 365         /*
 366          * File reference cache
 367          */
 368         struct file             *file;
 369         unsigned int            fd;
 370         unsigned int            has_refs;
 371         unsigned int            used_refs;
 372         unsigned int            ios_left;
 373 };
 374
 375 static void io_sq_wq_submit_work(struct work_struct *work);
 376 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 377                                  long res);
 378 static void __io_free_req(struct io_kiocb *req);
 379
 380 static struct kmem_cache *req_cachep;
 381
 382 static const struct file_operations io_uring_fops;
 383
 384 struct sock *io_uring_get_socket(struct file *file)
 385 {
 386 #if defined(CONFIG_UNIX)
 387         if (file->f_op == &io_uring_fops) {
 388                 struct io_ring_ctx *ctx = file->private_data;
 389
 390                 return ctx->ring_sock->sk;
 391         }
 392 #endif
 393         return NULL;
 394 }
 395 EXPORT_SYMBOL(io_uring_get_socket);
 396
 397 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 398 {
 399         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 400
 401         complete(&ctx->ctx_done);
 402 }
 403
 404 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 405 {
 406         struct io_ring_ctx *ctx;
 407         int i;
 408
 409         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 410         if (!ctx)
 411                 return NULL;
 412
 413         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 414                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
 415                 kfree(ctx);
 416                 return NULL;
 417         }
 418
 419         ctx->flags = p->flags;
 420         init_waitqueue_head(&ctx->sqo_wait);
 421         init_waitqueue_head(&ctx->cq_wait);
 422         init_completion(&ctx->ctx_done);
 423         init_completion(&ctx->sqo_thread_started);
 424         mutex_init(&ctx->uring_lock);
 425         init_waitqueue_head(&ctx->wait);
 426         for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
 427                 spin_lock_init(&ctx->pending_async[i].lock);
 428                 INIT_LIST_HEAD(&ctx->pending_async[i].list);
 429                 atomic_set(&ctx->pending_async[i].cnt, 0);
 430         }
 431         spin_lock_init(&ctx->completion_lock);
 432         INIT_LIST_HEAD(&ctx->poll_list);
 433         INIT_LIST_HEAD(&ctx->cancel_list);
 434         INIT_LIST_HEAD(&ctx->defer_list);
 435         INIT_LIST_HEAD(&ctx->timeout_list);
 436         INIT_LIST_HEAD(&ctx->task_list);
 437         spin_lock_init(&ctx->task_lock);
 438         return ctx;
 439 }
 440
 441 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
 442                                        struct io_kiocb *req)
 443 {
 444         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 445                                         + atomic_read(&ctx->cached_cq_overflow);
 446 }
 447
 448 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 449                                      struct io_kiocb *req)
 450 {
 451         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 452                 return false;
 453
 454         return __io_sequence_defer(ctx, req);
 455 }
 456
 457 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 458 {
 459         struct io_kiocb *req;
 460
 461         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 462         if (req && !io_sequence_defer(ctx, req)) {
 463                 list_del_init(&req->list);
 464                 return req;
 465         }
 466
 467         return NULL;
 468 }
 469
 470 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 471 {
 472         struct io_kiocb *req;
 473
 474         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 475         if (req) {
 476                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 477                         return NULL;
 478                 if (!__io_sequence_defer(ctx, req)) {
 479                         list_del_init(&req->list);
 480                         return req;
 481                 }
 482         }
 483
 484         return NULL;
 485 }
 486
 487 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 488 {
 489         struct io_rings *rings = ctx->rings;
 490
 491         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 492                 /* order cqe stores with ring update */
 493                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 494
 495                 if (wq_has_sleeper(&ctx->cq_wait)) {
 496                         wake_up_interruptible(&ctx->cq_wait);
 497                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 498                 }
 499         }
 500 }
 501
 502 static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 503                                        struct io_kiocb *req)
 504 {
 505         unsigned long flags;
 506         int rw = 0;
 507
 508         if (req->submit.sqe) {
 509                 switch (req->submit.opcode) {
 510                 case IORING_OP_WRITEV:
 511                 case IORING_OP_WRITE_FIXED:
 512                         rw = !(req->rw.ki_flags & IOCB_DIRECT);
 513                         break;
 514                 }
 515         }
 516
 517         if (req->work.func == io_sq_wq_submit_work) {
 518                 req->files = current->files;
 519
 520                 spin_lock_irqsave(&ctx->task_lock, flags);
 521                 list_add(&req->task_list, &ctx->task_list);
 522                 req->work_task = NULL;
 523                 spin_unlock_irqrestore(&ctx->task_lock, flags);
 524         }
 525
 526         queue_work(ctx->sqo_wq[rw], &req->work);
 527 }
 528
 529 static void io_kill_timeout(struct io_kiocb *req)
 530 {
 531         int ret;
 532
 533         ret = hrtimer_try_to_cancel(&req->timeout.timer);
 534         if (ret != -1) {
 535                 atomic_inc(&req->ctx->cq_timeouts);
 536                 list_del(&req->list);
 537                 io_cqring_fill_event(req->ctx, req->user_data, 0);
 538                 __io_free_req(req);
 539         }
 540 }
 541
 542 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 543 {
 544         struct io_kiocb *req, *tmp;
 545
 546         spin_lock_irq(&ctx->completion_lock);
 547         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 548                 io_kill_timeout(req);
 549         spin_unlock_irq(&ctx->completion_lock);
 550 }
 551
 552 static void io_commit_cqring(struct io_ring_ctx *ctx)
 553 {
 554         struct io_kiocb *req;
 555
 556         while ((req = io_get_timeout_req(ctx)) != NULL)
 557                 io_kill_timeout(req);
 558
 559         __io_commit_cqring(ctx);
 560
 561         while ((req = io_get_deferred_req(ctx)) != NULL) {
 562                 if (req->flags & REQ_F_SHADOW_DRAIN) {
 563                         /* Just for drain, free it. */
 564                         __io_free_req(req);
 565                         continue;
 566                 }
 567                 req->flags |= REQ_F_IO_DRAINED;
 568                 io_queue_async_work(ctx, req);
 569         }
 570 }
 571
 572 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 573 {
 574         struct io_rings *rings = ctx->rings;
 575         unsigned tail;
 576
 577         tail = ctx->cached_cq_tail;
 578         /*
 579          * writes to the cq entry need to come after reading head; the
 580          * control dependency is enough as we're using WRITE_ONCE to
 581          * fill the cq entry
 582          */
 583         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 584                 return NULL;
 585
 586         ctx->cached_cq_tail++;
 587         return &rings->cqes[tail & ctx->cq_mask];
 588 }
 589
 590 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 591                                  long res)
 592 {
 593         struct io_uring_cqe *cqe;
 594
 595         /*
 596          * If we can't get a cq entry, userspace overflowed the
 597          * submission (by quite a lot). Increment the overflow count in
 598          * the ring.
 599          */
 600         cqe = io_get_cqring(ctx);
 601         if (cqe) {
 602                 WRITE_ONCE(cqe->user_data, ki_user_data);
 603                 WRITE_ONCE(cqe->res, res);
 604                 WRITE_ONCE(cqe->flags, 0);
 605         } else {
 606                 WRITE_ONCE(ctx->rings->cq_overflow,
 607                                 atomic_inc_return(&ctx->cached_cq_overflow));
 608         }
 609 }
 610
 611 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 612 {
 613         if (waitqueue_active(&ctx->wait))
 614                 wake_up(&ctx->wait);
 615         if (waitqueue_active(&ctx->sqo_wait))
 616                 wake_up(&ctx->sqo_wait);
 617         if (ctx->cq_ev_fd)
 618                 eventfd_signal(ctx->cq_ev_fd, 1);
 619 }
 620
 621 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 622                                 long res)
 623 {
 624         unsigned long flags;
 625
 626         spin_lock_irqsave(&ctx->completion_lock, flags);
 627         io_cqring_fill_event(ctx, user_data, res);
 628         io_commit_cqring(ctx);
 629         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 630
 631         io_cqring_ev_posted(ctx);
 632 }
 633
 634 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 635                                    struct io_submit_state *state)
 636 {
 637         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 638         struct io_kiocb *req;
 639
 640         if (!percpu_ref_tryget(&ctx->refs))
 641                 return NULL;
 642
 643         if (!state) {
 644                 req = kmem_cache_alloc(req_cachep, gfp);
 645                 if (unlikely(!req))
 646                         goto out;
 647         } else if (!state->free_reqs) {
 648                 size_t sz;
 649                 int ret;
 650
 651                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 652                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 653
 654                 /*
 655                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 656                  * retry single alloc to be on the safe side.
 657                  */
 658                 if (unlikely(ret <= 0)) {
 659                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 660                         if (!state->reqs[0])
 661                                 goto out;
 662                         ret = 1;
 663                 }
 664                 state->free_reqs = ret - 1;
 665                 state->cur_req = 1;
 666                 req = state->reqs[0];
 667         } else {
 668                 req = state->reqs[state->cur_req];
 669                 state->free_reqs--;
 670                 state->cur_req++;
 671         }
 672
 673         INIT_LIST_HEAD(&req->task_list);
 674         req->file = NULL;
 675         req->ctx = ctx;
 676         req->flags = 0;
 677         /* one is dropped after submission, the other at completion */
 678         refcount_set(&req->refs, 2);
 679         req->result = 0;
 680         req->fs = NULL;
 681         return req;
 682 out:
 683         percpu_ref_put(&ctx->refs);
 684         return NULL;
 685 }
 686
 687 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 688 {
 689         if (*nr) {
 690                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 691                 percpu_ref_put_many(&ctx->refs, *nr);
 692                 *nr = 0;
 693         }
 694 }
 695
 696 static void __io_free_req(struct io_kiocb *req)
 697 {
 698         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 699                 fput(req->file);
 700         percpu_ref_put(&req->ctx->refs);
 701         kmem_cache_free(req_cachep, req);
 702 }
 703
 704 static void io_req_link_next(struct io_kiocb *req)
 705 {
 706         struct io_kiocb *nxt;
 707
 708         /*
 709          * The list should never be empty when we are called here. But could
 710          * potentially happen if the chain is messed up, check to be on the
 711          * safe side.
 712          */
 713         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 714         if (nxt) {
 715                 list_del(&nxt->list);
 716                 if (!list_empty(&req->link_list)) {
 717                         INIT_LIST_HEAD(&nxt->link_list);
 718                         list_splice(&req->link_list, &nxt->link_list);
 719                         nxt->flags |= REQ_F_LINK;
 720                 }
 721
 722                 nxt->flags |= REQ_F_LINK_DONE;
 723                 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 724                 io_queue_async_work(req->ctx, nxt);
 725         }
 726 }
 727
 728 /*
 729  * Called if REQ_F_LINK is set, and we fail the head request
 730  */
 731 static void io_fail_links(struct io_kiocb *req)
 732 {
 733         struct io_kiocb *link;
 734
 735         while (!list_empty(&req->link_list)) {
 736                 link = list_first_entry(&req->link_list, struct io_kiocb, list);
 737                 list_del(&link->list);
 738
 739                 io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
 740                 __io_free_req(link);
 741         }
 742 }
 743
 744 static void io_free_req(struct io_kiocb *req)
 745 {
 746         /*
 747          * If LINK is set, we have dependent requests in this chain. If we
 748          * didn't fail this request, queue the first one up, moving any other
 749          * dependencies to the next request. In case of failure, fail the rest
 750          * of the chain.
 751          */
 752         if (req->flags & REQ_F_LINK) {
 753                 if (req->flags & REQ_F_FAIL_LINK)
 754                         io_fail_links(req);
 755                 else
 756                         io_req_link_next(req);
 757         }
 758
 759         __io_free_req(req);
 760 }
 761
 762 static void io_put_req(struct io_kiocb *req)
 763 {
 764         if (refcount_dec_and_test(&req->refs))
 765                 io_free_req(req);
 766 }
 767
 768 static unsigned io_cqring_events(struct io_rings *rings)
 769 {
 770         /* See comment at the top of this file */
 771         smp_rmb();
 772         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 773 }
 774
 775 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 776 {
 777         struct io_rings *rings = ctx->rings;
 778
 779         /* make sure SQ entry isn't read before tail */
 780         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 781 }
 782
 783 /*
 784  * Find and free completed poll iocbs
 785  */
 786 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 787                                struct list_head *done)
 788 {
 789         void *reqs[IO_IOPOLL_BATCH];
 790         struct io_kiocb *req;
 791         int to_free;
 792
 793         to_free = 0;
 794         while (!list_empty(done)) {
 795                 req = list_first_entry(done, struct io_kiocb, list);
 796                 list_del(&req->list);
 797
 798                 io_cqring_fill_event(ctx, req->user_data, req->result);
 799                 (*nr_events)++;
 800
 801                 if (refcount_dec_and_test(&req->refs)) {
 802                         /* If we're not using fixed files, we have to pair the
 803                          * completion part with the file put. Use regular
 804                          * completions for those, only batch free for fixed
 805                          * file and non-linked commands.
 806                          */
 807                         if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
 808                             REQ_F_FIXED_FILE) {
 809                                 reqs[to_free++] = req;
 810                                 if (to_free == ARRAY_SIZE(reqs))
 811                                         io_free_req_many(ctx, reqs, &to_free);
 812                         } else {
 813                                 io_free_req(req);
 814                         }
 815                 }
 816         }
 817
 818         io_commit_cqring(ctx);
 819         io_free_req_many(ctx, reqs, &to_free);
 820 }
 821
 822 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 823                         long min)
 824 {
 825         struct io_kiocb *req, *tmp;
 826         LIST_HEAD(done);
 827         bool spin;
 828         int ret;
 829
 830         /*
 831          * Only spin for completions if we don't have multiple devices hanging
 832          * off our complete list, and we're under the requested amount.
 833          */
 834         spin = !ctx->poll_multi_file && *nr_events < min;
 835
 836         ret = 0;
 837         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
 838                 struct kiocb *kiocb = &req->rw;
 839
 840                 /*
 841                  * Move completed entries to our local list. If we find a
 842                  * request that requires polling, break out and complete
 843                  * the done list first, if we have entries there.
 844                  */
 845                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
 846                         list_move_tail(&req->list, &done);
 847                         continue;
 848                 }
 849                 if (!list_empty(&done))
 850                         break;
 851
 852                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 853                 if (ret < 0)
 854                         break;
 855
 856                 if (ret && spin)
 857                         spin = false;
 858                 ret = 0;
 859         }
 860
 861         if (!list_empty(&done))
 862                 io_iopoll_complete(ctx, nr_events, &done);
 863
 864         return ret;
 865 }
 866
 867 /*
 868  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
 869  * non-spinning poll check - we'll still enter the driver poll loop, but only
 870  * as a non-spinning completion check.
 871  */
 872 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 873                                 long min)
 874 {
 875         while (!list_empty(&ctx->poll_list) && !need_resched()) {
 876                 int ret;
 877
 878                 ret = io_do_iopoll(ctx, nr_events, min);
 879                 if (ret < 0)
 880                         return ret;
 881                 if (!min || *nr_events >= min)
 882                         return 0;
 883         }
 884
 885         return 1;
 886 }
 887
 888 /*
 889  * We can't just wait for polled events to come to us, we have to actively
 890  * find and complete them.
 891  */
 892 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 893 {
 894         if (!(ctx->flags & IORING_SETUP_IOPOLL))
 895                 return;
 896
 897         mutex_lock(&ctx->uring_lock);
 898         while (!list_empty(&ctx->poll_list)) {
 899                 unsigned int nr_events = 0;
 900
 901                 io_iopoll_getevents(ctx, &nr_events, 1);
 902
 903                 /*
 904                  * Ensure we allow local-to-the-cpu processing to take place,
 905                  * in this case we need to ensure that we reap all events.
 906                  */
 907                 cond_resched();
 908         }
 909         mutex_unlock(&ctx->uring_lock);
 910 }
 911
 912 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 913                            long min)
 914 {
 915         int iters = 0, ret = 0;
 916
 917         /*
 918          * We disallow the app entering submit/complete with polling, but we
 919          * still need to lock the ring to prevent racing with polled issue
 920          * that got punted to a workqueue.
 921          */
 922         mutex_lock(&ctx->uring_lock);
 923         do {
 924                 int tmin = 0;
 925
 926                 /*
 927                  * Don't enter poll loop if we already have events pending.
 928                  * If we do, we can potentially be spinning for commands that
 929                  * already triggered a CQE (eg in error).
 930                  */
 931                 if (io_cqring_events(ctx->rings))
 932                         break;
 933
 934                 /*
 935                  * If a submit got punted to a workqueue, we can have the
 936                  * application entering polling for a command before it gets
 937                  * issued. That app will hold the uring_lock for the duration
 938                  * of the poll right here, so we need to take a breather every
 939                  * now and then to ensure that the issue has a chance to add
 940                  * the poll to the issued list. Otherwise we can spin here
 941                  * forever, while the workqueue is stuck trying to acquire the
 942                  * very same mutex.
 943                  */
 944                 if (!(++iters & 7)) {
 945                         mutex_unlock(&ctx->uring_lock);
 946                         mutex_lock(&ctx->uring_lock);
 947                 }
 948
 949                 if (*nr_events < min)
 950                         tmin = min - *nr_events;
 951
 952                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
 953                 if (ret <= 0)
 954                         break;
 955                 ret = 0;
 956         } while (min && !*nr_events && !need_resched());
 957
 958         mutex_unlock(&ctx->uring_lock);
 959         return ret;
 960 }
 961
 962 static void kiocb_end_write(struct io_kiocb *req)
 963 {
 964         /*
 965          * Tell lockdep we inherited freeze protection from submission
 966          * thread.
 967          */
 968         if (req->flags & REQ_F_ISREG) {
 969                 struct inode *inode = file_inode(req->file);
 970
 971                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 972         }
 973         file_end_write(req->file);
 974 }
 975
 976 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 977 {
 978         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 979
 980         if (kiocb->ki_flags & IOCB_WRITE)
 981                 kiocb_end_write(req);
 982
 983         if ((req->flags & REQ_F_LINK) && res != req->result)
 984                 req->flags |= REQ_F_FAIL_LINK;
 985         io_cqring_add_event(req->ctx, req->user_data, res);
 986         io_put_req(req);
 987 }
 988
 989 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 990 {
 991         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 992
 993         if (kiocb->ki_flags & IOCB_WRITE)
 994                 kiocb_end_write(req);
 995
 996         if ((req->flags & REQ_F_LINK) && res != req->result)
 997                 req->flags |= REQ_F_FAIL_LINK;
 998         req->result = res;
 999         if (res != -EAGAIN)
1000                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1001 }
1002
1003 /*
1004  * After the iocb has been issued, it's safe to be found on the poll list.
1005  * Adding the kiocb to the list AFTER submission ensures that we don't
1006  * find it from a io_iopoll_getevents() thread before the issuer is done
1007  * accessing the kiocb cookie.
1008  */
1009 static void io_iopoll_req_issued(struct io_kiocb *req)
1010 {
1011         struct io_ring_ctx *ctx = req->ctx;
1012
1013         /*
1014          * Track whether we have multiple files in our lists. This will impact
1015          * how we do polling eventually, not spinning if we're on potentially
1016          * different devices.
1017          */
1018         if (list_empty(&ctx->poll_list)) {
1019                 ctx->poll_multi_file = false;
1020         } else if (!ctx->poll_multi_file) {
1021                 struct io_kiocb *list_req;
1022
1023                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1024                                                 list);
1025                 if (list_req->rw.ki_filp != req->rw.ki_filp)
1026                         ctx->poll_multi_file = true;
1027         }
1028
1029         /*
1030          * For fast devices, IO may have already completed. If it has, add
1031          * it to the front so we find it first.
1032          */
1033         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1034                 list_add(&req->list, &ctx->poll_list);
1035         else
1036                 list_add_tail(&req->list, &ctx->poll_list);
1037 }
1038
1039 static void io_file_put(struct io_submit_state *state)
1040 {
1041         if (state->file) {
1042                 int diff = state->has_refs - state->used_refs;
1043
1044                 if (diff)
1045                         fput_many(state->file, diff);
1046                 state->file = NULL;
1047         }
1048 }
1049
1050 /*
1051  * Get as many references to a file as we have IOs left in this submission,
1052  * assuming most submissions are for one file, or at least that each file
1053  * has more than one submission.
1054  */
1055 static struct file *io_file_get(struct io_submit_state *state, int fd)
1056 {
1057         if (!state)
1058                 return fget(fd);
1059
1060         if (state->file) {
1061                 if (state->fd == fd) {
1062                         state->used_refs++;
1063                         state->ios_left--;
1064                         return state->file;
1065                 }
1066                 io_file_put(state);
1067         }
1068         state->file = fget_many(fd, state->ios_left);
1069         if (!state->file)
1070                 return NULL;
1071
1072         state->fd = fd;
1073         state->has_refs = state->ios_left;
1074         state->used_refs = 1;
1075         state->ios_left--;
1076         return state->file;
1077 }
1078
1079 /*
1080  * If we tracked the file through the SCM inflight mechanism, we could support
1081  * any file. For now, just ensure that anything potentially problematic is done
1082  * inline.
1083  */
1084 static bool io_file_supports_async(struct file *file)
1085 {
1086         umode_t mode = file_inode(file)->i_mode;
1087
1088         if (S_ISBLK(mode) || S_ISCHR(mode))
1089                 return true;
1090         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1091                 return true;
1092
1093         return false;
1094 }
1095
1096 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
1097                       bool force_nonblock)
1098 {
1099         const struct io_uring_sqe *sqe = s->sqe;
1100         struct io_ring_ctx *ctx = req->ctx;
1101         struct kiocb *kiocb = &req->rw;
1102         unsigned ioprio;
1103         int ret;
1104
1105         if (!req->file)
1106                 return -EBADF;
1107
1108         if (S_ISREG(file_inode(req->file)->i_mode))
1109                 req->flags |= REQ_F_ISREG;
1110
1111         if (force_nonblock)
1112                 req->fsize = rlimit(RLIMIT_FSIZE);
1113
1114         /*
1115          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1116          * we know to async punt it even if it was opened O_NONBLOCK
1117          */
1118         if (force_nonblock && !io_file_supports_async(req->file)) {
1119                 req->flags |= REQ_F_MUST_PUNT;
1120                 return -EAGAIN;
1121         }
1122
1123         kiocb->ki_pos = READ_ONCE(sqe->off);
1124         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1125         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1126
1127         ioprio = READ_ONCE(sqe->ioprio);
1128         if (ioprio) {
1129                 ret = ioprio_check_cap(ioprio);
1130                 if (ret)
1131                         return ret;
1132
1133                 kiocb->ki_ioprio = ioprio;
1134         } else
1135                 kiocb->ki_ioprio = get_current_ioprio();
1136
1137         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1138         if (unlikely(ret))
1139                 return ret;
1140
1141         /* don't allow async punt if RWF_NOWAIT was requested */
1142         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1143             (req->file->f_flags & O_NONBLOCK))
1144                 req->flags |= REQ_F_NOWAIT;
1145
1146         if (force_nonblock)
1147                 kiocb->ki_flags |= IOCB_NOWAIT;
1148
1149         if (ctx->flags & IORING_SETUP_IOPOLL) {
1150                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1151                     !kiocb->ki_filp->f_op->iopoll)
1152                         return -EOPNOTSUPP;
1153
1154                 kiocb->ki_flags |= IOCB_HIPRI;
1155                 kiocb->ki_complete = io_complete_rw_iopoll;
1156                 req->result = 0;
1157         } else {
1158                 if (kiocb->ki_flags & IOCB_HIPRI)
1159                         return -EINVAL;
1160                 kiocb->ki_complete = io_complete_rw;
1161         }
1162         return 0;
1163 }
1164
1165 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1166 {
1167         switch (ret) {
1168         case -EIOCBQUEUED:
1169                 break;
1170         case -ERESTARTSYS:
1171         case -ERESTARTNOINTR:
1172         case -ERESTARTNOHAND:
1173         case -ERESTART_RESTARTBLOCK:
1174                 /*
1175                  * We can't just restart the syscall, since previously
1176                  * submitted sqes may already be in progress. Just fail this
1177                  * IO with EINTR.
1178                  */
1179                 ret = -EINTR;
1180                 /* fall through */
1181         default:
1182                 kiocb->ki_complete(kiocb, ret, 0);
1183         }
1184 }
1185
1186 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
1187                            const struct io_uring_sqe *sqe,
1188                            struct iov_iter *iter)
1189 {
1190         size_t len = READ_ONCE(sqe->len);
1191         struct io_mapped_ubuf *imu;
1192         unsigned index, buf_index;
1193         size_t offset;
1194         u64 buf_addr;
1195
1196         /* attempt to use fixed buffers without having provided iovecs */
1197         if (unlikely(!ctx->user_bufs))
1198                 return -EFAULT;
1199
1200         buf_index = READ_ONCE(sqe->buf_index);
1201         if (unlikely(buf_index >= ctx->nr_user_bufs))
1202                 return -EFAULT;
1203
1204         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1205         imu = &ctx->user_bufs[index];
1206         buf_addr = READ_ONCE(sqe->addr);
1207
1208         /* overflow */
1209         if (buf_addr + len < buf_addr)
1210                 return -EFAULT;
1211         /* not inside the mapped region */
1212         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1213                 return -EFAULT;
1214
1215         /*
1216          * May not be a start of buffer, set size appropriately
1217          * and advance us to the beginning.
1218          */
1219         offset = buf_addr - imu->ubuf;
1220         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1221
1222         if (offset) {
1223                 /*
1224                  * Don't use iov_iter_advance() here, as it's really slow for
1225                  * using the latter parts of a big fixed buffer - it iterates
1226                  * over each segment manually. We can cheat a bit here, because
1227                  * we know that:
1228                  *
1229                  * 1) it's a BVEC iter, we set it up
1230                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1231                  *    first and last bvec
1232                  *
1233                  * So just find our index, and adjust the iterator afterwards.
1234                  * If the offset is within the first bvec (or the whole first
1235                  * bvec, just use iov_iter_advance(). This makes it easier
1236                  * since we can just skip the first segment, which may not
1237                  * be PAGE_SIZE aligned.
1238                  */
1239                 const struct bio_vec *bvec = imu->bvec;
1240
1241                 if (offset <= bvec->bv_len) {
1242                         iov_iter_advance(iter, offset);
1243                 } else {
1244                         unsigned long seg_skip;
1245
1246                         /* skip first vec */
1247                         offset -= bvec->bv_len;
1248                         seg_skip = 1 + (offset >> PAGE_SHIFT);
1249
1250                         iter->bvec = bvec + seg_skip;
1251                         iter->nr_segs -= seg_skip;
1252                         iter->count -= bvec->bv_len + offset;
1253                         iter->iov_offset = offset & ~PAGE_MASK;
1254                 }
1255         }
1256
1257         return len;
1258 }
1259
1260 static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
1261                                struct io_kiocb *req, struct iovec **iovec,
1262                                struct iov_iter *iter)
1263 {
1264         const struct io_uring_sqe *sqe = req->submit.sqe;
1265         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1266         size_t sqe_len = READ_ONCE(sqe->len);
1267         u8 opcode;
1268
1269         opcode = req->submit.opcode;
1270         if (opcode == IORING_OP_READ_FIXED ||
1271             opcode == IORING_OP_WRITE_FIXED) {
1272                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
1273                 *iovec = NULL;
1274                 return ret;
1275         }
1276
1277         if (!req->submit.has_user)
1278                 return -EFAULT;
1279
1280 #ifdef CONFIG_COMPAT
1281         if (ctx->compat)
1282                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1283                                                 iovec, iter);
1284 #endif
1285
1286         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1287 }
1288
1289 static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
1290 {
1291         if (al->file == kiocb->ki_filp) {
1292                 off_t start, end;
1293
1294                 /*
1295                  * Allow merging if we're anywhere in the range of the same
1296                  * page. Generally this happens for sub-page reads or writes,
1297                  * and it's beneficial to allow the first worker to bring the
1298                  * page in and the piggy backed work can then work on the
1299                  * cached page.
1300                  */
1301                 start = al->io_start & PAGE_MASK;
1302                 end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
1303                 if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
1304                         return true;
1305         }
1306
1307         al->file = NULL;
1308         return false;
1309 }
1310
1311 /*
1312  * Make a note of the last file/offset/direction we punted to async
1313  * context. We'll use this information to see if we can piggy back a
1314  * sequential request onto the previous one, if it's still hasn't been
1315  * completed by the async worker.
1316  */
1317 static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
1318 {
1319         struct async_list *async_list = &req->ctx->pending_async[rw];
1320         struct kiocb *kiocb = &req->rw;
1321         struct file *filp = kiocb->ki_filp;
1322
1323         if (io_should_merge(async_list, kiocb)) {
1324                 unsigned long max_bytes;
1325
1326                 /* Use 8x RA size as a decent limiter for both reads/writes */
1327                 max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
1328                 if (!max_bytes)
1329                         max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
1330
1331                 /* If max len are exceeded, reset the state */
1332                 if (async_list->io_len + len <= max_bytes) {
1333                         req->flags |= REQ_F_SEQ_PREV;
1334                         async_list->io_len += len;
1335                 } else {
1336                         async_list->file = NULL;
1337                 }
1338         }
1339
1340         /* New file? Reset state. */
1341         if (async_list->file != filp) {
1342                 async_list->io_start = kiocb->ki_pos;
1343                 async_list->io_len = len;
1344                 async_list->file = filp;
1345         }
1346 }
1347
1348 /*
1349  * For files that don't have ->read_iter() and ->write_iter(), handle them
1350  * by looping over ->read() or ->write() manually.
1351  */
1352 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1353                            struct iov_iter *iter)
1354 {
1355         ssize_t ret = 0;
1356
1357         /*
1358          * Don't support polled IO through this interface, and we can't
1359          * support non-blocking either. For the latter, this just causes
1360          * the kiocb to be handled from an async context.
1361          */
1362         if (kiocb->ki_flags & IOCB_HIPRI)
1363                 return -EOPNOTSUPP;
1364         if (kiocb->ki_flags & IOCB_NOWAIT)
1365                 return -EAGAIN;
1366
1367         while (iov_iter_count(iter)) {
1368                 struct iovec iovec;
1369                 ssize_t nr;
1370
1371                 if (!iov_iter_is_bvec(iter)) {
1372                         iovec = iov_iter_iovec(iter);
1373                 } else {
1374                         /* fixed buffers import bvec */
1375                         iovec.iov_base = kmap(iter->bvec->bv_page)
1376                                                 + iter->iov_offset;
1377                         iovec.iov_len = min(iter->count,
1378                                         iter->bvec->bv_len - iter->iov_offset);
1379                 }
1380
1381                 if (rw == READ) {
1382                         nr = file->f_op->read(file, iovec.iov_base,
1383                                               iovec.iov_len, &kiocb->ki_pos);
1384                 } else {
1385                         nr = file->f_op->write(file, iovec.iov_base,
1386                                                iovec.iov_len, &kiocb->ki_pos);
1387                 }
1388
1389                 if (iov_iter_is_bvec(iter))
1390                         kunmap(iter->bvec->bv_page);
1391
1392                 if (nr < 0) {
1393                         if (!ret)
1394                                 ret = nr;
1395                         break;
1396                 }
1397                 ret += nr;
1398                 if (nr != iovec.iov_len)
1399                         break;
1400                 iov_iter_advance(iter, nr);
1401         }
1402
1403         return ret;
1404 }
1405
1406 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
1407                    bool force_nonblock)
1408 {
1409         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1410         struct kiocb *kiocb = &req->rw;
1411         struct iov_iter iter;
1412         struct file *file;
1413         size_t iov_count;
1414         ssize_t read_size, ret;
1415
1416         ret = io_prep_rw(req, s, force_nonblock);
1417         if (ret)
1418                 return ret;
1419         file = kiocb->ki_filp;
1420
1421         if (unlikely(!(file->f_mode & FMODE_READ)))
1422                 return -EBADF;
1423
1424         ret = io_import_iovec(req->ctx, READ, req, &iovec, &iter);
1425         if (ret < 0)
1426                 return ret;
1427
1428         read_size = ret;
1429         if (req->flags & REQ_F_LINK)
1430                 req->result = read_size;
1431
1432         iov_count = iov_iter_count(&iter);
1433         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
1434         if (!ret) {
1435                 ssize_t ret2;
1436
1437                 if (file->f_op->read_iter)
1438                         ret2 = call_read_iter(file, kiocb, &iter);
1439                 else if (req->file->f_op->read)
1440                         ret2 = loop_rw_iter(READ, file, kiocb, &iter);
1441                 else
1442                         ret2 = -EINVAL;
1443
1444                 /*
1445                  * In case of a short read, punt to async. This can happen
1446                  * if we have data partially cached. Alternatively we can
1447                  * return the short read, in which case the application will
1448                  * need to issue another SQE and wait for it. That SQE will
1449                  * need async punt anyway, so it's more efficient to do it
1450                  * here.
1451                  */
1452                 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1453                     (req->flags & REQ_F_ISREG) &&
1454                     ret2 > 0 && ret2 < read_size)
1455                         ret2 = -EAGAIN;
1456                 /* Catch -EAGAIN return for forced non-blocking submission */
1457                 if (!force_nonblock || ret2 != -EAGAIN) {
1458                         io_rw_done(kiocb, ret2);
1459                 } else {
1460                         /*
1461                          * If ->needs_lock is true, we're already in async
1462                          * context.
1463                          */
1464                         if (!s->needs_lock)
1465                                 io_async_list_note(READ, req, iov_count);
1466                         ret = -EAGAIN;
1467                 }
1468         }
1469         kfree(iovec);
1470         return ret;
1471 }
1472
1473 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
1474                     bool force_nonblock)
1475 {
1476         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1477         struct kiocb *kiocb = &req->rw;
1478         struct iov_iter iter;
1479         struct file *file;
1480         size_t iov_count;
1481         ssize_t ret;
1482
1483         ret = io_prep_rw(req, s, force_nonblock);
1484         if (ret)
1485                 return ret;
1486
1487         file = kiocb->ki_filp;
1488         if (unlikely(!(file->f_mode & FMODE_WRITE)))
1489                 return -EBADF;
1490
1491         ret = io_import_iovec(req->ctx, WRITE, req, &iovec, &iter);
1492         if (ret < 0)
1493                 return ret;
1494
1495         if (req->flags & REQ_F_LINK)
1496                 req->result = ret;
1497
1498         iov_count = iov_iter_count(&iter);
1499
1500         ret = -EAGAIN;
1501         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1502                 /* If ->needs_lock is true, we're already in async context. */
1503                 if (!s->needs_lock)
1504                         io_async_list_note(WRITE, req, iov_count);
1505                 goto out_free;
1506         }
1507
1508         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
1509         if (!ret) {
1510                 ssize_t ret2;
1511
1512                 /*
1513                  * Open-code file_start_write here to grab freeze protection,
1514                  * which will be released by another thread in
1515                  * io_complete_rw().  Fool lockdep by telling it the lock got
1516                  * released so that it doesn't complain about the held lock when
1517                  * we return to userspace.
1518                  */
1519                 if (req->flags & REQ_F_ISREG) {
1520                         __sb_start_write(file_inode(file)->i_sb,
1521                                                 SB_FREEZE_WRITE, true);
1522                         __sb_writers_release(file_inode(file)->i_sb,
1523                                                 SB_FREEZE_WRITE);
1524                 }
1525                 kiocb->ki_flags |= IOCB_WRITE;
1526
1527                 if (!force_nonblock)
1528                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
1529
1530                 if (file->f_op->write_iter)
1531                         ret2 = call_write_iter(file, kiocb, &iter);
1532                 else if (req->file->f_op->write)
1533                         ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
1534                 else
1535                         ret2 = -EINVAL;
1536
1537                 if (!force_nonblock)
1538                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
1539
1540                 if (!force_nonblock || ret2 != -EAGAIN) {
1541                         io_rw_done(kiocb, ret2);
1542                 } else {
1543                         /*
1544                          * If ->needs_lock is true, we're already in async
1545                          * context.
1546                          */
1547                         if (!s->needs_lock)
1548                                 io_async_list_note(WRITE, req, iov_count);
1549                         ret = -EAGAIN;
1550                 }
1551         }
1552 out_free:
1553         kfree(iovec);
1554         return ret;
1555 }
1556
1557 /*
1558  * IORING_OP_NOP just posts a completion event, nothing else.
1559  */
1560 static int io_nop(struct io_kiocb *req, u64 user_data)
1561 {
1562         struct io_ring_ctx *ctx = req->ctx;
1563         long err = 0;
1564
1565         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1566                 return -EINVAL;
1567
1568         io_cqring_add_event(ctx, user_data, err);
1569         io_put_req(req);
1570         return 0;
1571 }
1572
1573 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1574 {
1575         struct io_ring_ctx *ctx = req->ctx;
1576
1577         if (!req->file)
1578                 return -EBADF;
1579
1580         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1581                 return -EINVAL;
1582         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1583                 return -EINVAL;
1584
1585         return 0;
1586 }
1587
1588 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1589                     bool force_nonblock)
1590 {
1591         loff_t sqe_off = READ_ONCE(sqe->off);
1592         loff_t sqe_len = READ_ONCE(sqe->len);
1593         loff_t end = sqe_off + sqe_len;
1594         unsigned fsync_flags;
1595         int ret;
1596
1597         fsync_flags = READ_ONCE(sqe->fsync_flags);
1598         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1599                 return -EINVAL;
1600
1601         ret = io_prep_fsync(req, sqe);
1602         if (ret)
1603                 return ret;
1604
1605         /* fsync always requires a blocking context */
1606         if (force_nonblock)
1607                 return -EAGAIN;
1608
1609         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1610                                 end > 0 ? end : LLONG_MAX,
1611                                 fsync_flags & IORING_FSYNC_DATASYNC);
1612
1613         if (ret < 0 && (req->flags & REQ_F_LINK))
1614                 req->flags |= REQ_F_FAIL_LINK;
1615         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1616         io_put_req(req);
1617         return 0;
1618 }
1619
1620 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1621 {
1622         struct io_ring_ctx *ctx = req->ctx;
1623         int ret = 0;
1624
1625         if (!req->file)
1626                 return -EBADF;
1627
1628         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1629                 return -EINVAL;
1630         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1631                 return -EINVAL;
1632
1633         return ret;
1634 }
1635
1636 static int io_sync_file_range(struct io_kiocb *req,
1637                               const struct io_uring_sqe *sqe,
1638                               bool force_nonblock)
1639 {
1640         loff_t sqe_off;
1641         loff_t sqe_len;
1642         unsigned flags;
1643         int ret;
1644
1645         ret = io_prep_sfr(req, sqe);
1646         if (ret)
1647                 return ret;
1648
1649         /* sync_file_range always requires a blocking context */
1650         if (force_nonblock)
1651                 return -EAGAIN;
1652
1653         sqe_off = READ_ONCE(sqe->off);
1654         sqe_len = READ_ONCE(sqe->len);
1655         flags = READ_ONCE(sqe->sync_range_flags);
1656
1657         ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
1658
1659         if (ret < 0 && (req->flags & REQ_F_LINK))
1660                 req->flags |= REQ_F_FAIL_LINK;
1661         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1662         io_put_req(req);
1663         return 0;
1664 }
1665
1666 #if defined(CONFIG_NET)
1667 static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1668                            bool force_nonblock,
1669                    long (*fn)(struct socket *, struct user_msghdr __user *,
1670                                 unsigned int))
1671 {
1672         struct socket *sock;
1673         int ret;
1674
1675         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1676                 return -EINVAL;
1677
1678         sock = sock_from_file(req->file, &ret);
1679         if (sock) {
1680                 struct user_msghdr __user *msg;
1681                 unsigned flags;
1682
1683                 flags = READ_ONCE(sqe->msg_flags);
1684                 if (flags & MSG_DONTWAIT)
1685                         req->flags |= REQ_F_NOWAIT;
1686                 else if (force_nonblock)
1687                         flags |= MSG_DONTWAIT;
1688
1689 #ifdef CONFIG_COMPAT
1690                 if (req->ctx->compat)
1691                         flags |= MSG_CMSG_COMPAT;
1692 #endif
1693
1694                 msg = (struct user_msghdr __user *) (unsigned long)
1695                         READ_ONCE(sqe->addr);
1696
1697                 ret = fn(sock, msg, flags);
1698                 if (force_nonblock && ret == -EAGAIN)
1699                         return ret;
1700                 if (ret == -ERESTARTSYS)
1701                         ret = -EINTR;
1702         }
1703
1704         if (req->fs) {
1705                 struct fs_struct *fs = req->fs;
1706
1707                 spin_lock(&req->fs->lock);
1708                 if (--fs->users)
1709                         fs = NULL;
1710                 spin_unlock(&req->fs->lock);
1711                 if (fs)
1712                         free_fs_struct(fs);
1713         }
1714         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1715         io_put_req(req);
1716         return 0;
1717 }
1718 #endif
1719
1720 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1721                       bool force_nonblock)
1722 {
1723 #if defined(CONFIG_NET)
1724         return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
1725 #else
1726         return -EOPNOTSUPP;
1727 #endif
1728 }
1729
1730 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1731                       bool force_nonblock)
1732 {
1733 #if defined(CONFIG_NET)
1734         return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
1735 #else
1736         return -EOPNOTSUPP;
1737 #endif
1738 }
1739
1740 static void io_poll_remove_one(struct io_kiocb *req)
1741 {
1742         struct io_poll_iocb *poll = &req->poll;
1743
1744         spin_lock(&poll->head->lock);
1745         WRITE_ONCE(poll->canceled, true);
1746         if (!list_empty(&poll->wait.entry)) {
1747                 list_del_init(&poll->wait.entry);
1748                 io_queue_async_work(req->ctx, req);
1749         }
1750         spin_unlock(&poll->head->lock);
1751
1752         list_del_init(&req->list);
1753 }
1754
1755 static void io_poll_remove_all(struct io_ring_ctx *ctx)
1756 {
1757         struct io_kiocb *req;
1758
1759         spin_lock_irq(&ctx->completion_lock);
1760         while (!list_empty(&ctx->cancel_list)) {
1761                 req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
1762                 io_poll_remove_one(req);
1763         }
1764         spin_unlock_irq(&ctx->completion_lock);
1765 }
1766
1767 /*
1768  * Find a running poll command that matches one specified in sqe->addr,
1769  * and remove it if found.
1770  */
1771 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1772 {
1773         struct io_ring_ctx *ctx = req->ctx;
1774         struct io_kiocb *poll_req, *next;
1775         int ret = -ENOENT;
1776
1777         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1778                 return -EINVAL;
1779         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1780             sqe->poll_events)
1781                 return -EINVAL;
1782
1783         spin_lock_irq(&ctx->completion_lock);
1784         list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1785                 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1786                         io_poll_remove_one(poll_req);
1787                         ret = 0;
1788                         break;
1789                 }
1790         }
1791         spin_unlock_irq(&ctx->completion_lock);
1792
1793         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1794         io_put_req(req);
1795         return 0;
1796 }
1797
1798 static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
1799                              __poll_t mask)
1800 {
1801         req->poll.done = true;
1802         io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
1803         io_commit_cqring(ctx);
1804 }
1805
1806 static void io_poll_complete_work(struct work_struct *work)
1807 {
1808         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1809         struct io_poll_iocb *poll = &req->poll;
1810         struct poll_table_struct pt = { ._key = poll->events };
1811         struct io_ring_ctx *ctx = req->ctx;
1812         const struct cred *old_cred;
1813         __poll_t mask = 0;
1814
1815         old_cred = override_creds(ctx->creds);
1816
1817         if (!READ_ONCE(poll->canceled))
1818                 mask = vfs_poll(poll->file, &pt) & poll->events;
1819
1820         /*
1821          * Note that ->ki_cancel callers also delete iocb from active_reqs after
1822          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
1823          * synchronize with them.  In the cancellation case the list_del_init
1824          * itself is not actually needed, but harmless so we keep it in to
1825          * avoid further branches in the fast path.
1826          */
1827         spin_lock_irq(&ctx->completion_lock);
1828         if (!mask && !READ_ONCE(poll->canceled)) {
1829                 add_wait_queue(poll->head, &poll->wait);
1830                 spin_unlock_irq(&ctx->completion_lock);
1831                 goto out;
1832         }
1833         list_del_init(&req->list);
1834         io_poll_complete(ctx, req, mask);
1835         spin_unlock_irq(&ctx->completion_lock);
1836
1837         io_cqring_ev_posted(ctx);
1838         io_put_req(req);
1839 out:
1840         revert_creds(old_cred);
1841 }
1842
1843 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1844                         void *key)
1845 {
1846         struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1847                                                         wait);
1848         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1849         struct io_ring_ctx *ctx = req->ctx;
1850         __poll_t mask = key_to_poll(key);
1851         unsigned long flags;
1852
1853         /* for instances that support it check for an event match first: */
1854         if (mask && !(mask & poll->events))
1855                 return 0;
1856
1857         list_del_init(&poll->wait.entry);
1858
1859         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1860                 list_del(&req->list);
1861                 io_poll_complete(ctx, req, mask);
1862                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1863
1864                 io_cqring_ev_posted(ctx);
1865                 io_put_req(req);
1866         } else {
1867                 io_queue_async_work(ctx, req);
1868         }
1869
1870         return 1;
1871 }
1872
1873 struct io_poll_table {
1874         struct poll_table_struct pt;
1875         struct io_kiocb *req;
1876         int error;
1877 };
1878
1879 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1880                                struct poll_table_struct *p)
1881 {
1882         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1883
1884         if (unlikely(pt->req->poll.head)) {
1885                 pt->error = -EINVAL;
1886                 return;
1887         }
1888
1889         pt->error = 0;
1890         pt->req->poll.head = head;
1891         add_wait_queue(head, &pt->req->poll.wait);
1892 }
1893
1894 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1895 {
1896         struct io_poll_iocb *poll = &req->poll;
1897         struct io_ring_ctx *ctx = req->ctx;
1898         struct io_poll_table ipt;
1899         bool cancel = false;
1900         __poll_t mask;
1901         u16 events;
1902
1903         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1904                 return -EINVAL;
1905         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1906                 return -EINVAL;
1907         if (!poll->file)
1908                 return -EBADF;
1909
1910         req->submit.sqe = NULL;
1911         INIT_WORK(&req->work, io_poll_complete_work);
1912         events = READ_ONCE(sqe->poll_events);
1913         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1914
1915         poll->head = NULL;
1916         poll->done = false;
1917         poll->canceled = false;
1918
1919         ipt.pt._qproc = io_poll_queue_proc;
1920         ipt.pt._key = poll->events;
1921         ipt.req = req;
1922         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1923
1924         /* initialized the list so that we can do list_empty checks */
1925         INIT_LIST_HEAD(&poll->wait.entry);
1926         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1927
1928         INIT_LIST_HEAD(&req->list);
1929
1930         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1931
1932         spin_lock_irq(&ctx->completion_lock);
1933         if (likely(poll->head)) {
1934                 spin_lock(&poll->head->lock);
1935                 if (unlikely(list_empty(&poll->wait.entry))) {
1936                         if (ipt.error)
1937                                 cancel = true;
1938                         ipt.error = 0;
1939                         mask = 0;
1940                 }
1941                 if (mask || ipt.error)
1942                         list_del_init(&poll->wait.entry);
1943                 else if (cancel)
1944                         WRITE_ONCE(poll->canceled, true);
1945                 else if (!poll->done) /* actually waiting for an event */
1946                         list_add_tail(&req->list, &ctx->cancel_list);
1947                 spin_unlock(&poll->head->lock);
1948         }
1949         if (mask) { /* no async, we'd stolen it */
1950                 ipt.error = 0;
1951                 io_poll_complete(ctx, req, mask);
1952         }
1953         spin_unlock_irq(&ctx->completion_lock);
1954
1955         if (mask) {
1956                 io_cqring_ev_posted(ctx);
1957                 io_put_req(req);
1958         }
1959         return ipt.error;
1960 }
1961
1962 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
1963 {
1964         struct io_ring_ctx *ctx;
1965         struct io_kiocb *req, *prev;
1966         unsigned long flags;
1967
1968         req = container_of(timer, struct io_kiocb, timeout.timer);
1969         ctx = req->ctx;
1970         atomic_inc(&ctx->cq_timeouts);
1971
1972         spin_lock_irqsave(&ctx->completion_lock, flags);
1973         /*
1974          * Adjust the reqs sequence before the current one because it
1975          * will consume a slot in the cq_ring and the the cq_tail pointer
1976          * will be increased, otherwise other timeout reqs may return in
1977          * advance without waiting for enough wait_nr.
1978          */
1979         prev = req;
1980         list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
1981                 prev->sequence++;
1982         list_del(&req->list);
1983
1984         io_cqring_fill_event(ctx, req->user_data, -ETIME);
1985         io_commit_cqring(ctx);
1986         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1987
1988         io_cqring_ev_posted(ctx);
1989
1990         io_put_req(req);
1991         return HRTIMER_NORESTART;
1992 }
1993
1994 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1995 {
1996         unsigned count;
1997         struct io_ring_ctx *ctx = req->ctx;
1998         struct list_head *entry;
1999         struct timespec64 ts;
2000         unsigned span = 0;
2001
2002         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2003                 return -EINVAL;
2004         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
2005             sqe->len != 1)
2006                 return -EINVAL;
2007
2008         if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
2009                 return -EFAULT;
2010
2011         req->flags |= REQ_F_TIMEOUT;
2012
2013         /*
2014          * sqe->off holds how many events that need to occur for this
2015          * timeout event to be satisfied. If it isn't set, then this is
2016          * a pure timeout request, sequence isn't used.
2017          */
2018         count = READ_ONCE(sqe->off);
2019         if (!count) {
2020                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2021                 spin_lock_irq(&ctx->completion_lock);
2022                 entry = ctx->timeout_list.prev;
2023                 goto add;
2024         }
2025
2026         req->sequence = ctx->cached_sq_head + count - 1;
2027         /* reuse it to store the count */
2028         req->submit.sequence = count;
2029
2030         /*
2031          * Insertion sort, ensuring the first entry in the list is always
2032          * the one we need first.
2033          */
2034         spin_lock_irq(&ctx->completion_lock);
2035         list_for_each_prev(entry, &ctx->timeout_list) {
2036                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
2037                 unsigned nxt_sq_head;
2038                 long long tmp, tmp_nxt;
2039
2040                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2041                         continue;
2042
2043                 /*
2044                  * Since cached_sq_head + count - 1 can overflow, use type long
2045                  * long to store it.
2046                  */
2047                 tmp = (long long)ctx->cached_sq_head + count - 1;
2048                 nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
2049                 tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
2050
2051                 /*
2052                  * cached_sq_head may overflow, and it will never overflow twice
2053                  * once there is some timeout req still be valid.
2054                  */
2055                 if (ctx->cached_sq_head < nxt_sq_head)
2056                         tmp += UINT_MAX;
2057
2058                 if (tmp > tmp_nxt)
2059                         break;
2060
2061                 /*
2062                  * Sequence of reqs after the insert one and itself should
2063                  * be adjusted because each timeout req consumes a slot.
2064                  */
2065                 span++;
2066                 nxt->sequence++;
2067         }
2068         req->sequence -= span;
2069 add:
2070         list_add(&req->list, entry);
2071         spin_unlock_irq(&ctx->completion_lock);
2072
2073         hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2074         req->timeout.timer.function = io_timeout_fn;
2075         hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
2076                         HRTIMER_MODE_REL);
2077         return 0;
2078 }
2079
2080 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
2081                         struct sqe_submit *s)
2082 {
2083         struct io_uring_sqe *sqe_copy;
2084
2085         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
2086                 return 0;
2087
2088         sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
2089         if (!sqe_copy)
2090                 return -EAGAIN;
2091
2092         spin_lock_irq(&ctx->completion_lock);
2093         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
2094                 spin_unlock_irq(&ctx->completion_lock);
2095                 kfree(sqe_copy);
2096                 return 0;
2097         }
2098
2099         memcpy(&req->submit, s, sizeof(*s));
2100         memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
2101         req->submit.sqe = sqe_copy;
2102
2103         INIT_WORK(&req->work, io_sq_wq_submit_work);
2104         list_add_tail(&req->list, &ctx->defer_list);
2105         spin_unlock_irq(&ctx->completion_lock);
2106         return -EIOCBQUEUED;
2107 }
2108
2109 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2110                            const struct sqe_submit *s, bool force_nonblock)
2111 {
2112         int ret;
2113
2114         req->user_data = READ_ONCE(s->sqe->user_data);
2115
2116         if (unlikely(s->index >= ctx->sq_entries))
2117                 return -EINVAL;
2118
2119         switch (req->submit.opcode) {
2120         case IORING_OP_NOP:
2121                 ret = io_nop(req, req->user_data);
2122                 break;
2123         case IORING_OP_READV:
2124                 if (unlikely(s->sqe->buf_index))
2125                         return -EINVAL;
2126                 ret = io_read(req, s, force_nonblock);
2127                 break;
2128         case IORING_OP_WRITEV:
2129                 if (unlikely(s->sqe->buf_index))
2130                         return -EINVAL;
2131                 ret = io_write(req, s, force_nonblock);
2132                 break;
2133         case IORING_OP_READ_FIXED:
2134                 ret = io_read(req, s, force_nonblock);
2135                 break;
2136         case IORING_OP_WRITE_FIXED:
2137                 ret = io_write(req, s, force_nonblock);
2138                 break;
2139         case IORING_OP_FSYNC:
2140                 ret = io_fsync(req, s->sqe, force_nonblock);
2141                 break;
2142         case IORING_OP_POLL_ADD:
2143                 ret = io_poll_add(req, s->sqe);
2144                 break;
2145         case IORING_OP_POLL_REMOVE:
2146                 ret = io_poll_remove(req, s->sqe);
2147                 break;
2148         case IORING_OP_SYNC_FILE_RANGE:
2149                 ret = io_sync_file_range(req, s->sqe, force_nonblock);
2150                 break;
2151         case IORING_OP_SENDMSG:
2152                 ret = io_sendmsg(req, s->sqe, force_nonblock);
2153                 break;
2154         case IORING_OP_RECVMSG:
2155                 ret = io_recvmsg(req, s->sqe, force_nonblock);
2156                 break;
2157         case IORING_OP_TIMEOUT:
2158                 ret = io_timeout(req, s->sqe);
2159                 break;
2160         default:
2161                 ret = -EINVAL;
2162                 break;
2163         }
2164
2165         if (ret)
2166                 return ret;
2167
2168         if (ctx->flags & IORING_SETUP_IOPOLL) {
2169                 if (req->result == -EAGAIN)
2170                         return -EAGAIN;
2171
2172                 /* workqueue context doesn't hold uring_lock, grab it now */
2173                 if (s->needs_lock)
2174                         mutex_lock(&ctx->uring_lock);
2175                 io_iopoll_req_issued(req);
2176                 if (s->needs_lock)
2177                         mutex_unlock(&ctx->uring_lock);
2178         }
2179
2180         return 0;
2181 }
2182
2183 static struct async_list *io_async_list_from_req(struct io_ring_ctx *ctx,
2184                                                  struct io_kiocb *req)
2185 {
2186         switch (req->submit.opcode) {
2187         case IORING_OP_READV:
2188         case IORING_OP_READ_FIXED:
2189                 return &ctx->pending_async[READ];
2190         case IORING_OP_WRITEV:
2191         case IORING_OP_WRITE_FIXED:
2192                 return &ctx->pending_async[WRITE];
2193         default:
2194                 return NULL;
2195         }
2196 }
2197
2198 static inline bool io_req_needs_user(struct io_kiocb *req)
2199 {
2200         return !(req->submit.opcode == IORING_OP_READ_FIXED ||
2201                 req->submit.opcode == IORING_OP_WRITE_FIXED);
2202 }
2203
2204 static void io_sq_wq_submit_work(struct work_struct *work)
2205 {
2206         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2207         struct fs_struct *old_fs_struct = current->fs;
2208         struct io_ring_ctx *ctx = req->ctx;
2209         struct mm_struct *cur_mm = NULL;
2210         struct async_list *async_list;
2211         const struct cred *old_cred;
2212         LIST_HEAD(req_list);
2213         mm_segment_t old_fs;
2214         int ret;
2215
2216         old_cred = override_creds(ctx->creds);
2217         async_list = io_async_list_from_req(ctx, req);
2218
2219         allow_kernel_signal(SIGINT);
2220 restart:
2221         do {
2222                 struct sqe_submit *s = &req->submit;
2223                 const struct io_uring_sqe *sqe = s->sqe;
2224                 unsigned int flags = req->flags;
2225
2226                 /* Ensure we clear previously set non-block flag */
2227                 req->rw.ki_flags &= ~IOCB_NOWAIT;
2228
2229                 if ((req->fs && req->fs != current->fs) ||
2230                     (!req->fs && current->fs != old_fs_struct)) {
2231                         task_lock(current);
2232                         if (req->fs)
2233                                 current->fs = req->fs;
2234                         else
2235                                 current->fs = old_fs_struct;
2236                         task_unlock(current);
2237                 }
2238
2239                 ret = 0;
2240                 if (io_req_needs_user(req) && !cur_mm) {
2241                         if (!mmget_not_zero(ctx->sqo_mm)) {
2242                                 ret = -EFAULT;
2243                                 goto end_req;
2244                         } else {
2245                                 cur_mm = ctx->sqo_mm;
2246                                 use_mm(cur_mm);
2247                                 old_fs = get_fs();
2248                                 set_fs(USER_DS);
2249                         }
2250                 }
2251
2252                 if (!ret) {
2253                         req->work_task = current;
2254
2255                         /*
2256                          * Pairs with the smp_store_mb() (B) in
2257                          * io_cancel_async_work().
2258                          */
2259                         smp_mb(); /* A */
2260                         if (req->flags & REQ_F_CANCEL) {
2261                                 ret = -ECANCELED;
2262                                 goto end_req;
2263                         }
2264
2265                         s->has_user = cur_mm != NULL;
2266                         s->needs_lock = true;
2267                         do {
2268                                 ret = __io_submit_sqe(ctx, req, s, false);
2269                                 /*
2270                                  * We can get EAGAIN for polled IO even though
2271                                  * we're forcing a sync submission from here,
2272                                  * since we can't wait for request slots on the
2273                                  * block side.
2274                                  */
2275                                 if (ret != -EAGAIN)
2276                                         break;
2277                                 cond_resched();
2278                         } while (1);
2279                 }
2280 end_req:
2281                 spin_lock_irq(&ctx->task_lock);
2282                 list_del_init(&req->task_list);
2283                 spin_unlock_irq(&ctx->task_lock);
2284
2285                 /* drop submission reference */
2286                 io_put_req(req);
2287
2288                 if (ret) {
2289                         io_cqring_add_event(ctx, sqe->user_data, ret);
2290                         io_put_req(req);
2291                 }
2292
2293                 /* async context always use a copy of the sqe */
2294                 kfree(sqe);
2295
2296                 /* req from defer and link list needn't decrease async cnt */
2297                 if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
2298                         goto out;
2299
2300                 if (!async_list)
2301                         break;
2302                 if (!list_empty(&req_list)) {
2303                         req = list_first_entry(&req_list, struct io_kiocb,
2304                                                 list);
2305                         list_del(&req->list);
2306                         continue;
2307                 }
2308                 if (list_empty(&async_list->list))
2309                         break;
2310
2311                 req = NULL;
2312                 spin_lock(&async_list->lock);
2313                 if (list_empty(&async_list->list)) {
2314                         spin_unlock(&async_list->lock);
2315                         break;
2316                 }
2317                 list_splice_init(&async_list->list, &req_list);
2318                 spin_unlock(&async_list->lock);
2319
2320                 req = list_first_entry(&req_list, struct io_kiocb, list);
2321                 list_del(&req->list);
2322         } while (req);
2323
2324         /*
2325          * Rare case of racing with a submitter. If we find the count has
2326          * dropped to zero AND we have pending work items, then restart
2327          * the processing. This is a tiny race window.
2328          */
2329         if (async_list) {
2330                 ret = atomic_dec_return(&async_list->cnt);
2331                 while (!ret && !list_empty(&async_list->list)) {
2332                         spin_lock(&async_list->lock);
2333                         atomic_inc(&async_list->cnt);
2334                         list_splice_init(&async_list->list, &req_list);
2335                         spin_unlock(&async_list->lock);
2336
2337                         if (!list_empty(&req_list)) {
2338                                 req = list_first_entry(&req_list,
2339                                                         struct io_kiocb, list);
2340                                 list_del(&req->list);
2341                                 goto restart;
2342                         }
2343                         ret = atomic_dec_return(&async_list->cnt);
2344                 }
2345         }
2346
2347 out:
2348         disallow_signal(SIGINT);
2349         if (cur_mm) {
2350                 set_fs(old_fs);
2351                 unuse_mm(cur_mm);
2352                 mmput(cur_mm);
2353         }
2354         revert_creds(old_cred);
2355         if (old_fs_struct != current->fs) {
2356                 task_lock(current);
2357                 current->fs = old_fs_struct;
2358                 task_unlock(current);
2359         }
2360 }
2361
2362 /*
2363  * See if we can piggy back onto previously submitted work, that is still
2364  * running. We currently only allow this if the new request is sequential
2365  * to the previous one we punted.
2366  */
2367 static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
2368 {
2369         bool ret;
2370
2371         if (!list)
2372                 return false;
2373         if (!(req->flags & REQ_F_SEQ_PREV))
2374                 return false;
2375         if (!atomic_read(&list->cnt))
2376                 return false;
2377
2378         ret = true;
2379         spin_lock(&list->lock);
2380         list_add_tail(&req->list, &list->list);
2381         /*
2382          * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
2383          */
2384         smp_mb();
2385         if (!atomic_read(&list->cnt)) {
2386                 list_del_init(&req->list);
2387                 ret = false;
2388         }
2389
2390         if (ret) {
2391                 struct io_ring_ctx *ctx = req->ctx;
2392
2393                 req->files = current->files;
2394
2395                 spin_lock_irq(&ctx->task_lock);
2396                 list_add(&req->task_list, &ctx->task_list);
2397                 req->work_task = NULL;
2398                 spin_unlock_irq(&ctx->task_lock);
2399         }
2400         spin_unlock(&list->lock);
2401         return ret;
2402 }
2403
2404 static bool io_op_needs_file(struct io_kiocb *req)
2405 {
2406         switch (req->submit.opcode) {
2407         case IORING_OP_NOP:
2408         case IORING_OP_POLL_REMOVE:
2409         case IORING_OP_TIMEOUT:
2410                 return false;
2411         default:
2412                 return true;
2413         }
2414 }
2415
2416 static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
2417                            struct io_submit_state *state, struct io_kiocb *req)
2418 {
2419         unsigned flags;
2420         int fd;
2421
2422         flags = READ_ONCE(s->sqe->flags);
2423         fd = READ_ONCE(s->sqe->fd);
2424
2425         if (flags & IOSQE_IO_DRAIN)
2426                 req->flags |= REQ_F_IO_DRAIN;
2427         /*
2428          * All io need record the previous position, if LINK vs DARIN,
2429          * it can be used to mark the position of the first IO in the
2430          * link list.
2431          */
2432         req->sequence = s->sequence;
2433
2434         if (!io_op_needs_file(req))
2435                 return 0;
2436
2437         if (flags & IOSQE_FIXED_FILE) {
2438                 if (unlikely(!ctx->user_files ||
2439                     (unsigned) fd >= ctx->nr_user_files))
2440                         return -EBADF;
2441                 req->file = ctx->user_files[fd];
2442                 req->flags |= REQ_F_FIXED_FILE;
2443         } else {
2444                 if (s->needs_fixed_file)
2445                         return -EBADF;
2446                 req->file = io_file_get(state, fd);
2447                 if (unlikely(!req->file))
2448                         return -EBADF;
2449         }
2450
2451         return 0;
2452 }
2453
2454 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2455                         struct sqe_submit *s)
2456 {
2457         int ret;
2458
2459         ret = __io_submit_sqe(ctx, req, s, true);
2460
2461         /*
2462          * We async punt it if the file wasn't marked NOWAIT, or if the file
2463          * doesn't support non-blocking read/write attempts
2464          */
2465         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
2466             (req->flags & REQ_F_MUST_PUNT))) {
2467                 struct io_uring_sqe *sqe_copy;
2468
2469                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2470                 if (sqe_copy) {
2471                         struct async_list *list;
2472
2473                         s->sqe = sqe_copy;
2474                         memcpy(&req->submit, s, sizeof(*s));
2475                         list = io_async_list_from_req(ctx, req);
2476                         if (!io_add_to_prev_work(list, req)) {
2477                                 if (list)
2478                                         atomic_inc(&list->cnt);
2479                                 INIT_WORK(&req->work, io_sq_wq_submit_work);
2480                                 io_queue_async_work(ctx, req);
2481                         }
2482
2483                         /*
2484                          * Queued up for async execution, worker will release
2485                          * submit reference when the iocb is actually submitted.
2486                          */
2487                         return 0;
2488                 }
2489         }
2490
2491         /* drop submission reference */
2492         io_put_req(req);
2493
2494         /* and drop final reference, if we failed */
2495         if (ret) {
2496                 io_cqring_add_event(ctx, req->user_data, ret);
2497                 if (req->flags & REQ_F_LINK)
2498                         req->flags |= REQ_F_FAIL_LINK;
2499                 io_put_req(req);
2500         }
2501
2502         return ret;
2503 }
2504
2505 static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2506                         struct sqe_submit *s)
2507 {
2508         int ret;
2509
2510         ret = io_req_defer(ctx, req, s);
2511         if (ret) {
2512                 if (ret != -EIOCBQUEUED) {
2513                         io_free_req(req);
2514                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
2515                 }
2516                 return 0;
2517         }
2518
2519         return __io_queue_sqe(ctx, req, s);
2520 }
2521
2522 static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
2523                               struct sqe_submit *s, struct io_kiocb *shadow)
2524 {
2525         int ret;
2526         int need_submit = false;
2527
2528         if (!shadow)
2529                 return io_queue_sqe(ctx, req, s);
2530
2531         /*
2532          * Mark the first IO in link list as DRAIN, let all the following
2533          * IOs enter the defer list. all IO needs to be completed before link
2534          * list.
2535          */
2536         req->flags |= REQ_F_IO_DRAIN;
2537         ret = io_req_defer(ctx, req, s);
2538         if (ret) {
2539                 if (ret != -EIOCBQUEUED) {
2540                         io_free_req(req);
2541                         __io_free_req(shadow);
2542                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
2543                         return 0;
2544                 }
2545         } else {
2546                 /*
2547                  * If ret == 0 means that all IOs in front of link io are
2548                  * running done. let's queue link head.
2549                  */
2550                 need_submit = true;
2551         }
2552
2553         /* Insert shadow req to defer_list, blocking next IOs */
2554         spin_lock_irq(&ctx->completion_lock);
2555         list_add_tail(&shadow->list, &ctx->defer_list);
2556         spin_unlock_irq(&ctx->completion_lock);
2557
2558         if (need_submit)
2559                 return __io_queue_sqe(ctx, req, s);
2560
2561         return 0;
2562 }
2563
2564 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
2565
2566 static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
2567                           struct io_submit_state *state, struct io_kiocb **link)
2568 {
2569         struct io_uring_sqe *sqe_copy;
2570         struct io_kiocb *req;
2571         int ret;
2572
2573         /* enforce forwards compatibility on users */
2574         if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
2575                 ret = -EINVAL;
2576                 goto err;
2577         }
2578
2579         req = io_get_req(ctx, state);
2580         if (unlikely(!req)) {
2581                 ret = -EAGAIN;
2582                 goto err;
2583         }
2584
2585         memcpy(&req->submit, s, sizeof(*s));
2586         ret = io_req_set_file(ctx, s, state, req);
2587         if (unlikely(ret)) {
2588 err_req:
2589                 io_free_req(req);
2590 err:
2591                 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2592                 return;
2593         }
2594
2595         req->user_data = s->sqe->user_data;
2596
2597 #if defined(CONFIG_NET)
2598         switch (req->submit.opcode) {
2599         case IORING_OP_SENDMSG:
2600         case IORING_OP_RECVMSG:
2601                 spin_lock(&current->fs->lock);
2602                 if (!current->fs->in_exec) {
2603                         req->fs = current->fs;
2604                         req->fs->users++;
2605                 }
2606                 spin_unlock(&current->fs->lock);
2607                 if (!req->fs) {
2608                         ret = -EAGAIN;
2609                         goto err_req;
2610                 }
2611         }
2612 #endif
2613
2614         /*
2615          * If we already have a head request, queue this one for async
2616          * submittal once the head completes. If we don't have a head but
2617          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
2618          * submitted sync once the chain is complete. If none of those
2619          * conditions are true (normal request), then just queue it.
2620          */
2621         if (*link) {
2622                 struct io_kiocb *prev = *link;
2623
2624                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2625                 if (!sqe_copy) {
2626                         ret = -EAGAIN;
2627                         goto err_req;
2628                 }
2629
2630                 s->sqe = sqe_copy;
2631                 memcpy(&req->submit, s, sizeof(*s));
2632                 list_add_tail(&req->list, &prev->link_list);
2633         } else if (s->sqe->flags & IOSQE_IO_LINK) {
2634                 req->flags |= REQ_F_LINK;
2635
2636                 memcpy(&req->submit, s, sizeof(*s));
2637                 INIT_LIST_HEAD(&req->link_list);
2638                 *link = req;
2639         } else {
2640                 io_queue_sqe(ctx, req, s);
2641         }
2642 }
2643
2644 /*
2645  * Batched submission is done, ensure local IO is flushed out.
2646  */
2647 static void io_submit_state_end(struct io_submit_state *state)
2648 {
2649         blk_finish_plug(&state->plug);
2650         io_file_put(state);
2651         if (state->free_reqs)
2652                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
2653                                         &state->reqs[state->cur_req]);
2654 }
2655
2656 /*
2657  * Start submission side cache.
2658  */
2659 static void io_submit_state_start(struct io_submit_state *state,
2660                                   struct io_ring_ctx *ctx, unsigned max_ios)
2661 {
2662         blk_start_plug(&state->plug);
2663         state->free_reqs = 0;
2664         state->file = NULL;
2665         state->ios_left = max_ios;
2666 }
2667
2668 static void io_commit_sqring(struct io_ring_ctx *ctx)
2669 {
2670         struct io_rings *rings = ctx->rings;
2671
2672         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
2673                 /*
2674                  * Ensure any loads from the SQEs are done at this point,
2675                  * since once we write the new head, the application could
2676                  * write new data to them.
2677                  */
2678                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2679         }
2680 }
2681
2682 /*
2683  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
2684  * that is mapped by userspace. This means that care needs to be taken to
2685  * ensure that reads are stable, as we cannot rely on userspace always
2686  * being a good citizen. If members of the sqe are validated and then later
2687  * used, it's important that those reads are done through READ_ONCE() to
2688  * prevent a re-load down the line.
2689  */
2690 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
2691 {
2692         struct io_rings *rings = ctx->rings;
2693         u32 *sq_array = ctx->sq_array;
2694         unsigned head;
2695
2696         /*
2697          * The cached sq head (or cq tail) serves two purposes:
2698          *
2699          * 1) allows us to batch the cost of updating the user visible
2700          *    head updates.
2701          * 2) allows the kernel side to track the head on its own, even
2702          *    though the application is the one updating it.
2703          */
2704         head = ctx->cached_sq_head;
2705         /* make sure SQ entry isn't read before tail */
2706         if (head == smp_load_acquire(&rings->sq.tail))
2707                 return false;
2708
2709         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
2710         if (head < ctx->sq_entries) {
2711                 s->index = head;
2712                 s->sqe = &ctx->sq_sqes[head];
2713                 s->opcode = READ_ONCE(s->sqe->opcode);
2714                 s->sequence = ctx->cached_sq_head;
2715                 ctx->cached_sq_head++;
2716                 return true;
2717         }
2718
2719         /* drop invalid entries */
2720         ctx->cached_sq_head++;
2721         ctx->cached_sq_dropped++;
2722         WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
2723         return false;
2724 }
2725
2726 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
2727                           bool has_user, bool mm_fault)
2728 {
2729         struct io_submit_state state, *statep = NULL;
2730         struct io_kiocb *link = NULL;
2731         struct io_kiocb *shadow_req = NULL;
2732         bool prev_was_link = false;
2733         int i, submitted = 0;
2734
2735         if (nr > IO_PLUG_THRESHOLD) {
2736                 io_submit_state_start(&state, ctx, nr);
2737                 statep = &state;
2738         }
2739
2740         for (i = 0; i < nr; i++) {
2741                 struct sqe_submit s;
2742
2743                 if (!io_get_sqring(ctx, &s))
2744                         break;
2745
2746                 /*
2747                  * If previous wasn't linked and we have a linked command,
2748                  * that's the end of the chain. Submit the previous link.
2749                  */
2750                 if (!prev_was_link && link) {
2751                         io_queue_link_head(ctx, link, &link->submit, shadow_req);
2752                         link = NULL;
2753                         shadow_req = NULL;
2754                 }
2755                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2756
2757                 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
2758                         if (!shadow_req) {
2759                                 shadow_req = io_get_req(ctx, NULL);
2760                                 if (unlikely(!shadow_req))
2761                                         goto out;
2762                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2763                                 refcount_dec(&shadow_req->refs);
2764                         }
2765                         shadow_req->sequence = s.sequence;
2766                 }
2767
2768 out:
2769                 if (unlikely(mm_fault)) {
2770                         io_cqring_add_event(ctx, s.sqe->user_data,
2771                                                 -EFAULT);
2772                 } else {
2773                         s.has_user = has_user;
2774                         s.needs_lock = true;
2775                         s.needs_fixed_file = true;
2776                         io_submit_sqe(ctx, &s, statep, &link);
2777                         submitted++;
2778                 }
2779         }
2780
2781         if (link)
2782                 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2783         if (statep)
2784                 io_submit_state_end(&state);
2785
2786         return submitted;
2787 }
2788
2789 static int io_sq_thread(void *data)
2790 {
2791         struct io_ring_ctx *ctx = data;
2792         struct mm_struct *cur_mm = NULL;
2793         const struct cred *old_cred;
2794         mm_segment_t old_fs;
2795         DEFINE_WAIT(wait);
2796         unsigned inflight;
2797         unsigned long timeout;
2798
2799         complete(&ctx->sqo_thread_started);
2800
2801         old_fs = get_fs();
2802         set_fs(USER_DS);
2803         old_cred = override_creds(ctx->creds);
2804
2805         timeout = inflight = 0;
2806         while (!kthread_should_park()) {
2807                 bool mm_fault = false;
2808                 unsigned int to_submit;
2809
2810                 if (inflight) {
2811                         unsigned nr_events = 0;
2812
2813                         if (ctx->flags & IORING_SETUP_IOPOLL) {
2814                                 /*
2815                                  * inflight is the count of the maximum possible
2816                                  * entries we submitted, but it can be smaller
2817                                  * if we dropped some of them. If we don't have
2818                                  * poll entries available, then we know that we
2819                                  * have nothing left to poll for. Reset the
2820                                  * inflight count to zero in that case.
2821                                  */
2822                                 mutex_lock(&ctx->uring_lock);
2823                                 if (!list_empty(&ctx->poll_list))
2824                                         io_iopoll_getevents(ctx, &nr_events, 0);
2825                                 else
2826                                         inflight = 0;
2827                                 mutex_unlock(&ctx->uring_lock);
2828                         } else {
2829                                 /*
2830                                  * Normal IO, just pretend everything completed.
2831                                  * We don't have to poll completions for that.
2832                                  */
2833                                 nr_events = inflight;
2834                         }
2835
2836                         inflight -= nr_events;
2837                         if (!inflight)
2838                                 timeout = jiffies + ctx->sq_thread_idle;
2839                 }
2840
2841                 to_submit = io_sqring_entries(ctx);
2842                 if (!to_submit) {
2843                         /*
2844                          * Drop cur_mm before scheduling, we can't hold it for
2845                          * long periods (or over schedule()). Do this before
2846                          * adding ourselves to the waitqueue, as the unuse/drop
2847                          * may sleep.
2848                          */
2849                         if (cur_mm) {
2850                                 unuse_mm(cur_mm);
2851                                 mmput(cur_mm);
2852                                 cur_mm = NULL;
2853                         }
2854
2855                         /*
2856                          * We're polling. If we're within the defined idle
2857                          * period, then let us spin without work before going
2858                          * to sleep.
2859                          */
2860                         if (inflight || !time_after(jiffies, timeout)) {
2861                                 cond_resched();
2862                                 continue;
2863                         }
2864
2865                         prepare_to_wait(&ctx->sqo_wait, &wait,
2866                                                 TASK_INTERRUPTIBLE);
2867
2868                         /* Tell userspace we may need a wakeup call */
2869                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
2870                         /* make sure to read SQ tail after writing flags */
2871                         smp_mb();
2872
2873                         to_submit = io_sqring_entries(ctx);
2874                         if (!to_submit) {
2875                                 if (kthread_should_park()) {
2876                                         finish_wait(&ctx->sqo_wait, &wait);
2877                                         break;
2878                                 }
2879                                 if (signal_pending(current))
2880                                         flush_signals(current);
2881                                 schedule();
2882                                 finish_wait(&ctx->sqo_wait, &wait);
2883
2884                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2885                                 continue;
2886                         }
2887                         finish_wait(&ctx->sqo_wait, &wait);
2888
2889                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2890                 }
2891
2892                 /* Unless all new commands are FIXED regions, grab mm */
2893                 if (!cur_mm) {
2894                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
2895                         if (!mm_fault) {
2896                                 use_mm(ctx->sqo_mm);
2897                                 cur_mm = ctx->sqo_mm;
2898                         }
2899                 }
2900
2901                 to_submit = min(to_submit, ctx->sq_entries);
2902                 inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
2903                                            mm_fault);
2904
2905                 /* Commit SQ ring head once we've consumed all SQEs */
2906                 io_commit_sqring(ctx);
2907         }
2908
2909         set_fs(old_fs);
2910         if (cur_mm) {
2911                 unuse_mm(cur_mm);
2912                 mmput(cur_mm);
2913         }
2914         revert_creds(old_cred);
2915
2916         kthread_parkme();
2917
2918         return 0;
2919 }
2920
2921 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
2922 {
2923         struct io_submit_state state, *statep = NULL;
2924         struct io_kiocb *link = NULL;
2925         struct io_kiocb *shadow_req = NULL;
2926         bool prev_was_link = false;
2927         int i, submit = 0;
2928
2929         if (to_submit > IO_PLUG_THRESHOLD) {
2930                 io_submit_state_start(&state, ctx, to_submit);
2931                 statep = &state;
2932         }
2933
2934         for (i = 0; i < to_submit; i++) {
2935                 struct sqe_submit s;
2936
2937                 if (!io_get_sqring(ctx, &s))
2938                         break;
2939
2940                 /*
2941                  * If previous wasn't linked and we have a linked command,
2942                  * that's the end of the chain. Submit the previous link.
2943                  */
2944                 if (!prev_was_link && link) {
2945                         io_queue_link_head(ctx, link, &link->submit, shadow_req);
2946                         link = NULL;
2947                         shadow_req = NULL;
2948                 }
2949                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2950
2951                 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
2952                         if (!shadow_req) {
2953                                 shadow_req = io_get_req(ctx, NULL);
2954                                 if (unlikely(!shadow_req))
2955                                         goto out;
2956                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2957                                 refcount_dec(&shadow_req->refs);
2958                         }
2959                         shadow_req->sequence = s.sequence;
2960                 }
2961
2962 out:
2963                 s.has_user = true;
2964                 s.needs_lock = false;
2965                 s.needs_fixed_file = false;
2966                 submit++;
2967                 io_submit_sqe(ctx, &s, statep, &link);
2968         }
2969
2970         if (link)
2971                 io_queue_link_head(ctx, link, &link->submit, shadow_req);
2972         if (statep)
2973                 io_submit_state_end(statep);
2974
2975         io_commit_sqring(ctx);
2976
2977         return submit;
2978 }
2979
2980 struct io_wait_queue {
2981         struct wait_queue_entry wq;
2982         struct io_ring_ctx *ctx;
2983         unsigned to_wait;
2984         unsigned nr_timeouts;
2985 };
2986
2987 static inline bool io_should_wake(struct io_wait_queue *iowq)
2988 {
2989         struct io_ring_ctx *ctx = iowq->ctx;
2990
2991         /*
2992          * Wake up if we have enough events, or if a timeout occured since we
2993          * started waiting. For timeouts, we always want to return to userspace,
2994          * regardless of event count.
2995          */
2996         return io_cqring_events(ctx->rings) >= iowq->to_wait ||
2997                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2998 }
2999
3000 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
3001                             int wake_flags, void *key)
3002 {
3003         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
3004                                                         wq);
3005
3006         if (!io_should_wake(iowq))
3007                 return -1;
3008
3009         return autoremove_wake_function(curr, mode, wake_flags, key);
3010 }
3011
3012 /*
3013  * Wait until events become available, if we don't already have some. The
3014  * application must reap them itself, as they reside on the shared cq ring.
3015  */
3016 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
3017                           const sigset_t __user *sig, size_t sigsz)
3018 {
3019         struct io_wait_queue iowq = {
3020                 .wq = {
3021                         .private        = current,
3022                         .func           = io_wake_function,
3023                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
3024                 },
3025                 .ctx            = ctx,
3026                 .to_wait        = min_events,
3027         };
3028         struct io_rings *rings = ctx->rings;
3029         int ret;
3030
3031         if (io_cqring_events(rings) >= min_events)
3032                 return 0;
3033
3034         if (sig) {
3035 #ifdef CONFIG_COMPAT
3036                 if (in_compat_syscall())
3037                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
3038                                                       sigsz);
3039                 else
3040 #endif
3041                         ret = set_user_sigmask(sig, sigsz);
3042
3043                 if (ret)
3044                         return ret;
3045         }
3046
3047         ret = 0;
3048         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
3049         do {
3050                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
3051                                                 TASK_INTERRUPTIBLE);
3052                 if (io_should_wake(&iowq))
3053                         break;
3054                 schedule();
3055                 if (signal_pending(current)) {
3056                         ret = -ERESTARTSYS;
3057                         break;
3058                 }
3059         } while (1);
3060         finish_wait(&ctx->wait, &iowq.wq);
3061
3062         restore_saved_sigmask_unless(ret == -ERESTARTSYS);
3063         if (ret == -ERESTARTSYS)
3064                 ret = -EINTR;
3065
3066         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
3067 }
3068
3069 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
3070 {
3071 #if defined(CONFIG_UNIX)
3072         if (ctx->ring_sock) {
3073                 struct sock *sock = ctx->ring_sock->sk;
3074                 struct sk_buff *skb;
3075
3076                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
3077                         kfree_skb(skb);
3078         }
3079 #else
3080         int i;
3081
3082         for (i = 0; i < ctx->nr_user_files; i++)
3083                 fput(ctx->user_files[i]);
3084 #endif
3085 }
3086
3087 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
3088 {
3089         if (!ctx->user_files)
3090                 return -ENXIO;
3091
3092         __io_sqe_files_unregister(ctx);
3093         kfree(ctx->user_files);
3094         ctx->user_files = NULL;
3095         ctx->nr_user_files = 0;
3096         return 0;
3097 }
3098
3099 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
3100 {
3101         if (ctx->sqo_thread) {
3102                 wait_for_completion(&ctx->sqo_thread_started);
3103                 /*
3104                  * The park is a bit of a work-around, without it we get
3105                  * warning spews on shutdown with SQPOLL set and affinity
3106                  * set to a single CPU.
3107                  */
3108                 kthread_park(ctx->sqo_thread);
3109                 kthread_stop(ctx->sqo_thread);
3110                 ctx->sqo_thread = NULL;
3111         }
3112 }
3113
3114 static void io_finish_async(struct io_ring_ctx *ctx)
3115 {
3116         int i;
3117
3118         io_sq_thread_stop(ctx);
3119
3120         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
3121                 if (ctx->sqo_wq[i]) {
3122                         destroy_workqueue(ctx->sqo_wq[i]);
3123                         ctx->sqo_wq[i] = NULL;
3124                 }
3125         }
3126 }
3127
3128 #if defined(CONFIG_UNIX)
3129 static void io_destruct_skb(struct sk_buff *skb)
3130 {
3131         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
3132         int i;
3133
3134         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
3135                 if (ctx->sqo_wq[i])
3136                         flush_workqueue(ctx->sqo_wq[i]);
3137
3138         unix_destruct_scm(skb);
3139 }
3140
3141 /*
3142  * Ensure the UNIX gc is aware of our file set, so we are certain that
3143  * the io_uring can be safely unregistered on process exit, even if we have
3144  * loops in the file referencing.
3145  */
3146 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
3147 {
3148         struct sock *sk = ctx->ring_sock->sk;
3149         struct scm_fp_list *fpl;
3150         struct sk_buff *skb;
3151         int i;
3152
3153         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
3154         if (!fpl)
3155                 return -ENOMEM;
3156
3157         skb = alloc_skb(0, GFP_KERNEL);
3158         if (!skb) {
3159                 kfree(fpl);
3160                 return -ENOMEM;
3161         }
3162
3163         skb->sk = sk;
3164         skb->destructor = io_destruct_skb;
3165
3166         fpl->user = get_uid(ctx->user);
3167         for (i = 0; i < nr; i++) {
3168                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
3169                 unix_inflight(fpl->user, fpl->fp[i]);
3170         }
3171
3172         fpl->max = fpl->count = nr;
3173         UNIXCB(skb).fp = fpl;
3174         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
3175         skb_queue_head(&sk->sk_receive_queue, skb);
3176
3177         for (i = 0; i < nr; i++)
3178                 fput(fpl->fp[i]);
3179
3180         return 0;
3181 }
3182
3183 /*
3184  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
3185  * causes regular reference counting to break down. We rely on the UNIX
3186  * garbage collection to take care of this problem for us.
3187  */
3188 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3189 {
3190         unsigned left, total;
3191         int ret = 0;
3192
3193         total = 0;
3194         left = ctx->nr_user_files;
3195         while (left) {
3196                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
3197
3198                 ret = __io_sqe_files_scm(ctx, this_files, total);
3199                 if (ret)
3200                         break;
3201                 left -= this_files;
3202                 total += this_files;
3203         }
3204
3205         if (!ret)
3206                 return 0;
3207
3208         while (total < ctx->nr_user_files) {
3209                 fput(ctx->user_files[total]);
3210                 total++;
3211         }
3212
3213         return ret;
3214 }
3215 #else
3216 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3217 {
3218         return 0;
3219 }
3220 #endif
3221
3222 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
3223                                  unsigned nr_args)
3224 {
3225         __s32 __user *fds = (__s32 __user *) arg;
3226         int fd, ret = 0;
3227         unsigned i;
3228
3229         if (ctx->user_files)
3230                 return -EBUSY;
3231         if (!nr_args)
3232                 return -EINVAL;
3233         if (nr_args > IORING_MAX_FIXED_FILES)
3234                 return -EMFILE;
3235
3236         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
3237         if (!ctx->user_files)
3238                 return -ENOMEM;
3239
3240         for (i = 0; i < nr_args; i++) {
3241                 ret = -EFAULT;
3242                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
3243                         break;
3244
3245                 ctx->user_files[i] = fget(fd);
3246
3247                 ret = -EBADF;
3248                 if (!ctx->user_files[i])
3249                         break;
3250                 /*
3251                  * Don't allow io_uring instances to be registered. If UNIX
3252                  * isn't enabled, then this causes a reference cycle and this
3253                  * instance can never get freed. If UNIX is enabled we'll
3254                  * handle it just fine, but there's still no point in allowing
3255                  * a ring fd as it doesn't support regular read/write anyway.
3256                  */
3257                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
3258                         fput(ctx->user_files[i]);
3259                         break;
3260                 }
3261                 ctx->nr_user_files++;
3262                 ret = 0;
3263         }
3264
3265         if (ret) {
3266                 for (i = 0; i < ctx->nr_user_files; i++)
3267                         fput(ctx->user_files[i]);
3268
3269                 kfree(ctx->user_files);
3270                 ctx->user_files = NULL;
3271                 ctx->nr_user_files = 0;
3272                 return ret;
3273         }
3274
3275         ret = io_sqe_files_scm(ctx);
3276         if (ret)
3277                 io_sqe_files_unregister(ctx);
3278
3279         return ret;
3280 }
3281
3282 static int io_sq_offload_start(struct io_ring_ctx *ctx,
3283                                struct io_uring_params *p)
3284 {
3285         int ret;
3286
3287         mmgrab(current->mm);
3288         ctx->sqo_mm = current->mm;
3289
3290         if (ctx->flags & IORING_SETUP_SQPOLL) {
3291                 ret = -EPERM;
3292                 if (!capable(CAP_SYS_ADMIN))
3293                         goto err;
3294
3295                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
3296                 if (!ctx->sq_thread_idle)
3297                         ctx->sq_thread_idle = HZ;
3298
3299                 if (p->flags & IORING_SETUP_SQ_AFF) {
3300                         int cpu = p->sq_thread_cpu;
3301
3302                         ret = -EINVAL;
3303                         if (cpu >= nr_cpu_ids)
3304                                 goto err;
3305                         if (!cpu_online(cpu))
3306                                 goto err;
3307
3308                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
3309                                                         ctx, cpu,
3310                                                         "io_uring-sq");
3311                 } else {
3312                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
3313                                                         "io_uring-sq");
3314                 }
3315                 if (IS_ERR(ctx->sqo_thread)) {
3316                         ret = PTR_ERR(ctx->sqo_thread);
3317                         ctx->sqo_thread = NULL;
3318                         goto err;
3319                 }
3320                 wake_up_process(ctx->sqo_thread);
3321         } else if (p->flags & IORING_SETUP_SQ_AFF) {
3322                 /* Can't have SQ_AFF without SQPOLL */
3323                 ret = -EINVAL;
3324                 goto err;
3325         }
3326
3327         /* Do QD, or 2 * CPUS, whatever is smallest */
3328         ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
3329                         WQ_UNBOUND | WQ_FREEZABLE,
3330                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
3331         if (!ctx->sqo_wq[0]) {
3332                 ret = -ENOMEM;
3333                 goto err;
3334         }
3335
3336         /*
3337          * This is for buffered writes, where we want to limit the parallelism
3338          * due to file locking in file systems. As "normal" buffered writes
3339          * should parellelize on writeout quite nicely, limit us to having 2
3340          * pending. This avoids massive contention on the inode when doing
3341          * buffered async writes.
3342          */
3343         ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
3344                                                 WQ_UNBOUND | WQ_FREEZABLE, 2);
3345         if (!ctx->sqo_wq[1]) {
3346                 ret = -ENOMEM;
3347                 goto err;
3348         }
3349
3350         return 0;
3351 err:
3352         io_finish_async(ctx);
3353         mmdrop(ctx->sqo_mm);
3354         ctx->sqo_mm = NULL;
3355         return ret;
3356 }
3357
3358 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
3359 {
3360         atomic_long_sub(nr_pages, &user->locked_vm);
3361 }
3362
3363 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
3364 {
3365         unsigned long page_limit, cur_pages, new_pages;
3366
3367         /* Don't allow more pages than we can safely lock */
3368         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
3369
3370         do {
3371                 cur_pages = atomic_long_read(&user->locked_vm);
3372                 new_pages = cur_pages + nr_pages;
3373                 if (new_pages > page_limit)
3374                         return -ENOMEM;
3375         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
3376                                         new_pages) != cur_pages);
3377
3378         return 0;
3379 }
3380
3381 static void io_mem_free(void *ptr)
3382 {
3383         struct page *page;
3384
3385         if (!ptr)
3386                 return;
3387
3388         page = virt_to_head_page(ptr);
3389         if (put_page_testzero(page))
3390                 free_compound_page(page);
3391 }
3392
3393 static void *io_mem_alloc(size_t size)
3394 {
3395         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
3396                                 __GFP_NORETRY;
3397
3398         return (void *) __get_free_pages(gfp_flags, get_order(size));
3399 }
3400
3401 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
3402                                 size_t *sq_offset)
3403 {
3404         struct io_rings *rings;
3405         size_t off, sq_array_size;
3406
3407         off = struct_size(rings, cqes, cq_entries);
3408         if (off == SIZE_MAX)
3409                 return SIZE_MAX;
3410
3411 #ifdef CONFIG_SMP
3412         off = ALIGN(off, SMP_CACHE_BYTES);
3413         if (off == 0)
3414                 return SIZE_MAX;
3415 #endif
3416
3417         if (sq_offset)
3418                 *sq_offset = off;
3419
3420         sq_array_size = array_size(sizeof(u32), sq_entries);
3421         if (sq_array_size == SIZE_MAX)
3422                 return SIZE_MAX;
3423
3424         if (check_add_overflow(off, sq_array_size, &off))
3425                 return SIZE_MAX;
3426
3427         return off;
3428 }
3429
3430 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
3431 {
3432         size_t pages;
3433
3434         pages = (size_t)1 << get_order(
3435                 rings_size(sq_entries, cq_entries, NULL));
3436         pages += (size_t)1 << get_order(
3437                 array_size(sizeof(struct io_uring_sqe), sq_entries));
3438
3439         return pages;
3440 }
3441
3442 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
3443 {
3444         int i, j;
3445
3446         if (!ctx->user_bufs)
3447                 return -ENXIO;
3448
3449         for (i = 0; i < ctx->nr_user_bufs; i++) {
3450                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3451
3452                 for (j = 0; j < imu->nr_bvecs; j++)
3453                         put_user_page(imu->bvec[j].bv_page);
3454
3455                 if (ctx->account_mem)
3456                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
3457                 kvfree(imu->bvec);
3458                 imu->nr_bvecs = 0;
3459         }
3460
3461         kfree(ctx->user_bufs);
3462         ctx->user_bufs = NULL;
3463         ctx->nr_user_bufs = 0;
3464         return 0;
3465 }
3466
3467 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
3468                        void __user *arg, unsigned index)
3469 {
3470         struct iovec __user *src;
3471
3472 #ifdef CONFIG_COMPAT
3473         if (ctx->compat) {
3474                 struct compat_iovec __user *ciovs;
3475                 struct compat_iovec ciov;
3476
3477                 ciovs = (struct compat_iovec __user *) arg;
3478                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
3479                         return -EFAULT;
3480
3481                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
3482                 dst->iov_len = ciov.iov_len;
3483                 return 0;
3484         }
3485 #endif
3486         src = (struct iovec __user *) arg;
3487         if (copy_from_user(dst, &src[index], sizeof(*dst)))
3488                 return -EFAULT;
3489         return 0;
3490 }
3491
3492 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
3493                                   unsigned nr_args)
3494 {
3495         struct vm_area_struct **vmas = NULL;
3496         struct page **pages = NULL;
3497         int i, j, got_pages = 0;
3498         int ret = -EINVAL;
3499
3500         if (ctx->user_bufs)
3501                 return -EBUSY;
3502         if (!nr_args || nr_args > UIO_MAXIOV)
3503                 return -EINVAL;
3504
3505         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
3506                                         GFP_KERNEL);
3507         if (!ctx->user_bufs)
3508                 return -ENOMEM;
3509
3510         for (i = 0; i < nr_args; i++) {
3511                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3512                 unsigned long off, start, end, ubuf;
3513                 int pret, nr_pages;
3514                 struct iovec iov;
3515                 size_t size;
3516
3517                 ret = io_copy_iov(ctx, &iov, arg, i);
3518                 if (ret)
3519                         goto err;
3520
3521                 /*
3522                  * Don't impose further limits on the size and buffer
3523                  * constraints here, we'll -EINVAL later when IO is
3524                  * submitted if they are wrong.
3525                  */
3526                 ret = -EFAULT;
3527                 if (!iov.iov_base || !iov.iov_len)
3528                         goto err;
3529
3530                 /* arbitrary limit, but we need something */
3531                 if (iov.iov_len > SZ_1G)
3532                         goto err;
3533
3534                 ubuf = (unsigned long) iov.iov_base;
3535                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3536                 start = ubuf >> PAGE_SHIFT;
3537                 nr_pages = end - start;
3538
3539                 if (ctx->account_mem) {
3540                         ret = io_account_mem(ctx->user, nr_pages);
3541                         if (ret)
3542                                 goto err;
3543                 }
3544
3545                 ret = 0;
3546                 if (!pages || nr_pages > got_pages) {
3547                         kvfree(vmas);
3548                         kvfree(pages);
3549                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
3550                                                 GFP_KERNEL);
3551                         vmas = kvmalloc_array(nr_pages,
3552                                         sizeof(struct vm_area_struct *),
3553                                         GFP_KERNEL);
3554                         if (!pages || !vmas) {
3555                                 ret = -ENOMEM;
3556                                 if (ctx->account_mem)
3557                                         io_unaccount_mem(ctx->user, nr_pages);
3558                                 goto err;
3559                         }
3560                         got_pages = nr_pages;
3561                 }
3562
3563                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
3564                                                 GFP_KERNEL);
3565                 ret = -ENOMEM;
3566                 if (!imu->bvec) {
3567                         if (ctx->account_mem)
3568                                 io_unaccount_mem(ctx->user, nr_pages);
3569                         goto err;
3570                 }
3571
3572                 ret = 0;
3573                 down_read(&current->mm->mmap_sem);
3574                 pret = get_user_pages(ubuf, nr_pages,
3575                                       FOLL_WRITE | FOLL_LONGTERM,
3576                                       pages, vmas);
3577                 if (pret == nr_pages) {
3578                         /* don't support file backed memory */
3579                         for (j = 0; j < nr_pages; j++) {
3580                                 struct vm_area_struct *vma = vmas[j];
3581
3582                                 if (vma->vm_file &&
3583                                     !is_file_hugepages(vma->vm_file)) {
3584                                         ret = -EOPNOTSUPP;
3585                                         break;
3586                                 }
3587                         }
3588                 } else {
3589                         ret = pret < 0 ? pret : -EFAULT;
3590                 }
3591                 up_read(&current->mm->mmap_sem);
3592                 if (ret) {
3593                         /*
3594                          * if we did partial map, or found file backed vmas,
3595                          * release any pages we did get
3596                          */
3597                         if (pret > 0)
3598                                 put_user_pages(pages, pret);
3599                         if (ctx->account_mem)
3600                                 io_unaccount_mem(ctx->user, nr_pages);
3601                         kvfree(imu->bvec);
3602                         goto err;
3603                 }
3604
3605                 off = ubuf & ~PAGE_MASK;
3606                 size = iov.iov_len;
3607                 for (j = 0; j < nr_pages; j++) {
3608                         size_t vec_len;
3609
3610                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
3611                         imu->bvec[j].bv_page = pages[j];
3612                         imu->bvec[j].bv_len = vec_len;
3613                         imu->bvec[j].bv_offset = off;
3614                         off = 0;
3615                         size -= vec_len;
3616                 }
3617                 /* store original address for later verification */
3618                 imu->ubuf = ubuf;
3619                 imu->len = iov.iov_len;
3620                 imu->nr_bvecs = nr_pages;
3621
3622                 ctx->nr_user_bufs++;
3623         }
3624         kvfree(pages);
3625         kvfree(vmas);
3626         return 0;
3627 err:
3628         kvfree(pages);
3629         kvfree(vmas);
3630         io_sqe_buffer_unregister(ctx);
3631         return ret;
3632 }
3633
3634 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
3635 {
3636         __s32 __user *fds = arg;
3637         int fd;
3638
3639         if (ctx->cq_ev_fd)
3640                 return -EBUSY;
3641
3642         if (copy_from_user(&fd, fds, sizeof(*fds)))
3643                 return -EFAULT;
3644
3645         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
3646         if (IS_ERR(ctx->cq_ev_fd)) {
3647                 int ret = PTR_ERR(ctx->cq_ev_fd);
3648                 ctx->cq_ev_fd = NULL;
3649                 return ret;
3650         }
3651
3652         return 0;
3653 }
3654
3655 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
3656 {
3657         if (ctx->cq_ev_fd) {
3658                 eventfd_ctx_put(ctx->cq_ev_fd);
3659                 ctx->cq_ev_fd = NULL;
3660                 return 0;
3661         }
3662
3663         return -ENXIO;
3664 }
3665
3666 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
3667 {
3668         io_finish_async(ctx);
3669         if (ctx->sqo_mm)
3670                 mmdrop(ctx->sqo_mm);
3671
3672         io_iopoll_reap_events(ctx);
3673         io_sqe_buffer_unregister(ctx);
3674         io_sqe_files_unregister(ctx);
3675         io_eventfd_unregister(ctx);
3676
3677 #if defined(CONFIG_UNIX)
3678         if (ctx->ring_sock) {
3679                 ctx->ring_sock->file = NULL; /* so that iput() is called */
3680                 sock_release(ctx->ring_sock);
3681         }
3682 #endif
3683
3684         io_mem_free(ctx->rings);
3685         io_mem_free(ctx->sq_sqes);
3686
3687         percpu_ref_exit(&ctx->refs);
3688         if (ctx->account_mem)
3689                 io_unaccount_mem(ctx->user,
3690                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
3691         free_uid(ctx->user);
3692         if (ctx->creds)
3693                 put_cred(ctx->creds);
3694         kfree(ctx);
3695 }
3696
3697 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
3698 {
3699         struct io_ring_ctx *ctx = file->private_data;
3700         __poll_t mask = 0;
3701
3702         poll_wait(file, &ctx->cq_wait, wait);
3703         /*
3704          * synchronizes with barrier from wq_has_sleeper call in
3705          * io_commit_cqring
3706          */
3707         smp_rmb();
3708         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
3709             ctx->rings->sq_ring_entries)
3710                 mask |= EPOLLOUT | EPOLLWRNORM;
3711         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
3712                 mask |= EPOLLIN | EPOLLRDNORM;
3713
3714         return mask;
3715 }
3716
3717 static int io_uring_fasync(int fd, struct file *file, int on)
3718 {
3719         struct io_ring_ctx *ctx = file->private_data;
3720
3721         return fasync_helper(fd, file, on, &ctx->cq_fasync);
3722 }
3723
3724 static void io_cancel_async_work(struct io_ring_ctx *ctx,
3725                                  struct files_struct *files)
3726 {
3727         struct io_kiocb *req;
3728
3729         if (list_empty(&ctx->task_list))
3730                 return;
3731
3732         spin_lock_irq(&ctx->task_lock);
3733
3734         list_for_each_entry(req, &ctx->task_list, task_list) {
3735                 if (files && req->files != files)
3736                         continue;
3737
3738                 /*
3739                  * The below executes an smp_mb(), which matches with the
3740                  * smp_mb() (A) in io_sq_wq_submit_work() such that either
3741                  * we store REQ_F_CANCEL flag to req->flags or we see the
3742                  * req->work_task setted in io_sq_wq_submit_work().
3743                  */
3744                 smp_store_mb(req->flags, req->flags | REQ_F_CANCEL); /* B */
3745
3746                 if (req->work_task)
3747                         send_sig(SIGINT, req->work_task, 1);
3748         }
3749         spin_unlock_irq(&ctx->task_lock);
3750 }
3751
3752 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
3753 {
3754         mutex_lock(&ctx->uring_lock);
3755         percpu_ref_kill(&ctx->refs);
3756         mutex_unlock(&ctx->uring_lock);
3757
3758         io_cancel_async_work(ctx, NULL);
3759         io_kill_timeouts(ctx);
3760         io_poll_remove_all(ctx);
3761         io_iopoll_reap_events(ctx);
3762         wait_for_completion(&ctx->ctx_done);
3763         io_ring_ctx_free(ctx);
3764 }
3765
3766 static int io_uring_flush(struct file *file, void *data)
3767 {
3768         struct io_ring_ctx *ctx = file->private_data;
3769
3770         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
3771                 io_cancel_async_work(ctx, data);
3772
3773         return 0;
3774 }
3775
3776 static int io_uring_release(struct inode *inode, struct file *file)
3777 {
3778         struct io_ring_ctx *ctx = file->private_data;
3779
3780         file->private_data = NULL;
3781         io_ring_ctx_wait_and_kill(ctx);
3782         return 0;
3783 }
3784
3785 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3786 {
3787         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
3788         unsigned long sz = vma->vm_end - vma->vm_start;
3789         struct io_ring_ctx *ctx = file->private_data;
3790         unsigned long pfn;
3791         struct page *page;
3792         void *ptr;
3793
3794         switch (offset) {
3795         case IORING_OFF_SQ_RING:
3796         case IORING_OFF_CQ_RING:
3797                 ptr = ctx->rings;
3798                 break;
3799         case IORING_OFF_SQES:
3800                 ptr = ctx->sq_sqes;
3801                 break;
3802         default:
3803                 return -EINVAL;
3804         }
3805
3806         page = virt_to_head_page(ptr);
3807         if (sz > page_size(page))
3808                 return -EINVAL;
3809
3810         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
3811         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
3812 }
3813
3814 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
3815                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
3816                 size_t, sigsz)
3817 {
3818         struct io_ring_ctx *ctx;
3819         long ret = -EBADF;
3820         int submitted = 0;
3821         struct fd f;
3822
3823         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
3824                 return -EINVAL;
3825
3826         f = fdget(fd);
3827         if (!f.file)
3828                 return -EBADF;
3829
3830         ret = -EOPNOTSUPP;
3831         if (f.file->f_op != &io_uring_fops)
3832                 goto out_fput;
3833
3834         ret = -ENXIO;
3835         ctx = f.file->private_data;
3836         if (!percpu_ref_tryget(&ctx->refs))
3837                 goto out_fput;
3838
3839         /*
3840          * For SQ polling, the thread will do all submissions and completions.
3841          * Just return the requested submit count, and wake the thread if
3842          * we were asked to.
3843          */
3844         ret = 0;
3845         if (ctx->flags & IORING_SETUP_SQPOLL) {
3846                 if (flags & IORING_ENTER_SQ_WAKEUP)
3847                         wake_up(&ctx->sqo_wait);
3848                 submitted = to_submit;
3849         } else if (to_submit) {
3850                 to_submit = min(to_submit, ctx->sq_entries);
3851
3852                 mutex_lock(&ctx->uring_lock);
3853                 submitted = io_ring_submit(ctx, to_submit);
3854                 mutex_unlock(&ctx->uring_lock);
3855
3856                 if (submitted != to_submit)
3857                         goto out;
3858         }
3859         if (flags & IORING_ENTER_GETEVENTS) {
3860                 unsigned nr_events = 0;
3861
3862                 min_complete = min(min_complete, ctx->cq_entries);
3863
3864                 if (ctx->flags & IORING_SETUP_IOPOLL) {
3865                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
3866                 } else {
3867                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
3868                 }
3869         }
3870
3871 out:
3872         percpu_ref_put(&ctx->refs);
3873 out_fput:
3874         fdput(f);
3875         return submitted ? submitted : ret;
3876 }
3877
3878 static const struct file_operations io_uring_fops = {
3879         .release        = io_uring_release,
3880         .flush          = io_uring_flush,
3881         .mmap           = io_uring_mmap,
3882         .poll           = io_uring_poll,
3883         .fasync         = io_uring_fasync,
3884 };
3885
3886 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3887                                   struct io_uring_params *p)
3888 {
3889         struct io_rings *rings;
3890         size_t size, sq_array_offset;
3891
3892         /* make sure these are sane, as we already accounted them */
3893         ctx->sq_entries = p->sq_entries;
3894         ctx->cq_entries = p->cq_entries;
3895
3896         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
3897         if (size == SIZE_MAX)
3898                 return -EOVERFLOW;
3899
3900         rings = io_mem_alloc(size);
3901         if (!rings)
3902                 return -ENOMEM;
3903
3904         ctx->rings = rings;
3905         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3906         rings->sq_ring_mask = p->sq_entries - 1;
3907         rings->cq_ring_mask = p->cq_entries - 1;
3908         rings->sq_ring_entries = p->sq_entries;
3909         rings->cq_ring_entries = p->cq_entries;
3910         ctx->sq_mask = rings->sq_ring_mask;
3911         ctx->cq_mask = rings->cq_ring_mask;
3912
3913         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3914         if (size == SIZE_MAX) {
3915                 io_mem_free(ctx->rings);
3916                 ctx->rings = NULL;
3917                 return -EOVERFLOW;
3918         }
3919
3920         ctx->sq_sqes = io_mem_alloc(size);
3921         if (!ctx->sq_sqes) {
3922                 io_mem_free(ctx->rings);
3923                 ctx->rings = NULL;
3924                 return -ENOMEM;
3925         }
3926
3927         return 0;
3928 }
3929
3930 /*
3931  * Allocate an anonymous fd, this is what constitutes the application
3932  * visible backing of an io_uring instance. The application mmaps this
3933  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3934  * we have to tie this fd to a socket for file garbage collection purposes.
3935  */
3936 static int io_uring_get_fd(struct io_ring_ctx *ctx)
3937 {
3938         struct file *file;
3939         int ret;
3940
3941 #if defined(CONFIG_UNIX)
3942         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3943                                 &ctx->ring_sock);
3944         if (ret)
3945                 return ret;
3946 #endif
3947
3948         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3949         if (ret < 0)
3950                 goto err;
3951
3952         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
3953                                         O_RDWR | O_CLOEXEC);
3954         if (IS_ERR(file)) {
3955                 put_unused_fd(ret);
3956                 ret = PTR_ERR(file);
3957                 goto err;
3958         }
3959
3960 #if defined(CONFIG_UNIX)
3961         ctx->ring_sock->file = file;
3962         ctx->ring_sock->sk->sk_user_data = ctx;
3963 #endif
3964         fd_install(ret, file);
3965         return ret;
3966 err:
3967 #if defined(CONFIG_UNIX)
3968         sock_release(ctx->ring_sock);
3969         ctx->ring_sock = NULL;
3970 #endif
3971         return ret;
3972 }
3973
3974 static int io_uring_create(unsigned entries, struct io_uring_params *p)
3975 {
3976         struct user_struct *user = NULL;
3977         struct io_ring_ctx *ctx;
3978         bool account_mem;
3979         int ret;
3980
3981         if (!entries || entries > IORING_MAX_ENTRIES)
3982                 return -EINVAL;
3983
3984         /*
3985          * Use twice as many entries for the CQ ring. It's possible for the
3986          * application to drive a higher depth than the size of the SQ ring,
3987          * since the sqes are only used at submission time. This allows for
3988          * some flexibility in overcommitting a bit.
3989          */
3990         p->sq_entries = roundup_pow_of_two(entries);
3991         p->cq_entries = 2 * p->sq_entries;
3992
3993         user = get_uid(current_user());
3994         account_mem = !capable(CAP_IPC_LOCK);
3995
3996         if (account_mem) {
3997                 ret = io_account_mem(user,
3998                                 ring_pages(p->sq_entries, p->cq_entries));
3999                 if (ret) {
4000                         free_uid(user);
4001                         return ret;
4002                 }
4003         }
4004
4005         ctx = io_ring_ctx_alloc(p);
4006         if (!ctx) {
4007                 if (account_mem)
4008                         io_unaccount_mem(user, ring_pages(p->sq_entries,
4009                                                                 p->cq_entries));
4010                 free_uid(user);
4011                 return -ENOMEM;
4012         }
4013         ctx->compat = in_compat_syscall();
4014         ctx->account_mem = account_mem;
4015         ctx->user = user;
4016
4017         ctx->creds = get_current_cred();
4018         if (!ctx->creds) {
4019                 ret = -ENOMEM;
4020                 goto err;
4021         }
4022
4023         ret = io_allocate_scq_urings(ctx, p);
4024         if (ret)
4025                 goto err;
4026
4027         ret = io_sq_offload_start(ctx, p);
4028         if (ret)
4029                 goto err;
4030
4031         memset(&p->sq_off, 0, sizeof(p->sq_off));
4032         p->sq_off.head = offsetof(struct io_rings, sq.head);
4033         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
4034         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
4035         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
4036         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
4037         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
4038         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
4039
4040         memset(&p->cq_off, 0, sizeof(p->cq_off));
4041         p->cq_off.head = offsetof(struct io_rings, cq.head);
4042         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
4043         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
4044         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
4045         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
4046         p->cq_off.cqes = offsetof(struct io_rings, cqes);
4047
4048         /*
4049          * Install ring fd as the very last thing, so we don't risk someone
4050          * having closed it before we finish setup
4051          */
4052         ret = io_uring_get_fd(ctx);
4053         if (ret < 0)
4054                 goto err;
4055
4056         p->features = IORING_FEAT_SINGLE_MMAP;
4057         return ret;
4058 err:
4059         io_ring_ctx_wait_and_kill(ctx);
4060         return ret;
4061 }
4062
4063 /*
4064  * Sets up an aio uring context, and returns the fd. Applications asks for a
4065  * ring size, we return the actual sq/cq ring sizes (among other things) in the
4066  * params structure passed in.
4067  */
4068 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
4069 {
4070         struct io_uring_params p;
4071         long ret;
4072         int i;
4073
4074         if (copy_from_user(&p, params, sizeof(p)))
4075                 return -EFAULT;
4076         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
4077                 if (p.resv[i])
4078                         return -EINVAL;
4079         }
4080
4081         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
4082                         IORING_SETUP_SQ_AFF))
4083                 return -EINVAL;
4084
4085         ret = io_uring_create(entries, &p);
4086         if (ret < 0)
4087                 return ret;
4088
4089         if (copy_to_user(params, &p, sizeof(p)))
4090                 return -EFAULT;
4091
4092         return ret;
4093 }
4094
4095 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
4096                 struct io_uring_params __user *, params)
4097 {
4098         return io_uring_setup(entries, params);
4099 }
4100
4101 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
4102                                void __user *arg, unsigned nr_args)
4103         __releases(ctx->uring_lock)
4104         __acquires(ctx->uring_lock)
4105 {
4106         int ret;
4107
4108         /*
4109          * We're inside the ring mutex, if the ref is already dying, then
4110          * someone else killed the ctx or is already going through
4111          * io_uring_register().
4112          */
4113         if (percpu_ref_is_dying(&ctx->refs))
4114                 return -ENXIO;
4115
4116         percpu_ref_kill(&ctx->refs);
4117
4118         /*
4119          * Drop uring mutex before waiting for references to exit. If another
4120          * thread is currently inside io_uring_enter() it might need to grab
4121          * the uring_lock to make progress. If we hold it here across the drain
4122          * wait, then we can deadlock. It's safe to drop the mutex here, since
4123          * no new references will come in after we've killed the percpu ref.
4124          */
4125         mutex_unlock(&ctx->uring_lock);
4126         wait_for_completion(&ctx->ctx_done);
4127         mutex_lock(&ctx->uring_lock);
4128
4129         switch (opcode) {
4130         case IORING_REGISTER_BUFFERS:
4131                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
4132                 break;
4133         case IORING_UNREGISTER_BUFFERS:
4134                 ret = -EINVAL;
4135                 if (arg || nr_args)
4136                         break;
4137                 ret = io_sqe_buffer_unregister(ctx);
4138                 break;
4139         case IORING_REGISTER_FILES:
4140                 ret = io_sqe_files_register(ctx, arg, nr_args);
4141                 break;
4142         case IORING_UNREGISTER_FILES:
4143                 ret = -EINVAL;
4144                 if (arg || nr_args)
4145                         break;
4146                 ret = io_sqe_files_unregister(ctx);
4147                 break;
4148         case IORING_REGISTER_EVENTFD:
4149                 ret = -EINVAL;
4150                 if (nr_args != 1)
4151                         break;
4152                 ret = io_eventfd_register(ctx, arg);
4153                 break;
4154         case IORING_UNREGISTER_EVENTFD:
4155                 ret = -EINVAL;
4156                 if (arg || nr_args)
4157                         break;
4158                 ret = io_eventfd_unregister(ctx);
4159                 break;
4160         default:
4161                 ret = -EINVAL;
4162                 break;
4163         }
4164
4165         /* bring the ctx back to life */
4166         reinit_completion(&ctx->ctx_done);
4167         percpu_ref_reinit(&ctx->refs);
4168         return ret;
4169 }
4170
4171 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
4172                 void __user *, arg, unsigned int, nr_args)
4173 {
4174         struct io_ring_ctx *ctx;
4175         long ret = -EBADF;
4176         struct fd f;
4177
4178         f = fdget(fd);
4179         if (!f.file)
4180                 return -EBADF;
4181
4182         ret = -EOPNOTSUPP;
4183         if (f.file->f_op != &io_uring_fops)
4184                 goto out_fput;
4185
4186         ctx = f.file->private_data;
4187
4188         mutex_lock(&ctx->uring_lock);
4189         ret = __io_uring_register(ctx, opcode, arg, nr_args);
4190         mutex_unlock(&ctx->uring_lock);
4191 out_fput:
4192         fdput(f);
4193         return ret;
4194 }
4195
4196 static int __init io_uring_init(void)
4197 {
4198         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
4199         return 0;
4200 };
4201 __initcall(io_uring_init);