io_uring: fix ltout double free on completion race

[mirror_ubuntu-hirsute-kernel.git] / fs / io_uring.c
diff --git a/fs/io_uring.c b/fs/io_uring.c

index 68508f010b908122e1b39e2c68cd6ebfb206844c..2b86b413641a490b610f4123943325939a4f5276 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -222,7 +222,7 @@ struct fixed_file_data {
  struct io_buffer {
         struct list_head list;
         __u64 addr;
-       __s32 len;
+       __u32 len;
         __u16 bid;
  };
  
@@ -535,7 +535,7 @@ struct io_splice {
  struct io_provide_buf {
         struct file                     *file;
         __u64                           addr;
-       __s32                           len;
+       __u32                           len;
         __u32                           bgid;
         __u16                           nbufs;
         __u16                           bid;
@@ -574,7 +574,7 @@ struct io_unlink {
  struct io_completion {
         struct file                     *file;
         struct list_head                list;
-       int                             cflags;
+       u32                             cflags;
  };
  
  struct io_async_connect {
@@ -1546,7 +1546,7 @@ static void io_prep_async_work(struct io_kiocb *req)
         if (req->flags & REQ_F_ISREG) {
                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
                         io_wq_hash_work(&req->work, file_inode(req->file));
-       } else {
+       } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
                 if (def->unbound_nonreg_file)
                         req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
@@ -1594,7 +1594,7 @@ static void io_queue_async_work(struct io_kiocb *req)
                 io_queue_linked_timeout(link);
  }
  
-static void io_kill_timeout(struct io_kiocb *req)
+static void io_kill_timeout(struct io_kiocb *req, int status)
  {
         struct io_timeout_data *io = req->async_data;
         int ret;
@@ -1604,7 +1604,7 @@ static void io_kill_timeout(struct io_kiocb *req)
                 atomic_set(&req->ctx->cq_timeouts,
                         atomic_read(&req->ctx->cq_timeouts) + 1);
                 list_del_init(&req->timeout.list);
-               io_cqring_fill_event(req, 0);
+               io_cqring_fill_event(req, status);
                 io_put_req_deferred(req, 1);
         }
  }
@@ -1621,7 +1621,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
         spin_lock_irq(&ctx->completion_lock);
         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
                 if (io_match_task(req, tsk, files)) {
-                       io_kill_timeout(req);
+                       io_kill_timeout(req, -ECANCELED);
                         canceled++;
                 }
         }
@@ -1673,7 +1673,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
                         break;
  
                 list_del_init(&req->timeout.list);
-               io_kill_timeout(req);
+               io_kill_timeout(req, 0);
         } while (!list_empty(&ctx->timeout_list));
  
         ctx->cq_last_tm_flush = seq;
@@ -1823,21 +1823,26 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
         return all_flushed;
  }
  
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                      struct task_struct *tsk,
                                      struct files_struct *files)
  {
+       bool ret = true;
+
         if (test_bit(0, &ctx->cq_check_overflow)) {
                 /* iopoll syncs against uring_lock, not completion_lock */
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_lock(&ctx->uring_lock);
-               __io_cqring_overflow_flush(ctx, force, tsk, files);
+               ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_unlock(&ctx->uring_lock);
         }
+
+       return ret;
  }
  
-static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
+static void __io_cqring_fill_event(struct io_kiocb *req, long res,
+                                  unsigned int cflags)
  {
         struct io_ring_ctx *ctx = req->ctx;
         struct io_uring_cqe *cqe;
@@ -2217,6 +2222,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
                 __io_req_task_cancel(req, -EFAULT);
         mutex_unlock(&ctx->uring_lock);
  
+       ctx->flags &= ~IORING_SETUP_R_DISABLED;
         if (ctx->flags & IORING_SETUP_SQPOLL)
                 io_sq_thread_drop_mm_files();
  }
@@ -3514,7 +3520,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
         else
                 kiocb->ki_flags |= IOCB_NOWAIT;
  
-
         /* If the file doesn't support async, just async punt */
         no_async = force_nonblock && !io_file_supports_async(req->file, READ);
         if (no_async)
@@ -3526,9 +3531,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
  
         ret = io_iter_do_read(req, iter);
  
-       if (!ret) {
-               goto done;
-       } else if (ret == -EIOCBQUEUED) {
+       if (ret == -EIOCBQUEUED) {
                 ret = 0;
                 goto out_free;
         } else if (ret == -EAGAIN) {
@@ -3542,7 +3545,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
                 ret = 0;
                 goto copy_iov;
-       } else if (ret < 0) {
+       } else if (ret <= 0) {
                 /* make sure -ERESTARTSYS -> -EINTR is done */
                 goto done;
         }
@@ -3586,6 +3589,7 @@ retry:
                 goto out_free;
         } else if (ret > 0 && ret < io_size) {
                 /* we got some bytes, but not all. retry. */
+               kiocb->ki_flags &= ~IOCB_WAITQ;
                 goto retry;
         }
  done:
@@ -4211,6 +4215,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
  static int io_provide_buffers_prep(struct io_kiocb *req,
                                    const struct io_uring_sqe *sqe)
  {
+       unsigned long size, tmp_check;
         struct io_provide_buf *p = &req->pbuf;
         u64 tmp;
  
@@ -4224,7 +4229,14 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
         p->addr = READ_ONCE(sqe->addr);
         p->len = READ_ONCE(sqe->len);
  
-       if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
+       if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
+                               &size))
+               return -EOVERFLOW;
+       if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
+               return -EOVERFLOW;
+
+       size = (unsigned long)p->len * p->nbufs;
+       if (!access_ok(u64_to_user_ptr(p->addr), size))
                 return -EFAULT;
  
         p->bgid = READ_ONCE(sqe->buf_group);
@@ -4247,7 +4259,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
                         break;
  
                 buf->addr = addr;
-               buf->len = pbuf->len;
+               buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                 buf->bid = bid;
                 addr += pbuf->len;
                 bid++;
@@ -4623,6 +4635,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
         struct io_async_msghdr iomsg, *kmsg;
         struct socket *sock;
         unsigned flags;
+       int min_ret = 0;
         int ret;
  
         sock = sock_from_file(req->file);
@@ -4643,12 +4656,15 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
                 kmsg = &iomsg;
         }
  
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
         if (flags & MSG_DONTWAIT)
                 req->flags |= REQ_F_NOWAIT;
         else if (force_nonblock)
                 flags |= MSG_DONTWAIT;
  
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
         if (force_nonblock && ret == -EAGAIN)
                 return io_setup_async_msg(req, kmsg);
@@ -4658,7 +4674,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
         if (kmsg->iov != kmsg->fast_iov)
                 kfree(kmsg->iov);
         req->flags &= ~REQ_F_NEED_CLEANUP;
-       if (ret < 0)
+       if (ret < min_ret)
                 req_set_fail_links(req);
         __io_req_complete(req, ret, 0, cs);
         return 0;
@@ -4672,6 +4688,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
         struct iovec iov;
         struct socket *sock;
         unsigned flags;
+       int min_ret = 0;
         int ret;
  
         sock = sock_from_file(req->file);
@@ -4687,12 +4704,15 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
         msg.msg_controllen = 0;
         msg.msg_namelen = 0;
  
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
         if (flags & MSG_DONTWAIT)
                 req->flags |= REQ_F_NOWAIT;
         else if (force_nonblock)
                 flags |= MSG_DONTWAIT;
  
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&msg.msg_iter);
+
         msg.msg_flags = flags;
         ret = sock_sendmsg(sock, &msg);
         if (force_nonblock && ret == -EAGAIN)
@@ -4700,7 +4720,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
         if (ret == -ERESTARTSYS)
                 ret = -EINTR;
  
-       if (ret < 0)
+       if (ret < min_ret)
                 req_set_fail_links(req);
         __io_req_complete(req, ret, 0, cs);
         return 0;
@@ -4852,6 +4872,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
         struct socket *sock;
         struct io_buffer *kbuf;
         unsigned flags;
+       int min_ret = 0;
         int ret, cflags = 0;
  
         sock = sock_from_file(req->file);
@@ -4881,12 +4902,15 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
                                 1, req->sr_msg.len);
         }
  
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
         if (flags & MSG_DONTWAIT)
                 req->flags |= REQ_F_NOWAIT;
         else if (force_nonblock)
                 flags |= MSG_DONTWAIT;
  
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
                                         kmsg->uaddr, flags);
         if (force_nonblock && ret == -EAGAIN)
@@ -4899,7 +4923,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
         if (kmsg->iov != kmsg->fast_iov)
                 kfree(kmsg->iov);
         req->flags &= ~REQ_F_NEED_CLEANUP;
-       if (ret < 0)
+       if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
                 req_set_fail_links(req);
         __io_req_complete(req, ret, cflags, cs);
         return 0;
@@ -4915,6 +4939,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
         struct socket *sock;
         struct iovec iov;
         unsigned flags;
+       int min_ret = 0;
         int ret, cflags = 0;
  
         sock = sock_from_file(req->file);
@@ -4939,12 +4964,15 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
         msg.msg_iocb = NULL;
         msg.msg_flags = 0;
  
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
         if (flags & MSG_DONTWAIT)
                 req->flags |= REQ_F_NOWAIT;
         else if (force_nonblock)
                 flags |= MSG_DONTWAIT;
  
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&msg.msg_iter);
+
         ret = sock_recvmsg(sock, &msg, flags);
         if (force_nonblock && ret == -EAGAIN)
                 return -EAGAIN;
@@ -4953,7 +4981,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
  out_free:
         if (req->flags & REQ_F_BUFFER_SELECTED)
                 cflags = io_put_recv_kbuf(req);
-       if (ret < 0)
+       if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
                 req_set_fail_links(req);
         __io_req_complete(req, ret, cflags, cs);
         return 0;
@@ -6484,16 +6512,17 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
          * We don't expect the list to be empty, that will only happen if we
          * race with the completion of the linked work.
          */
-       if (prev && refcount_inc_not_zero(&prev->refs))
+       if (prev) {
                 io_remove_next_linked(prev);
-       else
-               prev = NULL;
+               if (!refcount_inc_not_zero(&prev->refs))
+                       prev = NULL;
+       }
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
         if (prev) {
-               req_set_fail_links(prev);
                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
                 io_put_req_deferred(prev, 1);
+               io_put_req_deferred(req, 1);
         } else {
                 io_cqring_add_event(req, -ETIME, 0);
                 io_put_req_deferred(req, 1);
@@ -7208,6 +7237,25 @@ static int io_run_task_work_sig(void)
         return -EINTR;
  }
  
+/* when this returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+                                         struct io_wait_queue *iowq,
+                                         signed long *timeout)
+{
+       int ret;
+
+       /* make sure we run task_work before checking for signals */
+       ret = io_run_task_work_sig();
+       if (ret || io_should_wake(iowq))
+               return ret;
+       /* let the caller flush overflows, retry */
+       if (test_bit(0, &ctx->cq_check_overflow))
+               return 1;
+
+       *timeout = schedule_timeout(*timeout);
+       return !*timeout ? -ETIME : 1;
+}
+
  /*
   * Wait until events become available, if we don't already have some. The
   * application must reap them itself, as they reside on the shared cq ring.
@@ -7226,9 +7274,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                 .to_wait        = min_events,
         };
         struct io_rings *rings = ctx->rings;
-       struct timespec64 ts;
-       signed long timeout = 0;
-       int ret = 0;
+       signed long timeout = MAX_SCHEDULE_TIMEOUT;
+       int ret;
  
         do {
                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7252,6 +7299,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
         }
  
         if (uts) {
+               struct timespec64 ts;
+
                 if (get_timespec64(&ts, uts))
                         return -EFAULT;
                 timeout = timespec64_to_jiffies(&ts);
@@ -7260,34 +7309,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
         trace_io_uring_cqring_wait(ctx, min_events);
         do {
-               io_cqring_overflow_flush(ctx, false, NULL, NULL);
-               prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
-                                               TASK_INTERRUPTIBLE);
-               /* make sure we run task_work before checking for signals */
-               ret = io_run_task_work_sig();
-               if (ret > 0) {
-                       finish_wait(&ctx->wait, &iowq.wq);
-                       continue;
-               }
-               else if (ret < 0)
+               /* if we can't even flush overflow, don't wait for more */
+               if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+                       ret = -EBUSY;
                         break;
-               if (io_should_wake(&iowq))
-                       break;
-               if (test_bit(0, &ctx->cq_check_overflow)) {
-                       finish_wait(&ctx->wait, &iowq.wq);
-                       continue;
-               }
-               if (uts) {
-                       timeout = schedule_timeout(timeout);
-                       if (timeout == 0) {
-                               ret = -ETIME;
-                               break;
-                       }
-               } else {
-                       schedule();
                 }
-       } while (1);
-       finish_wait(&ctx->wait, &iowq.wq);
+               prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+                                               TASK_INTERRUPTIBLE);
+               ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+               finish_wait(&ctx->wait, &iowq.wq);
+               cond_resched();
+       } while (ret > 0);
  
         restore_saved_sigmask_unless(ret == -EINTR);
  
@@ -8715,6 +8747,14 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
         if (!io_sqring_full(ctx))
                 mask |= EPOLLOUT | EPOLLWRNORM;
  
+       /* prevent SQPOLL from submitting new requests */
+       if (ctx->sq_data) {
+               io_sq_thread_park(ctx->sq_data);
+               list_del_init(&ctx->sqd_list);
+               io_sqd_update_thread_idle(ctx->sq_data);
+               io_sq_thread_unpark(ctx->sq_data);
+       }
+
         /*
          * Don't flush cqring overflow list here, just do a simple check.
          * Otherwise there could possible be ABBA deadlock:
@@ -8855,11 +8895,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
         return ret;
  }
  
-static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
                                   struct task_struct *task,
                                   struct files_struct *files)
  {
-       struct io_defer_entry *de = NULL;
+       struct io_defer_entry *de;
         LIST_HEAD(list);
  
         spin_lock_irq(&ctx->completion_lock);
@@ -8870,6 +8910,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
                 }
         }
         spin_unlock_irq(&ctx->completion_lock);
+       if (list_empty(&list))
+               return false;
  
         while (!list_empty(&list)) {
                 de = list_first_entry(&list, struct io_defer_entry, list);
@@ -8879,6 +8921,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
                 io_req_complete(de->req, -ECANCELED);
                 kfree(de);
         }
+       return true;
  }
  
  static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
@@ -8906,6 +8949,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                         }
                 }
  
+               ret |= io_cancel_defer_files(ctx, task, files);
                 ret |= io_poll_remove_all(ctx, task, files);
                 ret |= io_kill_timeouts(ctx, task, files);
                 ret |= io_run_task_work();
@@ -8960,6 +9004,8 @@ static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
  {
         mutex_lock(&ctx->uring_lock);
         ctx->sqo_dead = 1;
+       if (ctx->flags & IORING_SETUP_R_DISABLED)
+               io_sq_offload_start(ctx);
         mutex_unlock(&ctx->uring_lock);
  
         /* make sure callers enter the ring to get error */
@@ -8984,8 +9030,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
                 io_sq_thread_park(ctx->sq_data);
         }
  
-       io_cancel_defer_files(ctx, task, files);
-
         io_uring_cancel_files(ctx, task, files);
         if (!files)
                 io_uring_try_cancel_requests(ctx, task, NULL);
@@ -9975,10 +10019,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
         if (ctx->restrictions.registered)
                 ctx->restricted = 1;
  
-       ctx->flags &= ~IORING_SETUP_R_DISABLED;
-
         io_sq_offload_start(ctx);
-
         return 0;
  }