ceph/src/seastar/src/core/linux-aio.cc

   1 /*
   2  * This file is open source software, licensed to you under the terms
   3  * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
   4  * distributed with this work for additional information regarding copyright
   5  * ownership.  You may not use this file except in compliance with the License.
   6  *
   7  * You may obtain a copy of the License at
   8  *
   9  *   http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing,
  12  * software distributed under the License is distributed on an
  13  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14  * KIND, either express or implied.  See the License for the
  15  * specific language governing permissions and limitations
  16  * under the License.
  17  */
  18 /*
  19  * Copyright (C) 2017 ScyllaDB
  20  */
  21
  22 #include <seastar/core/linux-aio.hh>
  23 #include <seastar/core/print.hh>
  24 #include <unistd.h>
  25 #include <sys/syscall.h>
  26 #include <atomic>
  27 #include <algorithm>
  28 #include <errno.h>
  29 #include <string.h>
  30
  31 namespace seastar {
  32
  33 namespace internal {
  34
  35 namespace linux_abi {
  36
// Header of the AIO completion ring that lives at the address stored in an
// aio_context_t (see to_ring()). The order and width of the members determine the
// in-memory layout that is read through that pointer, so they must not be rearranged.
struct linux_aio_ring {
    uint32_t id;                  // not read anywhere in this file
    uint32_t nr;                  // number of io_event slots in the ring; indices wrap at this value
    std::atomic<uint32_t> head;   // index of the next event to consume; written by user space
    std::atomic<uint32_t> tail;   // index one past the newest completed event; written by the kernel
    uint32_t magic;               // usable() requires this to be 0xa10a10a1
    uint32_t compat_features;     // not read anywhere in this file
    uint32_t incompat_features;   // usable() requires this to be 0
    uint32_t header_length;       // byte offset from the start of the ring to the first io_event
};
  47
  48 }
  49
  50 using namespace linux_abi;
  51
  52 static linux_aio_ring* to_ring(aio_context_t io_context) {
  53     return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context));
  54 }
  55
  56 static bool usable(const linux_aio_ring* ring) {
  57     return ring->magic == 0xa10a10a1 && ring->incompat_features == 0;
  58 }
  59
  60 int io_setup(int nr_events, aio_context_t* io_context) {
  61     return ::syscall(SYS_io_setup, nr_events, io_context);
  62 }
  63
  64 int io_destroy(aio_context_t io_context) {
  65    return ::syscall(SYS_io_destroy, io_context);
  66 }
  67
  68 int io_submit(aio_context_t io_context, long nr, iocb** iocbs) {
  69     return ::syscall(SYS_io_submit, io_context, nr, iocbs);
  70 }
  71
  72 int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) {
  73     return ::syscall(SYS_io_cancel, io_context, iocb, result);
  74 }
  75
  76 static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
  77         bool force_syscall) {
  78     auto ring = to_ring(io_context);
  79     if (usable(ring) && !force_syscall) {
  80         // Try to complete in userspace, if enough available events,
  81         // or if the timeout is zero
  82
  83         // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
  84         // only a single thread calls io_getevents()).
  85         auto head = ring->head.load(std::memory_order_relaxed);
  86         // The kernel will write to the ring from an interrupt and then release with a write
  87         // to ring->tail, so we must memory_order_acquire here.
  88         auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
  89         auto available = tail - head;
  90         if (tail < head) {
  91             available += ring->nr;
  92         }
  93         if (available >= uint32_t(min_nr)
  94                 || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
  95             if (!available) {
  96                 return 0;
  97             }
  98             auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
  99             auto now = std::min<uint32_t>(nr, available);
 100             auto start = ring_events + head;
 101             head += now;
 102             if (head < ring->nr) {
 103                 std::copy(start, start + now, events);
 104             } else {
 105                 head -= ring->nr;
 106                 auto p = std::copy(start, ring_events + ring->nr, events);
 107                 std::copy(ring_events, ring_events + head, p);
 108             }
 109             // The kernel will read ring->head and update its view of how many entries
 110             // in the ring are available, so memory_order_release to make sure any ring
 111             // accesses are completed before the update to ring->head is visible.
 112             ring->head.store(head, std::memory_order_release);
 113             return now;
 114         }
 115     }
 116     return -1;
 117 }
 118
 119 int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
 120         bool force_syscall) {
 121     auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
 122     if (r >= 0) {
 123         return r;
 124     }
 125     return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout);
 126 }
 127
 128
// Fallback definitions of the io_pgetevents syscall number for builds whose system
// headers do not provide __NR_io_pgetevents. Only x86-64 and i386 are covered; on any
// other architecture the macro stays undefined and io_pgetevents() reports ENOSYS.
#ifndef __NR_io_pgetevents

#  if defined(__x86_64__)
#    define __NR_io_pgetevents 333
#  elif defined(__i386__)
#    define __NR_io_pgetevents 385
#  endif

#endif
 138
 139 int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask,
 140         bool force_syscall) {
 141 #ifdef __NR_io_pgetevents
 142     auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
 143     if (r >= 0) {
 144         return r;
 145     }
 146     aio_sigset as;
 147     as.sigmask = sigmask;
 148     as.sigsetsize = 8;  // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
 149     return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as);
 150 #else
 151     errno = ENOSYS;
 152     return -1;
 153 #endif
 154 }
 155
 156 void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) {
 157     auto r = io_setup(nr, io_context);
 158     if (r < 0) {
 159         char buf[1024];
 160         char *msg = strerror_r(errno, buf, sizeof(buf));
 161         throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg));
 162     }
 163 }
 164
 165 }
 166
 167 }