]> git.proxmox.com Git - ceph.git/blob - ceph/src/seastar/src/core/linux-aio.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / seastar / src / core / linux-aio.cc
1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2017 ScyllaDB
20 */
21
22 #include <seastar/core/linux-aio.hh>
23 #include <seastar/core/print.hh>
24 #include <unistd.h>
25 #include <sys/syscall.h>
26 #include <atomic>
27 #include <algorithm>
28 #include <errno.h>
29 #include <string.h>
30 #include <valgrind/valgrind.h>
31
32 namespace seastar {
33
34 namespace internal {
35
36 namespace linux_abi {
37
// Userspace view of the header of the AIO completion ring located at the
// address stored in an aio_context_t (see to_ring()). The field order and
// widths describe a layout shared with the kernel and must not be changed.
38 struct linux_aio_ring {
// Not read anywhere in this file.
39 uint32_t id;
// Number of event slots in the ring; used to wrap head/tail.
40 uint32_t nr;
// Consumer index: loaded and stored by try_reap_events().
41 std::atomic<uint32_t> head;
// Producer index: only loaded here (acquire); written by the kernel.
42 std::atomic<uint32_t> tail;
// usable() requires this to equal 0xa10a10a1.
43 uint32_t magic;
// Not read anywhere in this file.
44 uint32_t compat_features;
// usable() requires this to be 0.
45 uint32_t incompat_features;
// Byte offset from the start of the ring to the first io_event.
46 uint32_t header_length;
47 };
48
49 }
50
51 using namespace linux_abi;
52
53 static linux_aio_ring* to_ring(aio_context_t io_context) {
54 return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context));
55 }
56
57 static bool usable(const linux_aio_ring* ring) {
58 return ring->magic == 0xa10a10a1 && ring->incompat_features == 0 && !RUNNING_ON_VALGRIND;
59 }
60
61 int io_setup(int nr_events, aio_context_t* io_context) {
62 return ::syscall(SYS_io_setup, nr_events, io_context);
63 }
64
65 int io_destroy(aio_context_t io_context) noexcept {
66 return ::syscall(SYS_io_destroy, io_context);
67 }
68
69 int io_submit(aio_context_t io_context, long nr, iocb** iocbs) {
70 return ::syscall(SYS_io_submit, io_context, nr, iocbs);
71 }
72
73 int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) {
74 return ::syscall(SYS_io_cancel, io_context, iocb, result);
75 }
76
77 static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
78 bool force_syscall) {
79 auto ring = to_ring(io_context);
80 if (usable(ring) && !force_syscall) {
81 // Try to complete in userspace, if enough available events,
82 // or if the timeout is zero
83
84 // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
85 // only a single thread calls io_getevents()).
86 auto head = ring->head.load(std::memory_order_relaxed);
87 // The kernel will write to the ring from an interrupt and then release with a write
88 // to ring->tail, so we must memory_order_acquire here.
89 auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
90 auto available = tail - head;
91 if (tail < head) {
92 available += ring->nr;
93 }
94 if (available >= uint32_t(min_nr)
95 || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
96 if (!available) {
97 return 0;
98 }
99 auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
100 auto now = std::min<uint32_t>(nr, available);
101 auto start = ring_events + head;
102 head += now;
103 if (head < ring->nr) {
104 std::copy(start, start + now, events);
105 } else {
106 head -= ring->nr;
107 auto p = std::copy(start, ring_events + ring->nr, events);
108 std::copy(ring_events, ring_events + head, p);
109 }
110 // The kernel will read ring->head and update its view of how many entries
111 // in the ring are available, so memory_order_release to make sure any ring
112 // accesses are completed before the update to ring->head is visible.
113 ring->head.store(head, std::memory_order_release);
114 return now;
115 }
116 }
117 return -1;
118 }
119
120 int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
121 bool force_syscall) {
122 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
123 if (r >= 0) {
124 return r;
125 }
126 return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout);
127 }
128
129
// Fallback syscall numbers for io_pgetevents, used only when the system
// headers do not already define __NR_io_pgetevents. Only x86-64 and i386 are
// covered; on any other architecture the macro stays undefined and
// io_pgetevents() below fails with ENOSYS.
130 #ifndef __NR_io_pgetevents
131
132 # if defined(__x86_64__)
133 # define __NR_io_pgetevents 333
134 # elif defined(__i386__)
135 # define __NR_io_pgetevents 385
136 # endif
137
138 #endif
139
140 int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask,
141 bool force_syscall) {
142 #ifdef __NR_io_pgetevents
143 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
144 if (r >= 0) {
145 return r;
146 }
147 aio_sigset as;
148 as.sigmask = sigmask;
149 as.sigsetsize = 8; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
150 return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as);
151 #else
152 errno = ENOSYS;
153 return -1;
154 #endif
155 }
156
157 void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) {
158 auto r = io_setup(nr, io_context);
159 if (r < 0) {
160 char buf[1024];
161 char *msg = strerror_r(errno, buf, sizeof(buf));
162 throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg));
163 }
164 }
165
166 }
167
168 }