]> git.proxmox.com Git - ceph.git/blob - ceph/src/seastar/src/core/linux-aio.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / seastar / src / core / linux-aio.cc
1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2017 ScyllaDB
20 */
21
22 #include <seastar/core/linux-aio.hh>
23 #include <seastar/core/print.hh>
24 #include <unistd.h>
25 #include <sys/syscall.h>
26 #include <atomic>
27 #include <algorithm>
28 #include <errno.h>
29 #include <string.h>
30
31 namespace seastar {
32
33 namespace internal {
34
35 namespace linux_abi {
36
// Header of the completion ring that lives at the address stored in an
// aio_context_t (see to_ring()). The object is never constructed by this
// code; it is overlaid on memory shared with the kernel, so the field
// order and sizes must not change.
struct linux_aio_ring {
    uint32_t id;                     // not consulted in this file
    uint32_t nr;                     // number of io_event slots in the ring
    std::atomic<uint32_t> head;      // consumer index, advanced by userspace after reaping
    std::atomic<uint32_t> tail;      // producer index, advanced by the kernel
    uint32_t magic;                  // must equal 0xa10a10a1 for the ring to be usable
    uint32_t compat_features;        // not consulted in this file
    uint32_t incompat_features;      // must be zero for the ring to be usable
    uint32_t header_length;          // byte offset from the ring start to the first io_event
};

// The struct is reinterpreted over raw memory, so make sure the atomic
// wrappers add no storage and the compiler inserted no padding.
static_assert(sizeof(std::atomic<uint32_t>) == sizeof(uint32_t),
        "std::atomic<uint32_t> must have the same size as uint32_t");
static_assert(sizeof(linux_aio_ring) == 8 * sizeof(uint32_t),
        "linux_aio_ring must consist of exactly eight packed 32-bit fields");
47
48 }
49
50 using namespace linux_abi;
51
52 static linux_aio_ring* to_ring(aio_context_t io_context) {
53 return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context));
54 }
55
56 static bool usable(const linux_aio_ring* ring) {
57 return ring->magic == 0xa10a10a1 && ring->incompat_features == 0;
58 }
59
60 int io_setup(int nr_events, aio_context_t* io_context) {
61 return ::syscall(SYS_io_setup, nr_events, io_context);
62 }
63
64 int io_destroy(aio_context_t io_context) {
65 return ::syscall(SYS_io_destroy, io_context);
66 }
67
68 int io_submit(aio_context_t io_context, long nr, iocb** iocbs) {
69 return ::syscall(SYS_io_submit, io_context, nr, iocbs);
70 }
71
72 int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) {
73 return ::syscall(SYS_io_cancel, io_context, iocb, result);
74 }
75
76 static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
77 bool force_syscall) {
78 auto ring = to_ring(io_context);
79 if (usable(ring) && !force_syscall) {
80 // Try to complete in userspace, if enough available events,
81 // or if the timeout is zero
82
83 // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
84 // only a single thread calls io_getevents()).
85 auto head = ring->head.load(std::memory_order_relaxed);
86 // The kernel will write to the ring from an interrupt and then release with a write
87 // to ring->tail, so we must memory_order_acquire here.
88 auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
89 auto available = tail - head;
90 if (tail < head) {
91 available += ring->nr;
92 }
93 if (available >= uint32_t(min_nr)
94 || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
95 if (!available) {
96 return 0;
97 }
98 auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
99 auto now = std::min<uint32_t>(nr, available);
100 auto start = ring_events + head;
101 head += now;
102 if (head < ring->nr) {
103 std::copy(start, start + now, events);
104 } else {
105 head -= ring->nr;
106 auto p = std::copy(start, ring_events + ring->nr, events);
107 std::copy(ring_events, ring_events + head, p);
108 }
109 // The kernel will read ring->head and update its view of how many entries
110 // in the ring are available, so memory_order_release to make sure any ring
111 // accesses are completed before the update to ring->head is visible.
112 ring->head.store(head, std::memory_order_release);
113 return now;
114 }
115 }
116 return -1;
117 }
118
119 int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
120 bool force_syscall) {
121 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
122 if (r >= 0) {
123 return r;
124 }
125 return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout);
126 }
127
128
// Fallback syscall numbers for io_pgetevents, used only when the system
// headers did not already define __NR_io_pgetevents. Only x86-64 and i386
// are covered here; on any other architecture the macro stays undefined.
#if !defined(__NR_io_pgetevents)
#  ifdef __x86_64__
#    define __NR_io_pgetevents 333
#  elif defined(__i386__)
#    define __NR_io_pgetevents 385
#  endif
#endif
138
139 int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask,
140 bool force_syscall) {
141 #ifdef __NR_io_pgetevents
142 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
143 if (r >= 0) {
144 return r;
145 }
146 aio_sigset as;
147 as.sigmask = sigmask;
148 as.sigsetsize = 8; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
149 return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as);
150 #else
151 errno = ENOSYS;
152 return -1;
153 #endif
154 }
155
156 void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) {
157 auto r = io_setup(nr, io_context);
158 if (r < 0) {
159 char buf[1024];
160 char *msg = strerror_r(errno, buf, sizeof(buf));
161 throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg));
162 }
163 }
164
165 }
166
167 }