]> git.proxmox.com Git - ceph.git/blame - ceph/src/seastar/src/core/linux-aio.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / seastar / src / core / linux-aio.cc
CommitLineData
11fdf7f2
TL
/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2017 ScyllaDB
 */
21
22#include <seastar/core/linux-aio.hh>
9f95a23c 23#include <seastar/core/print.hh>
11fdf7f2
TL
24#include <unistd.h>
25#include <sys/syscall.h>
26#include <atomic>
27#include <algorithm>
28#include <errno.h>
9f95a23c 29#include <string.h>
f67539c2 30#include <valgrind/valgrind.h>
11fdf7f2
TL
31
32namespace seastar {
33
34namespace internal {
35
36namespace linux_abi {
37
// Header found at the address an aio_context_t value is reinterpreted as
// (see to_ring()). The structure is read directly from that memory, so the
// order and width of the fields must not be changed.
struct linux_aio_ring {
    uint32_t id;
    uint32_t nr;                    // number of io_event slots; head/tail wrap around at this value
    std::atomic<uint32_t> head;     // consumer index, advanced after events are copied out
    std::atomic<uint32_t> tail;     // producer index, loaded with acquire ordering by the reader
    uint32_t magic;                 // must be 0xa10a10a1 for the ring to be read in userspace
    uint32_t compat_features;
    uint32_t incompat_features;     // must be 0 for the ring to be read in userspace
    uint32_t header_length;         // byte offset from the ring base to the first io_event
};

// The ring is overlaid on existing memory, so any padding or a wider atomic
// representation would silently shift every field. Catch that at compile time.
static_assert(sizeof(std::atomic<uint32_t>) == sizeof(uint32_t),
        "std::atomic<uint32_t> must have the same size as uint32_t");
static_assert(sizeof(linux_aio_ring) == 8 * sizeof(uint32_t),
        "linux_aio_ring must consist of exactly eight packed 32-bit fields");
48
49}
50
51using namespace linux_abi;
52
53static linux_aio_ring* to_ring(aio_context_t io_context) {
54 return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context));
55}
56
57static bool usable(const linux_aio_ring* ring) {
f67539c2 58 return ring->magic == 0xa10a10a1 && ring->incompat_features == 0 && !RUNNING_ON_VALGRIND;
11fdf7f2
TL
59}
60
61int io_setup(int nr_events, aio_context_t* io_context) {
62 return ::syscall(SYS_io_setup, nr_events, io_context);
63}
64
20effc67 65int io_destroy(aio_context_t io_context) noexcept {
11fdf7f2
TL
66 return ::syscall(SYS_io_destroy, io_context);
67}
68
69int io_submit(aio_context_t io_context, long nr, iocb** iocbs) {
70 return ::syscall(SYS_io_submit, io_context, nr, iocbs);
71}
72
73int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) {
74 return ::syscall(SYS_io_cancel, io_context, iocb, result);
75}
76
77static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
78 bool force_syscall) {
79 auto ring = to_ring(io_context);
80 if (usable(ring) && !force_syscall) {
81 // Try to complete in userspace, if enough available events,
82 // or if the timeout is zero
83
84 // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
85 // only a single thread calls io_getevents()).
86 auto head = ring->head.load(std::memory_order_relaxed);
87 // The kernel will write to the ring from an interrupt and then release with a write
88 // to ring->tail, so we must memory_order_acquire here.
89 auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
90 auto available = tail - head;
91 if (tail < head) {
92 available += ring->nr;
93 }
94 if (available >= uint32_t(min_nr)
95 || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
96 if (!available) {
97 return 0;
98 }
99 auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
100 auto now = std::min<uint32_t>(nr, available);
101 auto start = ring_events + head;
102 head += now;
103 if (head < ring->nr) {
104 std::copy(start, start + now, events);
105 } else {
106 head -= ring->nr;
107 auto p = std::copy(start, ring_events + ring->nr, events);
108 std::copy(ring_events, ring_events + head, p);
109 }
110 // The kernel will read ring->head and update its view of how many entries
111 // in the ring are available, so memory_order_release to make sure any ring
112 // accesses are completed before the update to ring->head is visible.
113 ring->head.store(head, std::memory_order_release);
114 return now;
115 }
116 }
117 return -1;
118}
119
120int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
121 bool force_syscall) {
122 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
123 if (r >= 0) {
124 return r;
125 }
126 return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout);
127}
128
129
// Supplies a syscall number for io_pgetevents on x86-64 and i386 when the
// included headers do not define one. On any other architecture the macro
// stays undefined.
#if !defined(__NR_io_pgetevents)
#  if defined(__x86_64__)
#    define __NR_io_pgetevents 333
#  elif defined(__i386__)
#    define __NR_io_pgetevents 385
#  endif
#endif
139
140int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask,
141 bool force_syscall) {
142#ifdef __NR_io_pgetevents
143 auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall);
144 if (r >= 0) {
145 return r;
146 }
147 aio_sigset as;
148 as.sigmask = sigmask;
149 as.sigsetsize = 8; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
150 return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as);
151#else
152 errno = ENOSYS;
153 return -1;
154#endif
155}
156
9f95a23c
TL
157void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) {
158 auto r = io_setup(nr, io_context);
159 if (r < 0) {
160 char buf[1024];
161 char *msg = strerror_r(errno, buf, sizeof(buf));
162 throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg));
163 }
164}
165
11fdf7f2
TL
166}
167
168}