]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2017 ScyllaDB | |
20 | */ | |
21 | ||
22 | #include <seastar/core/linux-aio.hh> | |
9f95a23c | 23 | #include <seastar/core/print.hh> |
11fdf7f2 TL |
24 | #include <unistd.h> |
25 | #include <sys/syscall.h> | |
26 | #include <atomic> | |
27 | #include <algorithm> | |
28 | #include <errno.h> | |
9f95a23c | 29 | #include <string.h> |
f67539c2 | 30 | #include <valgrind/valgrind.h> |
11fdf7f2 TL |
31 | |
32 | namespace seastar { | |
33 | ||
34 | namespace internal { | |
35 | ||
36 | namespace linux_abi { | |
37 | ||
// Userspace view of the kernel's AIO completion ring, mapped by io_setup().
// The field layout mirrors the kernel's `struct aio_ring` ABI exactly — do not
// reorder, rename, or change the types of these members.
struct linux_aio_ring {
    uint32_t id;                      // kernel-internal ring identifier
    uint32_t nr;                      // number of io_event slots in the ring
    // head is written by userspace (the reaper), tail by the kernel; both are
    // indices into the event array, wrapping at nr. std::atomic<> is used so we
    // can pair acquire/release with the kernel's ordering on these words.
    std::atomic<uint32_t> head;
    std::atomic<uint32_t> tail;
    uint32_t magic;                   // expected to be 0xa10a10a1 (see usable())
    uint32_t compat_features;
    uint32_t incompat_features;       // nonzero means we must not touch the ring directly
    uint32_t header_length;           // byte offset from ring start to the event array
};
48 | ||
49 | } | |
50 | ||
51 | using namespace linux_abi; | |
52 | ||
53 | static linux_aio_ring* to_ring(aio_context_t io_context) { | |
54 | return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context)); | |
55 | } | |
56 | ||
57 | static bool usable(const linux_aio_ring* ring) { | |
f67539c2 | 58 | return ring->magic == 0xa10a10a1 && ring->incompat_features == 0 && !RUNNING_ON_VALGRIND; |
11fdf7f2 TL |
59 | } |
60 | ||
61 | int io_setup(int nr_events, aio_context_t* io_context) { | |
62 | return ::syscall(SYS_io_setup, nr_events, io_context); | |
63 | } | |
64 | ||
20effc67 | 65 | int io_destroy(aio_context_t io_context) noexcept { |
11fdf7f2 TL |
66 | return ::syscall(SYS_io_destroy, io_context); |
67 | } | |
68 | ||
69 | int io_submit(aio_context_t io_context, long nr, iocb** iocbs) { | |
70 | return ::syscall(SYS_io_submit, io_context, nr, iocbs); | |
71 | } | |
72 | ||
73 | int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) { | |
74 | return ::syscall(SYS_io_cancel, io_context, iocb, result); | |
75 | } | |
76 | ||
// Attempt to reap up to nr completed events directly from the userspace-mapped
// completion ring, avoiding the io_getevents(2) syscall. Returns the number of
// events copied into `events` (possibly 0 when the timeout is zero), or -1 when
// the caller must fall back to the real syscall (ring unusable, force_syscall
// set, or not enough events to satisfy min_nr without blocking).
//
// NOTE(review): correctness here relies on only a single thread calling into
// io_getevents()/io_pgetevents() per context — see the relaxed load below.
static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
        bool force_syscall) {
    auto ring = to_ring(io_context);
    if (usable(ring) && !force_syscall) {
        // Try to complete in userspace, if enough available events,
        // or if the timeout is zero

        // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
        // only a single thread calls io_getevents()).
        auto head = ring->head.load(std::memory_order_relaxed);
        // The kernel will write to the ring from an interrupt and then release with a write
        // to ring->tail, so we must memory_order_acquire here.
        auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
        // head/tail are indices modulo ring->nr; the unsigned subtraction below
        // deliberately wraps modulo 2^32, and the += ring->nr correction brings
        // the result back into [0, ring->nr) when tail has wrapped past head.
        auto available = tail - head;
        if (tail < head) {
            available += ring->nr;
        }
        // Reap in userspace only if we can satisfy min_nr immediately, or if the
        // caller asked for a non-blocking poll (zero timeout).
        if (available >= uint32_t(min_nr)
                || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
            if (!available) {
                return 0;
            }
            // The event array starts header_length bytes into the mapping.
            auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
            auto now = std::min<uint32_t>(nr, available);
            auto start = ring_events + head;
            head += now;
            if (head < ring->nr) {
                // Contiguous range: single copy.
                std::copy(start, start + now, events);
            } else {
                // Range wraps around the end of the ring: copy the tail part,
                // then the part that wrapped to the beginning.
                head -= ring->nr;
                auto p = std::copy(start, ring_events + ring->nr, events);
                std::copy(ring_events, ring_events + head, p);
            }
            // The kernel will read ring->head and update its view of how many entries
            // in the ring are available, so memory_order_release to make sure any ring
            // accesses are completed before the update to ring->head is visible.
            ring->head.store(head, std::memory_order_release);
            return now;
        }
    }
    return -1;
}
119 | ||
120 | int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, | |
121 | bool force_syscall) { | |
122 | auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall); | |
123 | if (r >= 0) { | |
124 | return r; | |
125 | } | |
126 | return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout); | |
127 | } | |
128 | ||
129 | ||
// io_pgetevents(2) is relatively new (Linux 4.18); older kernel headers may
// not define its syscall number, so supply the well-known numbers for the
// architectures we care about. On other architectures with old headers the
// macro stays undefined and io_pgetevents() below reports ENOSYS.
#ifndef __NR_io_pgetevents

# if defined(__x86_64__)
#  define __NR_io_pgetevents 333
# elif defined(__i386__)
#  define __NR_io_pgetevents 385
# endif

#endif
139 | ||
140 | int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask, | |
141 | bool force_syscall) { | |
142 | #ifdef __NR_io_pgetevents | |
143 | auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall); | |
144 | if (r >= 0) { | |
145 | return r; | |
146 | } | |
147 | aio_sigset as; | |
148 | as.sigmask = sigmask; | |
149 | as.sigsetsize = 8; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent | |
150 | return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as); | |
151 | #else | |
152 | errno = ENOSYS; | |
153 | return -1; | |
154 | #endif | |
155 | } | |
156 | ||
9f95a23c TL |
157 | void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) { |
158 | auto r = io_setup(nr, io_context); | |
159 | if (r < 0) { | |
160 | char buf[1024]; | |
161 | char *msg = strerror_r(errno, buf, sizeof(buf)); | |
162 | throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg)); | |
163 | } | |
164 | } | |
165 | ||
11fdf7f2 TL |
166 | } |
167 | ||
168 | } |