]>
git.proxmox.com Git - ceph.git/blob - ceph/src/seastar/src/core/linux-aio.cc
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
19 * Copyright (C) 2017 ScyllaDB
22 #include <seastar/core/linux-aio.hh>
23 #include <seastar/core/print.hh>
25 #include <sys/syscall.h>
30 #include <valgrind/valgrind.h>
// Header of the AIO completion ring that to_ring() overlays on the address
// held in an aio_context_t. `head` and `tail` are atomics because they are
// loaded/stored with explicit memory ordering by try_reap_events().
// NOTE(review): this listing has gaps -- `magic` and `nr` are read elsewhere
// in this file (usable(), try_reap_events()) but their declarations and the
// closing brace are not visible here. Field order matters for an overlay
// struct; restore the missing members from the upstream file, do not guess.
38 struct linux_aio_ring
{
41 std :: atomic
< uint32_t > head
;
42 std :: atomic
< uint32_t > tail
;
44 uint32_t compat_features
;
45 uint32_t incompat_features
;
46 uint32_t header_length
;
51 using namespace linux_abi
;
53 static linux_aio_ring
* to_ring ( aio_context_t io_context
) {
54 return reinterpret_cast < linux_aio_ring
*>( uintptr_t ( io_context
));
57 static bool usable ( const linux_aio_ring
* ring
) {
58 return ring
-> magic
== 0xa10a10a1 && ring
-> incompat_features
== 0 && ! RUNNING_ON_VALGRIND
;
61 int io_setup ( int nr_events
, aio_context_t
* io_context
) {
62 return :: syscall ( SYS_io_setup
, nr_events
, io_context
);
65 int io_destroy ( aio_context_t io_context
) noexcept
{
66 return :: syscall ( SYS_io_destroy
, io_context
);
69 int io_submit ( aio_context_t io_context
, long nr
, iocb
** iocbs
) {
70 return :: syscall ( SYS_io_submit
, io_context
, nr
, iocbs
);
73 int io_cancel ( aio_context_t io_context
, iocb
* iocb
, io_event
* result
) {
74 return :: syscall ( SYS_io_cancel
, io_context
, iocb
, result
);
// Tries to collect completed events straight from the ring located at
// `io_context`, without entering the kernel.
//
// The userspace path is taken only when usable(ring) holds and
// `force_syscall` is false. It then requires either at least `min_nr`
// events to be available or a non-null `timeout` of exactly zero. At most
// min(nr, available) entries are copied into `events`, and the ring's head
// is stored back with release ordering.
//
// NOTE(review): this listing has gaps. The end of the parameter list
// (`force_syscall` is used but its declaration is not shown), any update of
// `head` before it is stored back, the branch structure around the copies,
// and every return statement are not visible here. Restore them from the
// upstream file before changing this function; the return-value contract
// cannot be determined from what is shown.
77 static int try_reap_events ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
,
79 auto ring
= to_ring ( io_context
);
80 if ( usable ( ring
) && ! force_syscall
) {
81 // Try to complete in userspace, if enough available events,
82 // or if the timeout is zero
84 // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
85 // only a single thread calls io_getevents()).
86 auto head
= ring
-> head
. load ( std :: memory_order_relaxed
);
87 // The kernel will write to the ring from an interrupt and then release with a write
88 // to ring->tail, so we must memory_order_acquire here.
89 auto tail
= ring
-> tail
. load ( std :: memory_order_acquire
); // kernel writes from interrupts
90 auto available
= tail
- head
;
// NOTE(review): presumably this adjustment applies only when the ring has
// wrapped (tail behind head); the guarding condition is not visible -- confirm.
92 available
+= ring
-> nr
;
94 if ( available
>= uint32_t ( min_nr
)
95 || ( timeout
&& timeout
-> tv_sec
== 0 && timeout
-> tv_nsec
== 0 )) {
// The event array begins header_length bytes past the ring's base address.
99 auto ring_events
= reinterpret_cast < const io_event
*>( uintptr_t ( io_context
) + ring
-> header_length
);
100 auto now
= std :: min
< uint32_t >( nr
, available
);
101 auto start
= ring_events
+ head
;
// Single-piece copy of `now` events beginning at `start`.
103 if ( head
< ring
-> nr
) {
104 std :: copy ( start
, start
+ now
, events
);
// Two-piece copy: from `start` to the end of the ring, then from the
// beginning of the ring up to `head`.
107 auto p
= std :: copy ( start
, ring_events
+ ring
-> nr
, events
);
108 std :: copy ( ring_events
, ring_events
+ head
, p
);
110 // The kernel will read ring->head and update its view of how many entries
111 // in the ring are available, so memory_order_release to make sure any ring
112 // accesses are completed before the update to ring->head is visible.
113 ring
-> head
. store ( head
, std :: memory_order_release
);
// Retrieves completed events for `io_context` into `events`.
// First calls try_reap_events() with the same arguments (including
// `force_syscall`); the SYS_io_getevents system call is the remaining path.
// NOTE(review): the statements that inspect `r` between the two calls, and
// the closing brace, are not visible in this listing -- presumably a
// successful userspace reap returns early; confirm against upstream.
120 int io_getevents ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
,
121 bool force_syscall
) {
122 auto r
= try_reap_events ( io_context
, min_nr
, nr
, events
, timeout
, force_syscall
);
126 return :: syscall ( SYS_io_getevents
, io_context
, min_nr
, nr
, events
, timeout
);
130 #ifndef __NR_io_pgetevents
132 # if defined(__x86_64__)
133 # define __NR_io_pgetevents 333
134 # elif defined(__i386__)
135 # define __NR_io_pgetevents 385
// Variant of event retrieval that also passes a signal mask to the kernel.
// When __NR_io_pgetevents is defined, it first calls try_reap_events() and
// then issues the __NR_io_pgetevents system call, handing `sigmask` over in
// the `as` structure together with a hard-coded sigsetsize of 8.
// NOTE(review): the declaration of `as`, the handling of `r`, the branch
// taken when __NR_io_pgetevents is not defined, and the closing
// #endif / brace are not visible in this listing -- confirm against upstream.
140 int io_pgetevents ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
, const sigset_t
* sigmask
,
141 bool force_syscall
) {
142 #ifdef __NR_io_pgetevents
143 auto r
= try_reap_events ( io_context
, min_nr
, nr
, events
, timeout
, force_syscall
);
148 as
. sigmask
= sigmask
;
149 as
. sigsetsize
= 8 ; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
150 return :: syscall ( __NR_io_pgetevents
, io_context
, min_nr
, nr
, events
, timeout
, & as
);
// Creates an AIO context by calling io_setup(nr, io_context).
// On the error path it builds a message from strerror_r(errno, ...) and
// throws std::runtime_error with advice about /proc/sys/fs/aio-max-nr.
// NOTE(review): the condition on `r` that selects the error path, the
// declaration of `buf`, and the closing braces are not visible in this
// listing -- confirm against upstream. The `char*` result of strerror_r
// implies the GNU variant of that function is expected.
157 void setup_aio_context ( size_t nr
, linux_abi :: aio_context_t
* io_context
) {
158 auto r
= io_setup ( nr
, io_context
);
161 char * msg
= strerror_r ( errno
, buf
, sizeof ( buf
));
162 throw std :: runtime_error ( fmt :: format ( "Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application" , msg
));