// Source: ceph.git (git.proxmox.com), ceph/src/seastar/src/core/linux-aio.cc
/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2017 ScyllaDB
 */
#include <seastar/core/linux-aio.hh>
#include <seastar/core/print.hh>
#include <unistd.h>
#include <sys/syscall.h>
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <stdexcept>
#include <errno.h>
#include <string.h>
// Userspace view of the header the kernel places at the start of the memory
// addressed by an aio_context_t. Only the header is described here; the
// io_event slots begin header_length bytes from the start of the mapping.
//
// The member order and widths must match the kernel's ring header exactly,
// so do not reorder or resize anything.
//
// NOTE(review): `id`, `nr` and `magic` were absent from the text this struct
// was restored from, although the functions below read `nr` and `magic`; they
// are reinstated at the offsets the kernel ABI uses -- confirm against upstream.
struct linux_aio_ring {
    uint32_t id;
    uint32_t nr;                    // number of io_event slots in the ring
    std::atomic<uint32_t> head;     // consumer index, advanced by userspace
    std::atomic<uint32_t> tail;     // producer index, advanced by the kernel
    uint32_t magic;                 // ring signature, checked before use
    uint32_t compat_features;
    uint32_t incompat_features;     // non-zero means a layout we do not understand
    uint32_t header_length;         // byte offset from the ring start to the first io_event
};

// The overlay is only valid if the atomic wrappers add no padding.
static_assert(sizeof(std::atomic<uint32_t>) == sizeof(uint32_t),
        "std::atomic<uint32_t> must be layout-compatible with uint32_t");
50 using namespace linux_abi
;
52 static linux_aio_ring
* to_ring ( aio_context_t io_context
) {
53 return reinterpret_cast < linux_aio_ring
*>( uintptr_t ( io_context
));
56 static bool usable ( const linux_aio_ring
* ring
) {
57 return ring
-> magic
== 0xa10a10a1 && ring
-> incompat_features
== 0 ;
60 int io_setup ( int nr_events
, aio_context_t
* io_context
) {
61 return :: syscall ( SYS_io_setup
, nr_events
, io_context
);
64 int io_destroy ( aio_context_t io_context
) {
65 return :: syscall ( SYS_io_destroy
, io_context
);
68 int io_submit ( aio_context_t io_context
, long nr
, iocb
** iocbs
) {
69 return :: syscall ( SYS_io_submit
, io_context
, nr
, iocbs
);
72 int io_cancel ( aio_context_t io_context
, iocb
* iocb
, io_event
* result
) {
73 return :: syscall ( SYS_io_cancel
, io_context
, iocb
, result
);
76 static int try_reap_events ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
,
78 auto ring
= to_ring ( io_context
);
79 if ( usable ( ring
) && ! force_syscall
) {
80 // Try to complete in userspace, if enough available events,
81 // or if the timeout is zero
83 // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
84 // only a single thread calls io_getevents()).
85 auto head
= ring
-> head
. load ( std :: memory_order_relaxed
);
86 // The kernel will write to the ring from an interrupt and then release with a write
87 // to ring->tail, so we must memory_order_acquire here.
88 auto tail
= ring
-> tail
. load ( std :: memory_order_acquire
); // kernel writes from interrupts
89 auto available
= tail
- head
;
91 available
+= ring
-> nr
;
93 if ( available
>= uint32_t ( min_nr
)
94 || ( timeout
&& timeout
-> tv_sec
== 0 && timeout
-> tv_nsec
== 0 )) {
98 auto ring_events
= reinterpret_cast < const io_event
*>( uintptr_t ( io_context
) + ring
-> header_length
);
99 auto now
= std :: min
< uint32_t >( nr
, available
);
100 auto start
= ring_events
+ head
;
102 if ( head
< ring
-> nr
) {
103 std :: copy ( start
, start
+ now
, events
);
106 auto p
= std :: copy ( start
, ring_events
+ ring
-> nr
, events
);
107 std :: copy ( ring_events
, ring_events
+ head
, p
);
109 // The kernel will read ring->head and update its view of how many entries
110 // in the ring are available, so memory_order_release to make sure any ring
111 // accesses are completed before the update to ring->head is visible.
112 ring
-> head
. store ( head
, std :: memory_order_release
);
119 int io_getevents ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
,
120 bool force_syscall
) {
121 auto r
= try_reap_events ( io_context
, min_nr
, nr
, events
, timeout
, force_syscall
);
125 return :: syscall ( SYS_io_getevents
, io_context
, min_nr
, nr
, events
, timeout
);
// Fallback syscall numbers for io_pgetevents, used when the system headers
// are too old to define __NR_io_pgetevents. On architectures not listed here
// the macro stays undefined.
#ifndef __NR_io_pgetevents

#  if defined(__x86_64__)
#    define __NR_io_pgetevents 333
#  elif defined(__i386__)
#    define __NR_io_pgetevents 385
#  endif

#endif
139 int io_pgetevents ( aio_context_t io_context
, long min_nr
, long nr
, io_event
* events
, const :: timespec
* timeout
, const sigset_t
* sigmask
,
140 bool force_syscall
) {
141 #ifdef __NR_io_pgetevents
142 auto r
= try_reap_events ( io_context
, min_nr
, nr
, events
, timeout
, force_syscall
);
147 as
. sigmask
= sigmask
;
148 as
. sigsetsize
= 8 ; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent
149 return :: syscall ( __NR_io_pgetevents
, io_context
, min_nr
, nr
, events
, timeout
, & as
);
156 void setup_aio_context ( size_t nr
, linux_abi :: aio_context_t
* io_context
) {
157 auto r
= io_setup ( nr
, io_context
);
160 char * msg
= strerror_r ( errno
, buf
, sizeof ( buf
));
161 throw std :: runtime_error ( fmt :: format ( "Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application" , msg
));