]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2017 ScyllaDB | |
20 | */ | |
21 | ||
22 | #include <seastar/core/linux-aio.hh> | |
9f95a23c | 23 | #include <seastar/core/print.hh> |
11fdf7f2 TL |
24 | #include <unistd.h> |
25 | #include <sys/syscall.h> | |
26 | #include <atomic> | |
27 | #include <algorithm> | |
28 | #include <errno.h> | |
9f95a23c | 29 | #include <string.h> |
f67539c2 | 30 | #include <valgrind/valgrind.h> |
11fdf7f2 TL |
31 | |
32 | namespace seastar { | |
33 | ||
34 | namespace internal { | |
35 | ||
36 | namespace linux_abi { | |
37 | ||
// Userspace view of the kernel's AIO completion ring, mapped by io_setup().
// The field layout mirrors the kernel's `struct aio_ring` ABI exactly — do not
// reorder, rename, or change the types of these members.
struct linux_aio_ring {
    uint32_t id;                      // kernel-internal ring identifier
    uint32_t nr;                      // number of io_event slots in the ring
    // head is written by userspace (the reaper), tail by the kernel; both are
    // indices into the event array, wrapping at nr. std::atomic<> is used so we
    // can pair acquire/release with the kernel's ordering on these words.
    std::atomic<uint32_t> head;
    std::atomic<uint32_t> tail;
    uint32_t magic;                   // expected to be 0xa10a10a1 (see usable())
    uint32_t compat_features;
    uint32_t incompat_features;       // nonzero means we must not touch the ring directly
    uint32_t header_length;           // byte offset from ring start to the event array
};
48 | ||
49 | } | |
50 | ||
51 | using namespace linux_abi; | |
52 | ||
53 | static linux_aio_ring* to_ring(aio_context_t io_context) { | |
54 | return reinterpret_cast<linux_aio_ring*>(uintptr_t(io_context)); | |
55 | } | |
56 | ||
57 | static bool usable(const linux_aio_ring* ring) { | |
f67539c2 | 58 | return ring->magic == 0xa10a10a1 && ring->incompat_features == 0 && !RUNNING_ON_VALGRIND; |
11fdf7f2 TL |
59 | } |
60 | ||
61 | int io_setup(int nr_events, aio_context_t* io_context) { | |
62 | return ::syscall(SYS_io_setup, nr_events, io_context); | |
63 | } | |
64 | ||
20effc67 | 65 | int io_destroy(aio_context_t io_context) noexcept { |
11fdf7f2 TL |
66 | return ::syscall(SYS_io_destroy, io_context); |
67 | } | |
68 | ||
69 | int io_submit(aio_context_t io_context, long nr, iocb** iocbs) { | |
70 | return ::syscall(SYS_io_submit, io_context, nr, iocbs); | |
71 | } | |
72 | ||
73 | int io_cancel(aio_context_t io_context, iocb* iocb, io_event* result) { | |
74 | return ::syscall(SYS_io_cancel, io_context, iocb, result); | |
75 | } | |
76 | ||
// Attempt to reap up to nr completed events directly from the userspace-mapped
// completion ring, avoiding the io_getevents(2) syscall. Returns the number of
// events copied into `events` (possibly 0 when the timeout is zero), or -1 when
// the caller must fall back to the real syscall (ring unusable, force_syscall
// set, or not enough events to satisfy min_nr without blocking).
//
// NOTE(review): correctness here relies on only a single thread calling into
// io_getevents()/io_pgetevents() per context — see the relaxed load below.
static int try_reap_events(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout,
        bool force_syscall) {
    auto ring = to_ring(io_context);
    if (usable(ring) && !force_syscall) {
        // Try to complete in userspace, if enough available events,
        // or if the timeout is zero

        // We're the only writer to ->head, so we can load with memory_order_relaxed (assuming
        // only a single thread calls io_getevents()).
        auto head = ring->head.load(std::memory_order_relaxed);
        // The kernel will write to the ring from an interrupt and then release with a write
        // to ring->tail, so we must memory_order_acquire here.
        auto tail = ring->tail.load(std::memory_order_acquire); // kernel writes from interrupts
        // head/tail are indices modulo ring->nr; the unsigned subtraction below
        // deliberately wraps modulo 2^32, and the += ring->nr correction brings
        // the result back into [0, ring->nr) when tail has wrapped past head.
        auto available = tail - head;
        if (tail < head) {
            available += ring->nr;
        }
        // Reap in userspace only if we can satisfy min_nr immediately, or if the
        // caller asked for a non-blocking poll (zero timeout).
        if (available >= uint32_t(min_nr)
                || (timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0)) {
            if (!available) {
                return 0;
            }
            // The event array starts header_length bytes into the mapping.
            auto ring_events = reinterpret_cast<const io_event*>(uintptr_t(io_context) + ring->header_length);
            auto now = std::min<uint32_t>(nr, available);
            auto start = ring_events + head;
            head += now;
            if (head < ring->nr) {
                // Contiguous range: single copy.
                std::copy(start, start + now, events);
            } else {
                // Range wraps around the end of the ring: copy the tail part,
                // then the part that wrapped to the beginning.
                head -= ring->nr;
                auto p = std::copy(start, ring_events + ring->nr, events);
                std::copy(ring_events, ring_events + head, p);
            }
            // The kernel will read ring->head and update its view of how many entries
            // in the ring are available, so memory_order_release to make sure any ring
            // accesses are completed before the update to ring->head is visible.
            ring->head.store(head, std::memory_order_release);
            return now;
        }
    }
    return -1;
}
119 | ||
120 | int io_getevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, | |
121 | bool force_syscall) { | |
122 | auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall); | |
123 | if (r >= 0) { | |
124 | return r; | |
125 | } | |
126 | return ::syscall(SYS_io_getevents, io_context, min_nr, nr, events, timeout); | |
127 | } | |
128 | ||
129 | ||
// io_pgetevents(2) is relatively new (Linux 4.18); older kernel headers may
// not define its syscall number, so supply the well-known numbers for the
// architectures we care about. On other architectures with old headers the
// macro stays undefined and io_pgetevents() below reports ENOSYS.
#ifndef __NR_io_pgetevents

# if defined(__x86_64__)
#  define __NR_io_pgetevents 333
# elif defined(__i386__)
#  define __NR_io_pgetevents 385
# endif

#endif
139 | ||
140 | int io_pgetevents(aio_context_t io_context, long min_nr, long nr, io_event* events, const ::timespec* timeout, const sigset_t* sigmask, | |
141 | bool force_syscall) { | |
142 | #ifdef __NR_io_pgetevents | |
143 | auto r = try_reap_events(io_context, min_nr, nr, events, timeout, force_syscall); | |
144 | if (r >= 0) { | |
145 | return r; | |
146 | } | |
147 | aio_sigset as; | |
148 | as.sigmask = sigmask; | |
149 | as.sigsetsize = 8; // Can't use sizeof(*sigmask) because user and kernel sigset_t are inconsistent | |
150 | return ::syscall(__NR_io_pgetevents, io_context, min_nr, nr, events, timeout, &as); | |
151 | #else | |
152 | errno = ENOSYS; | |
153 | return -1; | |
154 | #endif | |
155 | } | |
156 | ||
9f95a23c TL |
157 | void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context) { |
158 | auto r = io_setup(nr, io_context); | |
159 | if (r < 0) { | |
160 | char buf[1024]; | |
161 | char *msg = strerror_r(errno, buf, sizeof(buf)); | |
162 | throw std::runtime_error(fmt::format("Could not setup Async I/O: {}. The most common cause is not enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that number or reducing the amount of logical CPUs available for your application", msg)); | |
163 | } | |
164 | } | |
165 | ||
11fdf7f2 TL |
166 | } |
167 | ||
168 | } |