]>
Commit | Line | Data |
---|---|---|
5c6c3a6c CH |
1 | /* |
2 | * Linux native AIO support. | |
3 | * | |
4 | * Copyright (C) 2009 IBM, Corp. | |
5 | * Copyright (C) 2009 Red Hat, Inc. | |
6 | * | |
7 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
8 | * See the COPYING file in the top-level directory. | |
9 | */ | |
80c71a24 | 10 | #include "qemu/osdep.h" |
737e150e | 11 | #include "block/aio.h" |
1de7afc9 | 12 | #include "qemu/queue.h" |
2174f12b | 13 | #include "block/block.h" |
9f8540ec | 14 | #include "block/raw-aio.h" |
1de7afc9 | 15 | #include "qemu/event_notifier.h" |
2174f12b | 16 | #include "qemu/coroutine.h" |
433fcea4 | 17 | #include "qemu/defer-call.h" |
ed6e2161 | 18 | #include "qapi/error.h" |
07668288 | 19 | #include "sysemu/block-backend.h" |
5c6c3a6c | 20 | |
ab50533b EGE |
21 | /* Only used for assertions. */ |
22 | #include "qemu/coroutine_int.h" | |
23 | ||
5c6c3a6c CH |
24 | #include <libaio.h> |
25 | ||
26 | /* | |
27 | * Queue size (per-device). | |
28 | * | |
29 | * XXX: eventually we need to communicate this to the guest and/or make it | |
30 | * tunable by the guest. If we get more outstanding requests at a time | |
31 | * than this we will get EAGAIN from io_submit which is communicated to | |
32 | * the guest as an I/O error. | |
33 | */ | |
2558cb8d | 34 | #define MAX_EVENTS 1024 |
5c6c3a6c | 35 | |
d7ddd0a1 SG |
36 | /* Maximum number of requests in a batch. (default value) */ |
37 | #define DEFAULT_MAX_BATCH 32 | |
38 | ||
5c6c3a6c | 39 | struct qemu_laiocb { |
2174f12b | 40 | Coroutine *co; |
dd7f7ed1 | 41 | LinuxAioState *ctx; |
5c6c3a6c CH |
42 | struct iocb iocb; |
43 | ssize_t ret; | |
44 | size_t nbytes; | |
b161e2e4 KW |
45 | QEMUIOVector *qiov; |
46 | bool is_read; | |
28b24087 | 47 | QSIMPLEQ_ENTRY(qemu_laiocb) next; |
5c6c3a6c CH |
48 | }; |
49 | ||
1b3abdcc | 50 | typedef struct { |
5e1b34a3 RP |
51 | unsigned int in_queue; |
52 | unsigned int in_flight; | |
43f2376e | 53 | bool blocked; |
28b24087 | 54 | QSIMPLEQ_HEAD(, qemu_laiocb) pending; |
1b3abdcc ML |
55 | } LaioQueue; |
56 | ||
dd7f7ed1 | 57 | struct LinuxAioState { |
0187f5c9 PB |
58 | AioContext *aio_context; |
59 | ||
5c6c3a6c | 60 | io_context_t ctx; |
c90caf25 | 61 | EventNotifier e; |
1b3abdcc | 62 | |
ab50533b | 63 | /* No locking required, only accessed from AioContext home thread */ |
1b3abdcc | 64 | LaioQueue io_q; |
2cdff7f6 | 65 | QEMUBH *completion_bh; |
2cdff7f6 SH |
66 | int event_idx; |
67 | int event_max; | |
5c6c3a6c CH |
68 | }; |
69 | ||
dd7f7ed1 | 70 | static void ioq_submit(LinuxAioState *s); |
28b24087 | 71 | |
5c6c3a6c CH |
72 | static inline ssize_t io_event_ret(struct io_event *ev) |
73 | { | |
74 | return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); | |
75 | } | |
76 | ||
db0ffc24 | 77 | /* |
2b02fd81 | 78 | * Completes an AIO request. |
db0ffc24 | 79 | */ |
dd7f7ed1 | 80 | static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) |
db0ffc24 KW |
81 | { |
82 | int ret; | |
83 | ||
db0ffc24 KW |
84 | ret = laiocb->ret; |
85 | if (ret != -ECANCELED) { | |
b161e2e4 | 86 | if (ret == laiocb->nbytes) { |
db0ffc24 | 87 | ret = 0; |
b161e2e4 KW |
88 | } else if (ret >= 0) { |
89 | /* Short reads mean EOF, pad with zeros. */ | |
90 | if (laiocb->is_read) { | |
3d9b4925 MT |
91 | qemu_iovec_memset(laiocb->qiov, ret, 0, |
92 | laiocb->qiov->size - ret); | |
b161e2e4 | 93 | } else { |
1c42f149 | 94 | ret = -ENOSPC; |
b161e2e4 KW |
95 | } |
96 | } | |
db0ffc24 KW |
97 | } |
98 | ||
2174f12b | 99 | laiocb->ret = ret; |
2b02fd81 JS |
100 | |
101 | /* | |
102 | * If the coroutine is already entered it must be in ioq_submit() and | |
103 | * will notice laio->ret has been filled in when it eventually runs | |
104 | * later. Coroutines cannot be entered recursively so avoid doing | |
105 | * that! | |
106 | */ | |
ab50533b | 107 | assert(laiocb->co->ctx == laiocb->ctx->aio_context); |
2b02fd81 JS |
108 | if (!qemu_coroutine_entered(laiocb->co)) { |
109 | aio_co_wake(laiocb->co); | |
2174f12b | 110 | } |
db0ffc24 KW |
111 | } |
112 | ||
9e909a58 RP |
113 | /** |
114 | * aio_ring buffer which is shared between userspace and kernel. | |
115 | * | |
116 | * This copied from linux/fs/aio.c, common header does not exist | |
117 | * but AIO exists for ages so we assume ABI is stable. | |
118 | */ | |
119 | struct aio_ring { | |
120 | unsigned id; /* kernel internal index number */ | |
121 | unsigned nr; /* number of io_events */ | |
122 | unsigned head; /* Written to by userland or by kernel. */ | |
123 | unsigned tail; | |
124 | ||
125 | unsigned magic; | |
126 | unsigned compat_features; | |
127 | unsigned incompat_features; | |
128 | unsigned header_length; /* size of aio_ring */ | |
129 | ||
f7795e40 | 130 | struct io_event io_events[]; |
9e909a58 RP |
131 | }; |
132 | ||
133 | /** | |
134 | * io_getevents_peek: | |
135 | * @ctx: AIO context | |
136 | * @events: pointer on events array, output value | |
137 | ||
138 | * Returns the number of completed events and sets a pointer | |
139 | * on events array. This function does not update the internal | |
140 | * ring buffer, only reads head and tail. When @events has been | |
141 | * processed io_getevents_commit() must be called. | |
142 | */ | |
143 | static inline unsigned int io_getevents_peek(io_context_t ctx, | |
144 | struct io_event **events) | |
145 | { | |
146 | struct aio_ring *ring = (struct aio_ring *)ctx; | |
147 | unsigned int head = ring->head, tail = ring->tail; | |
148 | unsigned int nr; | |
149 | ||
150 | nr = tail >= head ? tail - head : ring->nr - head; | |
151 | *events = ring->io_events + head; | |
152 | /* To avoid speculative loads of s->events[i] before observing tail. | |
153 | Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */ | |
154 | smp_rmb(); | |
155 | ||
156 | return nr; | |
157 | } | |
158 | ||
159 | /** | |
160 | * io_getevents_commit: | |
161 | * @ctx: AIO context | |
162 | * @nr: the number of events on which head should be advanced | |
163 | * | |
164 | * Advances head of a ring buffer. | |
165 | */ | |
166 | static inline void io_getevents_commit(io_context_t ctx, unsigned int nr) | |
167 | { | |
168 | struct aio_ring *ring = (struct aio_ring *)ctx; | |
169 | ||
170 | if (nr) { | |
171 | ring->head = (ring->head + nr) % ring->nr; | |
172 | } | |
173 | } | |
174 | ||
175 | /** | |
176 | * io_getevents_advance_and_peek: | |
177 | * @ctx: AIO context | |
178 | * @events: pointer on events array, output value | |
179 | * @nr: the number of events on which head should be advanced | |
180 | * | |
181 | * Advances head of a ring buffer and returns number of elements left. | |
182 | */ | |
183 | static inline unsigned int | |
184 | io_getevents_advance_and_peek(io_context_t ctx, | |
185 | struct io_event **events, | |
186 | unsigned int nr) | |
187 | { | |
188 | io_getevents_commit(ctx, nr); | |
189 | return io_getevents_peek(ctx, events); | |
190 | } | |
191 | ||
3407de57 RP |
192 | /** |
193 | * qemu_laio_process_completions: | |
194 | * @s: AIO state | |
195 | * | |
196 | * Fetches completed I/O requests and invokes their callbacks. | |
2cdff7f6 SH |
197 | * |
198 | * The function is somewhat tricky because it supports nested event loops, for | |
199 | * example when a request callback invokes aio_poll(). In order to do this, | |
3407de57 RP |
200 | * indices are kept in LinuxAioState. Function schedules BH completion so it |
201 | * can be called again in a nested event loop. When there are no events left | |
202 | * to complete the BH is being canceled. | |
2cdff7f6 | 203 | */ |
3407de57 | 204 | static void qemu_laio_process_completions(LinuxAioState *s) |
5c6c3a6c | 205 | { |
9e909a58 | 206 | struct io_event *events; |
5c6c3a6c | 207 | |
84d61e5f SH |
208 | defer_call_begin(); |
209 | ||
2cdff7f6 SH |
210 | /* Reschedule so nested event loops see currently pending completions */ |
211 | qemu_bh_schedule(s->completion_bh); | |
5c6c3a6c | 212 | |
9e909a58 RP |
213 | while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events, |
214 | s->event_idx))) { | |
215 | for (s->event_idx = 0; s->event_idx < s->event_max; ) { | |
216 | struct iocb *iocb = events[s->event_idx].obj; | |
217 | struct qemu_laiocb *laiocb = | |
2cdff7f6 SH |
218 | container_of(iocb, struct qemu_laiocb, iocb); |
219 | ||
9e909a58 | 220 | laiocb->ret = io_event_ret(&events[s->event_idx]); |
2cdff7f6 | 221 | |
9e909a58 RP |
222 | /* Change counters one-by-one because we can be nested. */ |
223 | s->io_q.in_flight--; | |
224 | s->event_idx++; | |
225 | qemu_laio_process_completion(laiocb); | |
226 | } | |
2cdff7f6 | 227 | } |
28b24087 | 228 | |
9e909a58 RP |
229 | qemu_bh_cancel(s->completion_bh); |
230 | ||
231 | /* If we are nested we have to notify the level above that we are done | |
232 | * by setting event_max to zero, upper level will then jump out of it's | |
3202d8e4 | 233 | * own `for` loop. If we are the last all counters dropped to zero. */ |
9e909a58 RP |
234 | s->event_max = 0; |
235 | s->event_idx = 0; | |
84d61e5f SH |
236 | |
237 | defer_call_end(); | |
3407de57 | 238 | } |
9e909a58 | 239 | |
3407de57 RP |
240 | static void qemu_laio_process_completions_and_submit(LinuxAioState *s) |
241 | { | |
242 | qemu_laio_process_completions(s); | |
1919631e | 243 | |
07668288 | 244 | if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
28b24087 PB |
245 | ioq_submit(s); |
246 | } | |
2cdff7f6 SH |
247 | } |
248 | ||
3407de57 RP |
249 | static void qemu_laio_completion_bh(void *opaque) |
250 | { | |
251 | LinuxAioState *s = opaque; | |
252 | ||
253 | qemu_laio_process_completions_and_submit(s); | |
254 | } | |
255 | ||
2cdff7f6 SH |
256 | static void qemu_laio_completion_cb(EventNotifier *e) |
257 | { | |
dd7f7ed1 | 258 | LinuxAioState *s = container_of(e, LinuxAioState, e); |
2cdff7f6 SH |
259 | |
260 | if (event_notifier_test_and_clear(&s->e)) { | |
3407de57 | 261 | qemu_laio_process_completions_and_submit(s); |
5c6c3a6c CH |
262 | } |
263 | } | |
264 | ||
ee686975 SH |
265 | static bool qemu_laio_poll_cb(void *opaque) |
266 | { | |
267 | EventNotifier *e = opaque; | |
268 | LinuxAioState *s = container_of(e, LinuxAioState, e); | |
269 | struct io_event *events; | |
270 | ||
826cc324 SH |
271 | return io_getevents_peek(s->ctx, &events); |
272 | } | |
273 | ||
274 | static void qemu_laio_poll_ready(EventNotifier *opaque) | |
275 | { | |
276 | EventNotifier *e = opaque; | |
277 | LinuxAioState *s = container_of(e, LinuxAioState, e); | |
ee686975 SH |
278 | |
279 | qemu_laio_process_completions_and_submit(s); | |
ee686975 SH |
280 | } |
281 | ||
1b3abdcc ML |
282 | static void ioq_init(LaioQueue *io_q) |
283 | { | |
28b24087 | 284 | QSIMPLEQ_INIT(&io_q->pending); |
5e1b34a3 RP |
285 | io_q->in_queue = 0; |
286 | io_q->in_flight = 0; | |
43f2376e | 287 | io_q->blocked = false; |
1b3abdcc ML |
288 | } |
289 | ||
dd7f7ed1 | 290 | static void ioq_submit(LinuxAioState *s) |
1b3abdcc | 291 | { |
82595da8 | 292 | int ret, len; |
28b24087 | 293 | struct qemu_laiocb *aiocb; |
5e1b34a3 | 294 | struct iocb *iocbs[MAX_EVENTS]; |
82595da8 | 295 | QSIMPLEQ_HEAD(, qemu_laiocb) completed; |
1b3abdcc | 296 | |
43f2376e | 297 | do { |
5e1b34a3 RP |
298 | if (s->io_q.in_flight >= MAX_EVENTS) { |
299 | break; | |
300 | } | |
43f2376e PB |
301 | len = 0; |
302 | QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { | |
303 | iocbs[len++] = &aiocb->iocb; | |
5e1b34a3 | 304 | if (s->io_q.in_flight + len >= MAX_EVENTS) { |
43f2376e PB |
305 | break; |
306 | } | |
28b24087 | 307 | } |
1b3abdcc | 308 | |
43f2376e PB |
309 | ret = io_submit(s->ctx, len, iocbs); |
310 | if (ret == -EAGAIN) { | |
82595da8 | 311 | break; |
43f2376e PB |
312 | } |
313 | if (ret < 0) { | |
44713c9e KW |
314 | /* Fail the first request, retry the rest */ |
315 | aiocb = QSIMPLEQ_FIRST(&s->io_q.pending); | |
316 | QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next); | |
317 | s->io_q.in_queue--; | |
318 | aiocb->ret = ret; | |
319 | qemu_laio_process_completion(aiocb); | |
320 | continue; | |
43f2376e PB |
321 | } |
322 | ||
5e1b34a3 RP |
323 | s->io_q.in_flight += ret; |
324 | s->io_q.in_queue -= ret; | |
82595da8 PB |
325 | aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); |
326 | QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); | |
43f2376e | 327 | } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); |
5e1b34a3 | 328 | s->io_q.blocked = (s->io_q.in_queue > 0); |
0ed93d84 RP |
329 | |
330 | if (s->io_q.in_flight) { | |
331 | /* We can try to complete something just right away if there are | |
332 | * still requests in-flight. */ | |
333 | qemu_laio_process_completions(s); | |
334 | /* | |
335 | * Even we have completed everything (in_flight == 0), the queue can | |
336 | * have still pended requests (in_queue > 0). We do not attempt to | |
337 | * repeat submission to avoid IO hang. The reason is simple: s->e is | |
338 | * still set and completion callback will be called shortly and all | |
339 | * pended requests will be submitted from there. | |
340 | */ | |
341 | } | |
1b3abdcc ML |
342 | } |
343 | ||
512da211 SG |
344 | static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) |
345 | { | |
346 | uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH; | |
347 | ||
348 | /* | |
349 | * AIO context can be shared between multiple block devices, so | |
350 | * `dev_max_batch` allows reducing the batch size for latency-sensitive | |
351 | * devices. | |
352 | */ | |
353 | max_batch = MIN_NON_ZERO(dev_max_batch, max_batch); | |
354 | ||
355 | /* limit the batch with the number of available events */ | |
356 | max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch); | |
357 | ||
358 | return max_batch; | |
359 | } | |
360 | ||
ccee48aa | 361 | static void laio_deferred_fn(void *opaque) |
1b3abdcc | 362 | { |
07668288 | 363 | LinuxAioState *s = opaque; |
f387cac5 | 364 | |
07668288 | 365 | if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
de354644 | 366 | ioq_submit(s); |
1b3abdcc | 367 | } |
1b3abdcc ML |
368 | } |
369 | ||
2174f12b | 370 | static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, |
512da211 | 371 | int type, uint64_t dev_max_batch) |
5c6c3a6c | 372 | { |
2174f12b KW |
373 | LinuxAioState *s = laiocb->ctx; |
374 | struct iocb *iocbs = &laiocb->iocb; | |
375 | QEMUIOVector *qiov = laiocb->qiov; | |
5c6c3a6c CH |
376 | |
377 | switch (type) { | |
378 | case QEMU_AIO_WRITE: | |
379 | io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); | |
7d37435b | 380 | break; |
4751d09a SL |
381 | case QEMU_AIO_ZONE_APPEND: |
382 | io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); | |
383 | break; | |
5c6c3a6c CH |
384 | case QEMU_AIO_READ: |
385 | io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); | |
7d37435b | 386 | break; |
c30e624d | 387 | /* Currently Linux kernel does not support other operations */ |
5c6c3a6c CH |
388 | default: |
389 | fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", | |
390 | __func__, type); | |
2174f12b | 391 | return -EIO; |
5c6c3a6c | 392 | } |
c90caf25 | 393 | io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); |
5c6c3a6c | 394 | |
28b24087 | 395 | QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); |
5e1b34a3 | 396 | s->io_q.in_queue++; |
07668288 SH |
397 | if (!s->io_q.blocked) { |
398 | if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) { | |
399 | ioq_submit(s); | |
400 | } else { | |
ccee48aa | 401 | defer_call(laio_deferred_fn, s); |
07668288 | 402 | } |
1b3abdcc | 403 | } |
5c6c3a6c | 404 | |
2174f12b KW |
405 | return 0; |
406 | } | |
407 | ||
ab50533b EGE |
408 | int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, |
409 | int type, uint64_t dev_max_batch) | |
2174f12b | 410 | { |
2174f12b | 411 | int ret; |
ab50533b | 412 | AioContext *ctx = qemu_get_current_aio_context(); |
2174f12b KW |
413 | struct qemu_laiocb laiocb = { |
414 | .co = qemu_coroutine_self(), | |
9d52aa3c | 415 | .nbytes = qiov->size, |
ab50533b | 416 | .ctx = aio_get_linux_aio(ctx), |
0ed93d84 | 417 | .ret = -EINPROGRESS, |
2174f12b KW |
418 | .is_read = (type == QEMU_AIO_READ), |
419 | .qiov = qiov, | |
420 | }; | |
421 | ||
512da211 | 422 | ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch); |
2174f12b KW |
423 | if (ret < 0) { |
424 | return ret; | |
425 | } | |
426 | ||
0ed93d84 RP |
427 | if (laiocb.ret == -EINPROGRESS) { |
428 | qemu_coroutine_yield(); | |
429 | } | |
2174f12b KW |
430 | return laiocb.ret; |
431 | } | |
432 | ||
dd7f7ed1 | 433 | void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context) |
c2f3426c | 434 | { |
60f782b6 | 435 | aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL); |
2cdff7f6 | 436 | qemu_bh_delete(s->completion_bh); |
1919631e | 437 | s->aio_context = NULL; |
c2f3426c SH |
438 | } |
439 | ||
dd7f7ed1 | 440 | void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context) |
c2f3426c | 441 | { |
0187f5c9 | 442 | s->aio_context = new_context; |
2cdff7f6 | 443 | s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); |
60f782b6 | 444 | aio_set_event_notifier(new_context, &s->e, |
ee686975 | 445 | qemu_laio_completion_cb, |
826cc324 SH |
446 | qemu_laio_poll_cb, |
447 | qemu_laio_poll_ready); | |
c2f3426c SH |
448 | } |
449 | ||
ed6e2161 | 450 | LinuxAioState *laio_init(Error **errp) |
5c6c3a6c | 451 | { |
ed6e2161 | 452 | int rc; |
dd7f7ed1 | 453 | LinuxAioState *s; |
5c6c3a6c | 454 | |
7267c094 | 455 | s = g_malloc0(sizeof(*s)); |
ed6e2161 NA |
456 | rc = event_notifier_init(&s->e, false); |
457 | if (rc < 0) { | |
7a21bee2 | 458 | error_setg_errno(errp, -rc, "failed to initialize event notifier"); |
5c6c3a6c | 459 | goto out_free_state; |
c90caf25 | 460 | } |
5c6c3a6c | 461 | |
ed6e2161 NA |
462 | rc = io_setup(MAX_EVENTS, &s->ctx); |
463 | if (rc < 0) { | |
464 | error_setg_errno(errp, -rc, "failed to create linux AIO context"); | |
5c6c3a6c | 465 | goto out_close_efd; |
c90caf25 | 466 | } |
5c6c3a6c | 467 | |
1b3abdcc ML |
468 | ioq_init(&s->io_q); |
469 | ||
5c6c3a6c CH |
470 | return s; |
471 | ||
472 | out_close_efd: | |
c90caf25 | 473 | event_notifier_cleanup(&s->e); |
5c6c3a6c | 474 | out_free_state: |
7267c094 | 475 | g_free(s); |
5c6c3a6c CH |
476 | return NULL; |
477 | } | |
abd269b7 | 478 | |
dd7f7ed1 | 479 | void laio_cleanup(LinuxAioState *s) |
abd269b7 | 480 | { |
abd269b7 | 481 | event_notifier_cleanup(&s->e); |
a1abf40d GA |
482 | |
483 | if (io_destroy(s->ctx) != 0) { | |
484 | fprintf(stderr, "%s: destroy AIO context %p failed\n", | |
485 | __func__, &s->ctx); | |
486 | } | |
abd269b7 SH |
487 | g_free(s); |
488 | } |