]>
Commit | Line | Data |
---|---|---|
5c6c3a6c CH |
1 | /* |
2 | * Linux native AIO support. | |
3 | * | |
4 | * Copyright (C) 2009 IBM, Corp. | |
5 | * Copyright (C) 2009 Red Hat, Inc. | |
6 | * | |
7 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
8 | * See the COPYING file in the top-level directory. | |
9 | */ | |
10 | #include "qemu-common.h" | |
737e150e | 11 | #include "block/aio.h" |
1de7afc9 | 12 | #include "qemu/queue.h" |
9f8540ec | 13 | #include "block/raw-aio.h" |
1de7afc9 | 14 | #include "qemu/event_notifier.h" |
5c6c3a6c | 15 | |
5c6c3a6c CH |
16 | #include <libaio.h> |
17 | ||
18 | /* | |
19 | * Queue size (per-device). | |
20 | * | |
21 | * XXX: eventually we need to communicate this to the guest and/or make it | |
22 | * tunable by the guest. If we get more outstanding requests at a time | |
23 | * than this we will get EAGAIN from io_submit which is communicated to | |
24 | * the guest as an I/O error. | |
25 | */ | |
#define MAX_EVENTS 128

/*
 * Size of the iocb array built for a single io_submit() call in
 * ioq_submit(), and the pending-queue length at which laio_submit() submits
 * even while the queue is plugged.
 */
#define MAX_QUEUED_IO 128
29 | ||
/* Per-request state for one Linux AIO read or write. */
struct qemu_laiocb {
    /* Kept first: laio_cancel() casts a BlockAIOCB pointer to this type. */
    BlockAIOCB common;
    /* State object this request was submitted through. */
    struct qemu_laio_state *ctx;
    /* Control block prepared in laio_submit() and handed to io_submit(). */
    struct iocb iocb;
    /* -EINPROGRESS from laio_submit() until completion or cancellation. */
    ssize_t ret;
    /* Requested transfer length in bytes. */
    size_t nbytes;
    /* Caller's I/O vector; zero-padded on short reads. */
    QEMUIOVector *qiov;
    /* True when the request type is QEMU_AIO_READ. */
    bool is_read;
    /* Link in LaioQueue.pending. */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};
40 | ||
/* Requests that have been prepared but not yet accepted by io_submit(). */
typedef struct {
    /* Nesting counter: raised by laio_io_plug(), lowered by laio_io_unplug(). */
    int plugged;
    /* Number of requests currently on the pending list. */
    unsigned int n;
    /* Set by ioq_submit() when requests are still queued after it returns. */
    bool blocked;
    /* FIFO of requests waiting for submission. */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;
47 | ||
5c6c3a6c CH |
/* State shared by all requests submitted through one Linux AIO context. */
struct qemu_laio_state {
    /* Kernel AIO context created by io_setup() in laio_init(). */
    io_context_t ctx;
    /* Notifier whose fd is attached to every iocb via io_set_eventfd(). */
    EventNotifier e;

    /* io queue for submit at batch */
    LaioQueue io_q;

    /* I/O completion processing */
    QEMUBH *completion_bh;
    /* Events returned by the most recent io_getevents() call. */
    struct io_event events[MAX_EVENTS];
    /* Index of the next entry in events[] to process. */
    int event_idx;
    /* Number of valid entries in events[]. */
    int event_max;
};
61 | ||
28b24087 PB |
/* Declared early because qemu_laio_completion_bh() calls it before its definition. */
static int ioq_submit(struct qemu_laio_state *s);
63 | ||
5c6c3a6c CH |
64 | static inline ssize_t io_event_ret(struct io_event *ev) |
65 | { | |
66 | return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); | |
67 | } | |
68 | ||
db0ffc24 KW |
69 | /* |
70 | * Completes an AIO request (calls the callback and frees the ACB). | |
db0ffc24 KW |
71 | */ |
72 | static void qemu_laio_process_completion(struct qemu_laio_state *s, | |
73 | struct qemu_laiocb *laiocb) | |
74 | { | |
75 | int ret; | |
76 | ||
db0ffc24 KW |
77 | ret = laiocb->ret; |
78 | if (ret != -ECANCELED) { | |
b161e2e4 | 79 | if (ret == laiocb->nbytes) { |
db0ffc24 | 80 | ret = 0; |
b161e2e4 KW |
81 | } else if (ret >= 0) { |
82 | /* Short reads mean EOF, pad with zeros. */ | |
83 | if (laiocb->is_read) { | |
3d9b4925 MT |
84 | qemu_iovec_memset(laiocb->qiov, ret, 0, |
85 | laiocb->qiov->size - ret); | |
b161e2e4 KW |
86 | } else { |
87 | ret = -EINVAL; | |
88 | } | |
89 | } | |
db0ffc24 | 90 | } |
771b64da | 91 | laiocb->common.cb(laiocb->common.opaque, ret); |
db0ffc24 | 92 | |
8007429a | 93 | qemu_aio_unref(laiocb); |
db0ffc24 KW |
94 | } |
95 | ||
2cdff7f6 SH |
96 | /* The completion BH fetches completed I/O requests and invokes their |
97 | * callbacks. | |
98 | * | |
99 | * The function is somewhat tricky because it supports nested event loops, for | |
100 | * example when a request callback invokes aio_poll(). In order to do this, | |
101 | * the completion events array and index are kept in qemu_laio_state. The BH | |
102 | * reschedules itself as long as there are completions pending so it will | |
103 | * either be called again in a nested event loop or will be called after all | |
104 | * events have been completed. When there are no events left to complete, the | |
105 | * BH returns without rescheduling. | |
106 | */ | |
107 | static void qemu_laio_completion_bh(void *opaque) | |
5c6c3a6c | 108 | { |
2cdff7f6 | 109 | struct qemu_laio_state *s = opaque; |
5c6c3a6c | 110 | |
2cdff7f6 SH |
111 | /* Fetch more completion events when empty */ |
112 | if (s->event_idx == s->event_max) { | |
5c6c3a6c | 113 | do { |
2cdff7f6 SH |
114 | struct timespec ts = { 0 }; |
115 | s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, | |
116 | s->events, &ts); | |
117 | } while (s->event_max == -EINTR); | |
118 | ||
119 | s->event_idx = 0; | |
120 | if (s->event_max <= 0) { | |
121 | s->event_max = 0; | |
122 | return; /* no more events */ | |
123 | } | |
124 | } | |
5c6c3a6c | 125 | |
2cdff7f6 SH |
126 | /* Reschedule so nested event loops see currently pending completions */ |
127 | qemu_bh_schedule(s->completion_bh); | |
5c6c3a6c | 128 | |
2cdff7f6 SH |
129 | /* Process completion events */ |
130 | while (s->event_idx < s->event_max) { | |
131 | struct iocb *iocb = s->events[s->event_idx].obj; | |
132 | struct qemu_laiocb *laiocb = | |
133 | container_of(iocb, struct qemu_laiocb, iocb); | |
134 | ||
135 | laiocb->ret = io_event_ret(&s->events[s->event_idx]); | |
136 | s->event_idx++; | |
137 | ||
138 | qemu_laio_process_completion(s, laiocb); | |
139 | } | |
28b24087 PB |
140 | |
141 | if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { | |
142 | ioq_submit(s); | |
143 | } | |
2cdff7f6 SH |
144 | } |
145 | ||
146 | static void qemu_laio_completion_cb(EventNotifier *e) | |
147 | { | |
148 | struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); | |
149 | ||
150 | if (event_notifier_test_and_clear(&s->e)) { | |
151 | qemu_bh_schedule(s->completion_bh); | |
5c6c3a6c CH |
152 | } |
153 | } | |
154 | ||
7c84b1b8 | 155 | static void laio_cancel(BlockAIOCB *blockacb) |
5c6c3a6c CH |
156 | { |
157 | struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; | |
158 | struct io_event event; | |
159 | int ret; | |
160 | ||
771b64da | 161 | if (laiocb->ret != -EINPROGRESS) { |
5c6c3a6c | 162 | return; |
771b64da | 163 | } |
5c6c3a6c | 164 | ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event); |
771b64da FZ |
165 | laiocb->ret = -ECANCELED; |
166 | if (ret != 0) { | |
167 | /* iocb is not cancelled, cb will be called by the event loop later */ | |
5c6c3a6c CH |
168 | return; |
169 | } | |
170 | ||
771b64da | 171 | laiocb->common.cb(laiocb->common.opaque, laiocb->ret); |
5c6c3a6c CH |
172 | } |
173 | ||
/* AIOCB descriptor passed to qemu_aio_get() in laio_submit(). */
static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel_async       = laio_cancel,
};
178 | ||
1b3abdcc ML |
179 | static void ioq_init(LaioQueue *io_q) |
180 | { | |
28b24087 | 181 | QSIMPLEQ_INIT(&io_q->pending); |
1b3abdcc | 182 | io_q->plugged = 0; |
8455ce05 | 183 | io_q->n = 0; |
43f2376e | 184 | io_q->blocked = false; |
1b3abdcc ML |
185 | } |
186 | ||
187 | static int ioq_submit(struct qemu_laio_state *s) | |
188 | { | |
43f2376e | 189 | int ret, i, len; |
28b24087 PB |
190 | struct qemu_laiocb *aiocb; |
191 | struct iocb *iocbs[MAX_QUEUED_IO]; | |
1b3abdcc | 192 | |
43f2376e PB |
193 | do { |
194 | len = 0; | |
195 | QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { | |
196 | iocbs[len++] = &aiocb->iocb; | |
197 | if (len == MAX_QUEUED_IO) { | |
198 | break; | |
199 | } | |
28b24087 | 200 | } |
1b3abdcc | 201 | |
43f2376e PB |
202 | ret = io_submit(s->ctx, len, iocbs); |
203 | if (ret == -EAGAIN) { | |
204 | ret = 0; | |
205 | } | |
206 | if (ret < 0) { | |
207 | abort(); | |
208 | } | |
209 | ||
210 | for (i = 0; i < ret; i++) { | |
8455ce05 | 211 | s->io_q.n--; |
43f2376e PB |
212 | QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next); |
213 | } | |
214 | } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); | |
8455ce05 | 215 | s->io_q.blocked = (s->io_q.n > 0); |
1b3abdcc | 216 | |
1b3abdcc ML |
217 | return ret; |
218 | } | |
219 | ||
1b3abdcc ML |
220 | void laio_io_plug(BlockDriverState *bs, void *aio_ctx) |
221 | { | |
222 | struct qemu_laio_state *s = aio_ctx; | |
223 | ||
224 | s->io_q.plugged++; | |
225 | } | |
226 | ||
227 | int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) | |
228 | { | |
229 | struct qemu_laio_state *s = aio_ctx; | |
230 | int ret = 0; | |
231 | ||
232 | assert(s->io_q.plugged > 0 || !unplug); | |
233 | ||
234 | if (unplug && --s->io_q.plugged > 0) { | |
235 | return 0; | |
236 | } | |
237 | ||
43f2376e | 238 | if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
1b3abdcc ML |
239 | ret = ioq_submit(s); |
240 | } | |
241 | ||
242 | return ret; | |
243 | } | |
244 | ||
7c84b1b8 | 245 | BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, |
5c6c3a6c | 246 | int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, |
097310b5 | 247 | BlockCompletionFunc *cb, void *opaque, int type) |
5c6c3a6c CH |
248 | { |
249 | struct qemu_laio_state *s = aio_ctx; | |
250 | struct qemu_laiocb *laiocb; | |
251 | struct iocb *iocbs; | |
252 | off_t offset = sector_num * 512; | |
253 | ||
d7331bed | 254 | laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); |
5c6c3a6c CH |
255 | laiocb->nbytes = nb_sectors * 512; |
256 | laiocb->ctx = s; | |
257 | laiocb->ret = -EINPROGRESS; | |
b161e2e4 KW |
258 | laiocb->is_read = (type == QEMU_AIO_READ); |
259 | laiocb->qiov = qiov; | |
5c6c3a6c CH |
260 | |
261 | iocbs = &laiocb->iocb; | |
262 | ||
263 | switch (type) { | |
264 | case QEMU_AIO_WRITE: | |
265 | io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); | |
266 | break; | |
267 | case QEMU_AIO_READ: | |
268 | io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); | |
269 | break; | |
c30e624d | 270 | /* Currently Linux kernel does not support other operations */ |
5c6c3a6c CH |
271 | default: |
272 | fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", | |
273 | __func__, type); | |
274 | goto out_free_aiocb; | |
275 | } | |
c90caf25 | 276 | io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); |
5c6c3a6c | 277 | |
28b24087 | 278 | QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); |
8455ce05 | 279 | s->io_q.n++; |
43f2376e | 280 | if (!s->io_q.blocked && |
8455ce05 | 281 | (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { |
28b24087 | 282 | ioq_submit(s); |
1b3abdcc | 283 | } |
5c6c3a6c CH |
284 | return &laiocb->common; |
285 | ||
449c184e | 286 | out_free_aiocb: |
8007429a | 287 | qemu_aio_unref(laiocb); |
5c6c3a6c CH |
288 | return NULL; |
289 | } | |
290 | ||
c2f3426c SH |
291 | void laio_detach_aio_context(void *s_, AioContext *old_context) |
292 | { | |
293 | struct qemu_laio_state *s = s_; | |
294 | ||
295 | aio_set_event_notifier(old_context, &s->e, NULL); | |
2cdff7f6 | 296 | qemu_bh_delete(s->completion_bh); |
c2f3426c SH |
297 | } |
298 | ||
299 | void laio_attach_aio_context(void *s_, AioContext *new_context) | |
300 | { | |
301 | struct qemu_laio_state *s = s_; | |
302 | ||
2cdff7f6 | 303 | s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); |
c2f3426c SH |
304 | aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb); |
305 | } | |
306 | ||
5c6c3a6c CH |
307 | void *laio_init(void) |
308 | { | |
309 | struct qemu_laio_state *s; | |
310 | ||
7267c094 | 311 | s = g_malloc0(sizeof(*s)); |
c90caf25 | 312 | if (event_notifier_init(&s->e, false) < 0) { |
5c6c3a6c | 313 | goto out_free_state; |
c90caf25 | 314 | } |
5c6c3a6c | 315 | |
c90caf25 | 316 | if (io_setup(MAX_EVENTS, &s->ctx) != 0) { |
5c6c3a6c | 317 | goto out_close_efd; |
c90caf25 | 318 | } |
5c6c3a6c | 319 | |
1b3abdcc ML |
320 | ioq_init(&s->io_q); |
321 | ||
5c6c3a6c CH |
322 | return s; |
323 | ||
324 | out_close_efd: | |
c90caf25 | 325 | event_notifier_cleanup(&s->e); |
5c6c3a6c | 326 | out_free_state: |
7267c094 | 327 | g_free(s); |
5c6c3a6c CH |
328 | return NULL; |
329 | } | |
abd269b7 SH |
330 | |
331 | void laio_cleanup(void *s_) | |
332 | { | |
333 | struct qemu_laio_state *s = s_; | |
334 | ||
335 | event_notifier_cleanup(&s->e); | |
a1abf40d GA |
336 | |
337 | if (io_destroy(s->ctx) != 0) { | |
338 | fprintf(stderr, "%s: destroy AIO context %p failed\n", | |
339 | __func__, &s->ctx); | |
340 | } | |
abd269b7 SH |
341 | g_free(s); |
342 | } |