]>
Commit | Line | Data |
---|---|---|
5c6c3a6c CH |
1 | /* |
2 | * Linux native AIO support. | |
3 | * | |
4 | * Copyright (C) 2009 IBM, Corp. | |
5 | * Copyright (C) 2009 Red Hat, Inc. | |
6 | * | |
7 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
8 | * See the COPYING file in the top-level directory. | |
9 | */ | |
10 | #include "qemu-common.h" | |
737e150e | 11 | #include "block/aio.h" |
1de7afc9 | 12 | #include "qemu/queue.h" |
9f8540ec | 13 | #include "block/raw-aio.h" |
1de7afc9 | 14 | #include "qemu/event_notifier.h" |
5c6c3a6c | 15 | |
5c6c3a6c CH |
16 | #include <libaio.h> |
17 | ||
18 | /* | |
19 | * Queue size (per-device). | |
20 | * | |
21 | * XXX: eventually we need to communicate this to the guest and/or make it | |
22 | * tunable by the guest. If we get more outstanding requests at a time | |
23 | * than this we will get EAGAIN from io_submit which is communicated to | |
24 | * the guest as an I/O error. | |
25 | */ | |
26 | #define MAX_EVENTS 128 | |
27 | ||
1b3abdcc ML |
28 | #define MAX_QUEUED_IO 128 |
29 | ||
/*
 * Per-request state for one in-flight Linux AIO operation.
 *
 * Allocated via qemu_aio_get(&laio_aiocb_info, ...) and released with
 * qemu_aio_unref() once the completion callback has run.
 */
struct qemu_laiocb {
    BlockAIOCB common;           /* must stay first: laio_cancel() casts
                                  * BlockAIOCB* back to qemu_laiocb* */
    struct qemu_laio_state *ctx; /* owning AIO state (io_context + queue) */
    struct iocb iocb;            /* kernel iocb; container_of() maps an
                                  * io_event back to this struct */
    ssize_t ret;                 /* -EINPROGRESS while pending, -ECANCELED
                                  * on cancel, else bytes/-errno result */
    size_t nbytes;               /* requested transfer length in bytes */
    QEMUIOVector *qiov;          /* guest buffer; used to zero-pad short reads */
    bool is_read;                /* true for QEMU_AIO_READ requests */
    QSIMPLEQ_ENTRY(qemu_laiocb) next; /* link in LaioQueue.pending */
};
40 | ||
/*
 * Software submission queue used to batch iocbs before io_submit().
 */
typedef struct {
    int plugged;       /* nesting depth of laio_io_plug(); >0 defers submission */
    unsigned int idx;  /* number of requests currently sitting in 'pending' */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending; /* requests not yet handed to the kernel */
} LaioQueue;
46 | ||
/*
 * One Linux AIO context: kernel io_context, its completion notifier,
 * the batching queue, and the state needed by the completion BH.
 */
struct qemu_laio_state {
    io_context_t ctx;  /* kernel AIO context from io_setup() */
    EventNotifier e;   /* eventfd the kernel signals on completion
                        * (wired up via io_set_eventfd) */

    /* io queue for submit at batch */
    LaioQueue io_q;

    /* I/O completion processing; events/event_idx/event_max live here
     * (not on the BH's stack) so nested event loops can resume the scan —
     * see qemu_laio_completion_bh() */
    QEMUBH *completion_bh;
    struct io_event events[MAX_EVENTS];
    int event_idx;     /* next unprocessed entry in events[] */
    int event_max;     /* number of valid entries in events[] */
};
60 | ||
28b24087 PB |
61 | static int ioq_submit(struct qemu_laio_state *s); |
62 | ||
5c6c3a6c CH |
63 | static inline ssize_t io_event_ret(struct io_event *ev) |
64 | { | |
65 | return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); | |
66 | } | |
67 | ||
/*
 * Completes an AIO request (calls the callback and frees the ACB).
 *
 * Result translation: a full-length transfer becomes 0 (success); a short
 * non-negative result on a read is treated as EOF and the remainder of the
 * guest buffer is zero-filled, while a short write is reported as -EINVAL.
 * A request already marked -ECANCELED keeps that status untouched.
 */
static void qemu_laio_process_completion(struct qemu_laio_state *s,
                                         struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
                ret = -EINVAL;
            }
        }
    }
    /* Invoke the caller's completion callback with the translated result */
    laiocb->common.cb(laiocb->common.opaque, ret);

    /* Drop the reference taken at qemu_aio_get() time in laio_submit() */
    qemu_aio_unref(laiocb);
}
94 | ||
/* The completion BH fetches completed I/O requests and invokes their
 * callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * the completion events array and index are kept in qemu_laio_state.  The BH
 * reschedules itself as long as there are completions pending so it will
 * either be called again in a nested event loop or will be called after all
 * events have been completed.  When there are no events left to complete, the
 * BH returns without rescheduling.
 */
static void qemu_laio_completion_bh(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    /* Fetch more completion events when empty */
    if (s->event_idx == s->event_max) {
        do {
            /* zero timeout: poll, never block inside the BH */
            struct timespec ts = { 0 };
            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
                                        s->events, &ts);
        } while (s->event_max == -EINTR);

        s->event_idx = 0;
        if (s->event_max <= 0) {
            s->event_max = 0;
            return; /* no more events */
        }
    }

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /* Process completion events */
    while (s->event_idx < s->event_max) {
        struct iocb *iocb = s->events[s->event_idx].obj;
        struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
        /* Advance the index BEFORE running the callback: a nested
         * aio_poll() may re-enter this BH and must not see this event */
        s->event_idx++;

        qemu_laio_process_completion(s, laiocb);
    }

    /* Completions freed kernel queue slots; retry any requests that were
     * left in the software queue (e.g. after io_submit saw EAGAIN) */
    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}
144 | ||
145 | static void qemu_laio_completion_cb(EventNotifier *e) | |
146 | { | |
147 | struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); | |
148 | ||
149 | if (event_notifier_test_and_clear(&s->e)) { | |
150 | qemu_bh_schedule(s->completion_bh); | |
5c6c3a6c CH |
151 | } |
152 | } | |
153 | ||
/*
 * Asynchronous-cancel hook (AIOCBInfo.cancel_async).
 *
 * Marks the request -ECANCELED and asks the kernel to cancel it.  If
 * io_cancel() fails, the request is still in flight and its callback will
 * run from the completion path, which then sees -ECANCELED; only on a
 * successful kernel cancel do we invoke the callback here ourselves.
 */
static void laio_cancel(BlockAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    /* Already completed (or already cancelled) — nothing to do */
    if (laiocb->ret != -EINPROGRESS) {
        return;
    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    laiocb->ret = -ECANCELED;
    if (ret != 0) {
        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
}
172 | ||
/* AIOCB descriptor for Linux AIO requests: tells qemu_aio_get() how much
 * to allocate and wires up asynchronous cancellation. */
static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel_async       = laio_cancel,
};
177 | ||
1b3abdcc ML |
178 | static void ioq_init(LaioQueue *io_q) |
179 | { | |
28b24087 | 180 | QSIMPLEQ_INIT(&io_q->pending); |
1b3abdcc | 181 | io_q->plugged = 0; |
28b24087 | 182 | io_q->idx = 0; |
1b3abdcc ML |
183 | } |
184 | ||
/*
 * Hand queued requests to the kernel with a single io_submit() call.
 *
 * Gathers up to MAX_QUEUED_IO iocbs from the head of the pending queue.
 * io_submit() may accept only a prefix of them; exactly that many are
 * removed from the queue, and the remainder are retried later from the
 * completion BH.  EAGAIN (kernel queue full) is treated as "0 submitted".
 * Any other error aborts — NOTE(review): per the MAX_EVENTS comment above,
 * overflow is expected to surface as EAGAIN, so other failures are
 * considered fatal here.
 *
 * Returns the number of requests actually submitted.
 */
static int ioq_submit(struct qemu_laio_state *s)
{
    int ret, i;
    int len = 0;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_QUEUED_IO];

    /* Collect up to MAX_QUEUED_IO iocb pointers without dequeuing yet */
    QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
        iocbs[len++] = &aiocb->iocb;
        if (len == MAX_QUEUED_IO) {
            break;
        }
    }

    ret = io_submit(s->ctx, len, iocbs);
    if (ret == -EAGAIN) {
        ret = 0;
    }
    if (ret < 0) {
        abort();
    }

    /* Only the first 'ret' requests were accepted — dequeue exactly those */
    for (i = 0; i < ret; i++) {
        s->io_q.idx--;
        QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
    }
    return ret;
}
213 | ||
1b3abdcc ML |
214 | void laio_io_plug(BlockDriverState *bs, void *aio_ctx) |
215 | { | |
216 | struct qemu_laio_state *s = aio_ctx; | |
217 | ||
218 | s->io_q.plugged++; | |
219 | } | |
220 | ||
221 | int laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) | |
222 | { | |
223 | struct qemu_laio_state *s = aio_ctx; | |
224 | int ret = 0; | |
225 | ||
226 | assert(s->io_q.plugged > 0 || !unplug); | |
227 | ||
228 | if (unplug && --s->io_q.plugged > 0) { | |
229 | return 0; | |
230 | } | |
231 | ||
28b24087 | 232 | if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
1b3abdcc ML |
233 | ret = ioq_submit(s); |
234 | } | |
235 | ||
236 | return ret; | |
237 | } | |
238 | ||
/*
 * Issue one asynchronous read or write through Linux AIO.
 *
 * @aio_ctx:    struct qemu_laio_state from laio_init()
 * @fd:         file descriptor to operate on
 * @sector_num: first 512-byte sector
 * @qiov:       scatter/gather list covering nb_sectors * 512 bytes
 * @type:       QEMU_AIO_READ or QEMU_AIO_WRITE (others rejected)
 *
 * The request is appended to the software queue; it is pushed to the
 * kernel immediately when unplugged, or once MAX_QUEUED_IO requests have
 * accumulated while plugged.  Returns the AIOCB, or NULL for an
 * unsupported request type.
 */
BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;  /* completion path overwrites this */
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
	break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
	break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        goto out_free_aiocb;
    }
    /* Have the kernel signal completions through our eventfd */
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.idx++;
    /* Unplugged: submit each request right away (threshold 1).
     * Plugged: batch until MAX_QUEUED_IO requests are queued. */
    if (s->io_q.idx == (s->io_q.plugged ? MAX_QUEUED_IO : 1)) {
        ioq_submit(s);
    }
    return &laiocb->common;

out_free_aiocb:
    qemu_aio_unref(laiocb);
    return NULL;
}
283 | ||
c2f3426c SH |
284 | void laio_detach_aio_context(void *s_, AioContext *old_context) |
285 | { | |
286 | struct qemu_laio_state *s = s_; | |
287 | ||
288 | aio_set_event_notifier(old_context, &s->e, NULL); | |
2cdff7f6 | 289 | qemu_bh_delete(s->completion_bh); |
c2f3426c SH |
290 | } |
291 | ||
292 | void laio_attach_aio_context(void *s_, AioContext *new_context) | |
293 | { | |
294 | struct qemu_laio_state *s = s_; | |
295 | ||
2cdff7f6 | 296 | s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); |
c2f3426c SH |
297 | aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb); |
298 | } | |
299 | ||
/*
 * Allocate and initialize a Linux AIO state: completion eventfd, kernel
 * io_context sized for MAX_EVENTS, and an empty submission queue.
 *
 * Returns the opaque state pointer (pass to laio_attach_aio_context() /
 * laio_submit() / laio_cleanup()), or NULL on failure.
 */
void *laio_init(void)
{
    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
        goto out_free_state;
    }

    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

/* Unwind in reverse order of acquisition */
out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}
/*
 * Tear down a state created by laio_init(): close the eventfd, destroy
 * the kernel io_context (failure is reported but not fatal), and free
 * the state.  Caller must have detached from its AioContext first.
 */
void laio_cleanup(void *s_)
{
    struct qemu_laio_state *s = s_;

    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, &s->ctx);
    }
    g_free(s);
}