/*
 * Dedicated thread for virtio-blk I/O processing
 *
 * Copyright 2012 IBM, Corp.
 * Copyright 2012 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "trace.h"
#include "qemu/iov.h"
#include "qemu/thread.h"
#include "qemu/error-report.h"
#include "hw/virtio/dataplane/vring.h"
#include "ioq.h"
#include "block/block.h"
#include "hw/virtio/virtio-blk.h"
#include "virtio-blk.h"
#include "block/aio.h"
#include "hw/virtio/virtio-bus.h"

enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
    REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
                                     * which is VRING_MAX / 2 with traditional
                                     * descriptors and VRING_MAX with indirect
                                     * descriptors */
};

typedef struct {
    struct iocb iocb;               /* Linux AIO control block */
    QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
    VirtQueueElement *elem;         /* saved data from the virtqueue */
    struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
    QEMUIOVector *read_qiov;        /* for read completion w/ bounce buffer */
} VirtIOBlockRequest;

struct VirtIOBlockDataPlane {
    bool started;
    bool starting;
    bool stopping;
    QEMUBH *start_bh;
    QemuThread thread;

    VirtIOBlkConf *blk;
    int fd;                         /* image file descriptor */

    VirtIODevice *vdev;
    Vring vring;                    /* virtqueue vring */
    EventNotifier *guest_notifier;  /* irq */

    /* Note that these EventNotifiers are assigned by value.  This is
     * fine as long as you do not call event_notifier_cleanup on them
     * (because you don't own the file descriptor or handle; you just
     * use it).
     */
    AioContext *ctx;
    EventNotifier io_notifier;      /* Linux AIO completion */
    EventNotifier host_notifier;    /* doorbell */

    IOQueue ioqueue;                /* Linux AIO queue (should really be per
                                       dataplane thread) */
    VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
                                             queue */

    unsigned int num_reqs;
};

/* Raise an interrupt to signal guest, if necessary */
static void notify_guest(VirtIOBlockDataPlane *s)
{
    if (!vring_should_notify(s->vdev, &s->vring)) {
        return;
    }

    event_notifier_set(s->guest_notifier);
}

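/* Linux AIO completion callback: fill in the virtio_blk_inhdr status, copy
 * read data out of the bounce buffer if one was used, and push the element
 * onto the vring's used ring.
 */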
static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    struct virtio_blk_inhdr hdr;
    int len;

    if (likely(ret >= 0)) {
        hdr.status = VIRTIO_BLK_S_OK;
        len = ret;
    } else {
        hdr.status = VIRTIO_BLK_S_IOERR;
        len = 0;
    }

    trace_virtio_blk_data_plane_complete_request(s, req->elem->index, ret);

    if (req->read_qiov) {
        assert(req->bounce_iov);
        qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len);
        qemu_iovec_destroy(req->read_qiov);
        g_slice_free(QEMUIOVector, req->read_qiov);
    }

    if (req->bounce_iov) {
        qemu_vfree(req->bounce_iov->iov_base);
        g_slice_free(struct iovec, req->bounce_iov);
    }

    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(req->inhdr);
    g_slice_free(QEMUIOVector, req->inhdr);

    /* According to the virtio specification len should be the number of bytes
     * written to the device-writable buffers, but for virtio-blk it seems to
     * be the number of bytes transferred plus the status byte.
     */
    vring_push(&s->vring, req->elem, len + sizeof(hdr));
    req->elem = NULL;
    s->num_reqs--;
}

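/* Complete a request without submitting I/O, e.g. for errors and commands
 * that are handled inline.
 */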
static void complete_request_early(VirtIOBlockDataPlane *s, VirtQueueElement *elem,
                                   QEMUIOVector *inhdr, unsigned char status)
{
    struct virtio_blk_inhdr hdr = {
        .status = status,
    };

    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(inhdr);
    g_slice_free(QEMUIOVector, inhdr);

    vring_push(&s->vring, elem, sizeof(hdr));
    notify_guest(s);
}

/* Get disk serial number */
static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                          struct iovec *iov, unsigned int iov_cnt,
                          VirtQueueElement *elem, QEMUIOVector *inhdr)
{
    char id[VIRTIO_BLK_ID_BYTES];

    /* Serial number not NUL-terminated when longer than buffer */
    strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
    iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
    complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
}

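/* Queue a read or write request, redirecting it through an aligned bounce
 * buffer when the guest buffers do not meet the alignment requirements of
 * the image file.
 */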
static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
                       struct iovec *iov, unsigned iov_cnt,
                       long long offset, VirtQueueElement *elem,
                       QEMUIOVector *inhdr)
{
    struct iocb *iocb;
    QEMUIOVector qiov;
    struct iovec *bounce_iov = NULL;
    QEMUIOVector *read_qiov = NULL;

    qemu_iovec_init_external(&qiov, iov, iov_cnt);
    if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) {
        void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size);

        if (read) {
            /* Need to copy back from bounce buffer on completion */
            read_qiov = g_slice_new(QEMUIOVector);
            qemu_iovec_init(read_qiov, iov_cnt);
            qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size);
        } else {
            qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size);
        }

        /* Redirect I/O to aligned bounce buffer */
        bounce_iov = g_slice_new(struct iovec);
        bounce_iov->iov_base = bounce_buffer;
        bounce_iov->iov_len = qiov.size;
        iov = bounce_iov;
        iov_cnt = 1;
    }

    iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset);

    /* Fill in virtio block metadata needed for completion */
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    req->elem = elem;
    req->inhdr = inhdr;
    req->bounce_iov = bounce_iov;
    req->read_qiov = read_qiov;
    return 0;
}

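/* Parse one virtqueue element and dispatch on the virtio-blk request type.
 * Returns 0 on success or -EFAULT if the request is malformed.
 */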
static int process_request(IOQueue *ioq, VirtQueueElement *elem)
{
    VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
    struct iovec *iov = elem->out_sg;
    struct iovec *in_iov = elem->in_sg;
    unsigned out_num = elem->out_num;
    unsigned in_num = elem->in_num;
    struct virtio_blk_outhdr outhdr;
    QEMUIOVector *inhdr;
    size_t in_size;

    /* Copy in outhdr */
    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                            sizeof(outhdr)) != sizeof(outhdr))) {
        error_report("virtio-blk request outhdr too short");
        return -EFAULT;
    }
    iov_discard_front(&iov, &out_num, sizeof(outhdr));

    /* Grab inhdr for later */
    in_size = iov_size(in_iov, in_num);
    if (in_size < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio_blk request inhdr too short");
        return -EFAULT;
    }
    inhdr = g_slice_new(QEMUIOVector);
    qemu_iovec_init(inhdr, 1);
    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
                          in_size - sizeof(struct virtio_blk_inhdr),
                          sizeof(struct virtio_blk_inhdr));
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    /* TODO Linux sets the barrier bit even when not advertised! */
    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;

    switch (outhdr.type) {
    case VIRTIO_BLK_T_IN:
        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, elem, inhdr);
        return 0;

    case VIRTIO_BLK_T_OUT:
        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, elem, inhdr);
        return 0;

    case VIRTIO_BLK_T_SCSI_CMD:
        /* TODO support SCSI commands */
        complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_UNSUPP);
        return 0;

    case VIRTIO_BLK_T_FLUSH:
        /* TODO fdatasync is not supported by Linux AIO, do it synchronously
         * here!
         */
        if (qemu_fdatasync(s->fd) < 0) {
            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_IOERR);
        } else {
            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
        }
        return 0;

    case VIRTIO_BLK_T_GET_ID:
        do_get_id_cmd(s, in_iov, in_num, elem, inhdr);
        return 0;

    default:
        error_report("virtio-blk unsupported request type %#x", outhdr.type);
        qemu_iovec_destroy(inhdr);
        g_slice_free(QEMUIOVector, inhdr);
        return -EFAULT;
    }
}

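/* Virtqueue doorbell handler: drain the vring, queue up I/O requests, and
 * submit them to the kernel in one batch.
 */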
static void handle_notify(EventNotifier *e)
{
    VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
                                           host_notifier);

    VirtQueueElement *elem;
    int ret;
    unsigned int num_queued;

    event_notifier_test_and_clear(&s->host_notifier);
    for (;;) {
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
            ret = vring_pop(s->vdev, &s->vring, &elem);
            if (ret < 0) {
                assert(elem == NULL);
                break; /* no more requests */
            }

            trace_virtio_blk_data_plane_process_request(s, elem->out_num,
                                                        elem->in_num,
                                                        elem->index);

            if (process_request(&s->ioqueue, elem) < 0) {
                vring_set_broken(&s->vring);
                vring_free_element(elem);
                ret = -EFAULT;
                break;
            }
        }

        if (likely(ret == -EAGAIN)) { /* vring emptied */
            /* Re-enable guest->host notifies and stop processing the vring.
             * But if the guest has snuck in more descriptors, keep processing.
             */
            if (vring_enable_notification(s->vdev, &s->vring)) {
                break;
            }
        } else { /* ret == -ENOBUFS or fatal error, iovecs[] is depleted */
            /* Since there are no iovecs[] left, stop processing for now.  Do
             * not re-enable guest->host notifies since the I/O completion
             * handler knows to check for more vring descriptors anyway.
             */
            break;
        }
    }

    num_queued = ioq_num_queued(&s->ioqueue);
    if (num_queued > 0) {
        s->num_reqs += num_queued;

        int rc = ioq_submit(&s->ioqueue);
        if (unlikely(rc < 0)) {
            fprintf(stderr, "ioq_submit failed %d\n", rc);
            exit(1);
        }
    }
}

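/* Linux AIO completion handler: reap finished requests and notify the guest
 * if any completed.
 */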
static void handle_io(EventNotifier *e)
{
    VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
                                           io_notifier);

    event_notifier_test_and_clear(&s->io_notifier);
    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
        notify_guest(s);
    }

    /* If there were more requests than iovecs, the vring will not be empty
     * yet, so check again.  There should now be enough resources to process
     * more requests.
     */
    if (unlikely(vring_more_avail(&s->vring))) {
        handle_notify(&s->host_notifier);
    }
}

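/* Dataplane thread: run the AioContext event loop until the device is being
 * stopped and all in-flight requests have completed.
 */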
static void *data_plane_thread(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    while (!s->stopping || s->num_reqs > 0) {
        aio_poll(s->ctx, true);
    }
    return NULL;
}

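/* Bottom half that spawns the dataplane thread so it inherits the iothread
 * cpusets (see virtio_blk_data_plane_start()).
 */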
static void start_data_plane_bh(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    qemu_bh_delete(s->start_bh);
    s->start_bh = NULL;
    qemu_thread_create(&s->thread, "data_plane", data_plane_thread,
                       s, QEMU_THREAD_JOINABLE);
}

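/* Allocate a dataplane instance if x-data-plane is enabled and the device
 * configuration is compatible.  On success *dataplane is set; otherwise it
 * is left NULL and errp may be set.
 */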
void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
                                  VirtIOBlockDataPlane **dataplane,
                                  Error **errp)
{
    VirtIOBlockDataPlane *s;
    int fd;

    *dataplane = NULL;

    if (!blk->data_plane) {
        return;
    }

    if (blk->scsi) {
        error_setg(errp,
                   "device is incompatible with x-data-plane, use scsi=off");
        return;
    }

    if (blk->config_wce) {
        error_setg(errp, "device is incompatible with x-data-plane, "
                   "use config-wce=off");
        return;
    }

    /* If dataplane is (re-)enabled while the guest is running there could be
     * block jobs that can conflict.
     */
    if (bdrv_in_use(blk->conf.bs)) {
        error_setg(errp,
                   "cannot start dataplane thread while device is in use");
        return;
    }

    fd = raw_get_aio_fd(blk->conf.bs);
    if (fd < 0) {
        error_setg(errp, "drive is incompatible with x-data-plane, "
                   "use format=raw,cache=none,aio=native");
        return;
    }

    s = g_new0(VirtIOBlockDataPlane, 1);
    s->vdev = vdev;
    s->fd = fd;
    s->blk = blk;

    /* Prevent block operations that conflict with data plane thread */
    bdrv_set_in_use(blk->conf.bs, 1);

    *dataplane = s;
}

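/* Stop the dataplane if it is running, release the in-use flag on the block
 * device, and free the instance.
 */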
void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
    if (!s) {
        return;
    }

    virtio_blk_data_plane_stop(s);
    bdrv_set_in_use(s->blk->conf.bs, 0);
    g_free(s);
}

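/* Set up the vring, guest/host notifiers, and Linux AIO queue, then schedule
 * a bottom half to spawn the dataplane thread.
 */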
void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s->vdev)));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
    VirtQueue *vq;
    int i;

    if (s->started) {
        return;
    }

    if (s->starting) {
        return;
    }

    s->starting = true;

    vq = virtio_get_queue(s->vdev, 0);
    if (!vring_setup(&s->vring, s->vdev, 0)) {
        s->starting = false;
        return;
    }

    s->ctx = aio_context_new();

    /* Set up guest notifier (irq) */
    if (k->set_guest_notifiers(qbus->parent, 1, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set guest notifier, "
                "ensure -enable-kvm is set\n");
        exit(1);
    }
    s->guest_notifier = virtio_queue_get_guest_notifier(vq);

    /* Set up virtqueue notify */
    if (k->set_host_notifier(qbus->parent, 0, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set host notifier\n");
        exit(1);
    }
    s->host_notifier = *virtio_queue_get_host_notifier(vq);
    aio_set_event_notifier(s->ctx, &s->host_notifier, handle_notify);

    /* Set up ioqueue */
    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
    }
    s->io_notifier = *ioq_get_notifier(&s->ioqueue);
    aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io);

    s->starting = false;
    s->started = true;
    trace_virtio_blk_data_plane_start(s);

    /* Kick right away to begin processing requests already in vring */
    event_notifier_set(virtio_queue_get_host_notifier(vq));

    /* Spawn thread in BH so it inherits iothread cpusets */
    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
    qemu_bh_schedule(s->start_bh);
}

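/* Tear down the dataplane: stop the thread (or cancel the pending thread
 * creation BH), remove the notifiers, and release the AioContext and vring.
 */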
void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s->vdev)));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);

    if (!s->started || s->stopping) {
        return;
    }
    s->stopping = true;
    trace_virtio_blk_data_plane_stop(s);

    /* Stop thread or cancel pending thread creation BH */
    if (s->start_bh) {
        qemu_bh_delete(s->start_bh);
        s->start_bh = NULL;
    } else {
        aio_notify(s->ctx);
        qemu_thread_join(&s->thread);
    }

    aio_set_event_notifier(s->ctx, &s->io_notifier, NULL);
    ioq_cleanup(&s->ioqueue);

    aio_set_event_notifier(s->ctx, &s->host_notifier, NULL);
    k->set_host_notifier(qbus->parent, 0, false);

    aio_context_unref(s->ctx);

    /* Clean up guest notifier (irq) */
    k->set_guest_notifiers(qbus->parent, 1, false);

    vring_teardown(&s->vring, s->vdev, 0);
    s->started = false;
    s->stopping = false;
}