]>
Commit | Line | Data |
---|---|---|
e72f66a0 SH |
1 | /* |
2 | * Dedicated thread for virtio-blk I/O processing | |
3 | * | |
4 | * Copyright 2012 IBM, Corp. | |
5 | * Copyright 2012 Red Hat, Inc. and/or its affiliates | |
6 | * | |
7 | * Authors: | |
8 | * Stefan Hajnoczi <stefanha@redhat.com> | |
9 | * | |
10 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
11 | * See the COPYING file in the top-level directory. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "trace.h" | |
16 | #include "qemu/iov.h" | |
17 | #include "event-poll.h" | |
18 | #include "qemu/thread.h" | |
19 | #include "vring.h" | |
20 | #include "ioq.h" | |
21 | #include "migration/migration.h" | |
22 | #include "hw/virtio-blk.h" | |
23 | #include "hw/dataplane/virtio-blk.h" | |
24 | ||
enum {
    /* Maximum number of I/O segments */
    SEG_MAX = 126,

    /* Maximum number of vring descriptors */
    VRING_MAX = SEG_MAX + 2,

    /* Maximum number of requests in the vring: VRING_MAX / 2 with
     * traditional descriptors, VRING_MAX with indirect descriptors.
     */
    REQ_MAX = VRING_MAX,
};
32 | ||
33 | typedef struct { | |
34 | struct iocb iocb; /* Linux AIO control block */ | |
35 | QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */ | |
36 | unsigned int head; /* vring descriptor index */ | |
37 | } VirtIOBlockRequest; | |
38 | ||
39 | struct VirtIOBlockDataPlane { | |
40 | bool started; | |
41 | QEMUBH *start_bh; | |
42 | QemuThread thread; | |
43 | ||
44 | VirtIOBlkConf *blk; | |
45 | int fd; /* image file descriptor */ | |
46 | ||
47 | VirtIODevice *vdev; | |
48 | Vring vring; /* virtqueue vring */ | |
49 | EventNotifier *guest_notifier; /* irq */ | |
50 | ||
51 | EventPoll event_poll; /* event poller */ | |
52 | EventHandler io_handler; /* Linux AIO completion handler */ | |
53 | EventHandler notify_handler; /* virtqueue notify handler */ | |
54 | ||
55 | IOQueue ioqueue; /* Linux AIO queue (should really be per | |
56 | dataplane thread) */ | |
57 | VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the | |
58 | queue */ | |
59 | ||
60 | unsigned int num_reqs; | |
61 | ||
62 | Error *migration_blocker; | |
63 | }; | |
64 | ||
65 | /* Raise an interrupt to signal guest, if necessary */ | |
66 | static void notify_guest(VirtIOBlockDataPlane *s) | |
67 | { | |
68 | if (!vring_should_notify(s->vdev, &s->vring)) { | |
69 | return; | |
70 | } | |
71 | ||
72 | event_notifier_set(s->guest_notifier); | |
73 | } | |
74 | ||
75 | static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque) | |
76 | { | |
77 | VirtIOBlockDataPlane *s = opaque; | |
78 | VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); | |
79 | struct virtio_blk_inhdr hdr; | |
80 | int len; | |
81 | ||
82 | if (likely(ret >= 0)) { | |
83 | hdr.status = VIRTIO_BLK_S_OK; | |
84 | len = ret; | |
85 | } else { | |
86 | hdr.status = VIRTIO_BLK_S_IOERR; | |
87 | len = 0; | |
88 | } | |
89 | ||
90 | trace_virtio_blk_data_plane_complete_request(s, req->head, ret); | |
91 | ||
92 | qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr)); | |
93 | qemu_iovec_destroy(req->inhdr); | |
94 | g_slice_free(QEMUIOVector, req->inhdr); | |
95 | ||
96 | /* According to the virtio specification len should be the number of bytes | |
97 | * written to, but for virtio-blk it seems to be the number of bytes | |
98 | * transferred plus the status bytes. | |
99 | */ | |
100 | vring_push(&s->vring, req->head, len + sizeof(hdr)); | |
101 | ||
102 | s->num_reqs--; | |
103 | } | |
104 | ||
105 | static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head, | |
106 | QEMUIOVector *inhdr, unsigned char status) | |
107 | { | |
108 | struct virtio_blk_inhdr hdr = { | |
109 | .status = status, | |
110 | }; | |
111 | ||
112 | qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr)); | |
113 | qemu_iovec_destroy(inhdr); | |
114 | g_slice_free(QEMUIOVector, inhdr); | |
115 | ||
116 | vring_push(&s->vring, head, sizeof(hdr)); | |
117 | notify_guest(s); | |
118 | } | |
119 | ||
120 | /* Get disk serial number */ | |
121 | static void do_get_id_cmd(VirtIOBlockDataPlane *s, | |
122 | struct iovec *iov, unsigned int iov_cnt, | |
123 | unsigned int head, QEMUIOVector *inhdr) | |
124 | { | |
125 | char id[VIRTIO_BLK_ID_BYTES]; | |
126 | ||
127 | /* Serial number not NUL-terminated when shorter than buffer */ | |
128 | strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id)); | |
129 | iov_from_buf(iov, iov_cnt, 0, id, sizeof(id)); | |
130 | complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); | |
131 | } | |
132 | ||
b5ef1aab SH |
133 | static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read, |
134 | struct iovec *iov, unsigned int iov_cnt, | |
135 | long long offset, unsigned int head, | |
136 | QEMUIOVector *inhdr) | |
137 | { | |
138 | struct iocb *iocb; | |
139 | ||
140 | iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset); | |
141 | ||
142 | /* Fill in virtio block metadata needed for completion */ | |
143 | VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb); | |
144 | req->head = head; | |
145 | req->inhdr = inhdr; | |
146 | return 0; | |
147 | } | |
148 | ||
e72f66a0 SH |
149 | static int process_request(IOQueue *ioq, struct iovec iov[], |
150 | unsigned int out_num, unsigned int in_num, | |
151 | unsigned int head) | |
152 | { | |
153 | VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue); | |
154 | struct iovec *in_iov = &iov[out_num]; | |
155 | struct virtio_blk_outhdr outhdr; | |
156 | QEMUIOVector *inhdr; | |
157 | size_t in_size; | |
e72f66a0 SH |
158 | |
159 | /* Copy in outhdr */ | |
160 | if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr, | |
161 | sizeof(outhdr)) != sizeof(outhdr))) { | |
162 | error_report("virtio-blk request outhdr too short"); | |
163 | return -EFAULT; | |
164 | } | |
165 | iov_discard_front(&iov, &out_num, sizeof(outhdr)); | |
166 | ||
167 | /* Grab inhdr for later */ | |
168 | in_size = iov_size(in_iov, in_num); | |
169 | if (in_size < sizeof(struct virtio_blk_inhdr)) { | |
170 | error_report("virtio_blk request inhdr too short"); | |
171 | return -EFAULT; | |
172 | } | |
173 | inhdr = g_slice_new(QEMUIOVector); | |
174 | qemu_iovec_init(inhdr, 1); | |
175 | qemu_iovec_concat_iov(inhdr, in_iov, in_num, | |
176 | in_size - sizeof(struct virtio_blk_inhdr), | |
177 | sizeof(struct virtio_blk_inhdr)); | |
178 | iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); | |
179 | ||
180 | /* TODO Linux sets the barrier bit even when not advertised! */ | |
181 | outhdr.type &= ~VIRTIO_BLK_T_BARRIER; | |
182 | ||
183 | switch (outhdr.type) { | |
184 | case VIRTIO_BLK_T_IN: | |
b5ef1aab SH |
185 | do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr); |
186 | return 0; | |
e72f66a0 SH |
187 | |
188 | case VIRTIO_BLK_T_OUT: | |
b5ef1aab SH |
189 | do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr); |
190 | return 0; | |
e72f66a0 SH |
191 | |
192 | case VIRTIO_BLK_T_SCSI_CMD: | |
193 | /* TODO support SCSI commands */ | |
194 | complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP); | |
195 | return 0; | |
196 | ||
197 | case VIRTIO_BLK_T_FLUSH: | |
198 | /* TODO fdsync not supported by Linux AIO, do it synchronously here! */ | |
199 | if (qemu_fdatasync(s->fd) < 0) { | |
200 | complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR); | |
201 | } else { | |
202 | complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK); | |
203 | } | |
204 | return 0; | |
205 | ||
206 | case VIRTIO_BLK_T_GET_ID: | |
207 | do_get_id_cmd(s, in_iov, in_num, head, inhdr); | |
208 | return 0; | |
209 | ||
210 | default: | |
211 | error_report("virtio-blk unsupported request type %#x", outhdr.type); | |
212 | qemu_iovec_destroy(inhdr); | |
213 | g_slice_free(QEMUIOVector, inhdr); | |
214 | return -EFAULT; | |
215 | } | |
e72f66a0 SH |
216 | } |
217 | ||
218 | static void handle_notify(EventHandler *handler) | |
219 | { | |
220 | VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane, | |
221 | notify_handler); | |
222 | ||
223 | /* There is one array of iovecs into which all new requests are extracted | |
224 | * from the vring. Requests are read from the vring and the translated | |
225 | * descriptors are written to the iovecs array. The iovecs do not have to | |
226 | * persist across handle_notify() calls because the kernel copies the | |
227 | * iovecs on io_submit(). | |
228 | * | |
229 | * Handling io_submit() EAGAIN may require storing the requests across | |
230 | * handle_notify() calls until the kernel has sufficient resources to | |
231 | * accept more I/O. This is not implemented yet. | |
232 | */ | |
233 | struct iovec iovec[VRING_MAX]; | |
234 | struct iovec *end = &iovec[VRING_MAX]; | |
235 | struct iovec *iov = iovec; | |
236 | ||
237 | /* When a request is read from the vring, the index of the first descriptor | |
238 | * (aka head) is returned so that the completed request can be pushed onto | |
239 | * the vring later. | |
240 | * | |
241 | * The number of hypervisor read-only iovecs is out_num. The number of | |
242 | * hypervisor write-only iovecs is in_num. | |
243 | */ | |
244 | int head; | |
245 | unsigned int out_num = 0, in_num = 0; | |
246 | unsigned int num_queued; | |
247 | ||
248 | for (;;) { | |
249 | /* Disable guest->host notifies to avoid unnecessary vmexits */ | |
250 | vring_disable_notification(s->vdev, &s->vring); | |
251 | ||
252 | for (;;) { | |
253 | head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num); | |
254 | if (head < 0) { | |
255 | break; /* no more requests */ | |
256 | } | |
257 | ||
258 | trace_virtio_blk_data_plane_process_request(s, out_num, in_num, | |
259 | head); | |
260 | ||
261 | if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) { | |
262 | vring_set_broken(&s->vring); | |
263 | break; | |
264 | } | |
265 | iov += out_num + in_num; | |
266 | } | |
267 | ||
268 | if (likely(head == -EAGAIN)) { /* vring emptied */ | |
269 | /* Re-enable guest->host notifies and stop processing the vring. | |
270 | * But if the guest has snuck in more descriptors, keep processing. | |
271 | */ | |
272 | if (vring_enable_notification(s->vdev, &s->vring)) { | |
273 | break; | |
274 | } | |
275 | } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */ | |
276 | /* Since there are no iovecs[] left, stop processing for now. Do | |
277 | * not re-enable guest->host notifies since the I/O completion | |
278 | * handler knows to check for more vring descriptors anyway. | |
279 | */ | |
280 | break; | |
281 | } | |
282 | } | |
283 | ||
284 | num_queued = ioq_num_queued(&s->ioqueue); | |
285 | if (num_queued > 0) { | |
286 | s->num_reqs += num_queued; | |
287 | ||
288 | int rc = ioq_submit(&s->ioqueue); | |
289 | if (unlikely(rc < 0)) { | |
290 | fprintf(stderr, "ioq_submit failed %d\n", rc); | |
291 | exit(1); | |
292 | } | |
293 | } | |
294 | } | |
295 | ||
296 | static void handle_io(EventHandler *handler) | |
297 | { | |
298 | VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane, | |
299 | io_handler); | |
300 | ||
301 | if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) { | |
302 | notify_guest(s); | |
303 | } | |
304 | ||
305 | /* If there were more requests than iovecs, the vring will not be empty yet | |
306 | * so check again. There should now be enough resources to process more | |
307 | * requests. | |
308 | */ | |
309 | if (unlikely(vring_more_avail(&s->vring))) { | |
310 | handle_notify(&s->notify_handler); | |
311 | } | |
312 | } | |
313 | ||
314 | static void *data_plane_thread(void *opaque) | |
315 | { | |
316 | VirtIOBlockDataPlane *s = opaque; | |
317 | ||
318 | do { | |
319 | event_poll(&s->event_poll); | |
320 | } while (s->started || s->num_reqs > 0); | |
321 | return NULL; | |
322 | } | |
323 | ||
324 | static void start_data_plane_bh(void *opaque) | |
325 | { | |
326 | VirtIOBlockDataPlane *s = opaque; | |
327 | ||
328 | qemu_bh_delete(s->start_bh); | |
329 | s->start_bh = NULL; | |
330 | qemu_thread_create(&s->thread, data_plane_thread, | |
331 | s, QEMU_THREAD_JOINABLE); | |
332 | } | |
333 | ||
334 | bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk, | |
335 | VirtIOBlockDataPlane **dataplane) | |
336 | { | |
337 | VirtIOBlockDataPlane *s; | |
338 | int fd; | |
339 | ||
340 | *dataplane = NULL; | |
341 | ||
342 | if (!blk->data_plane) { | |
343 | return true; | |
344 | } | |
345 | ||
346 | if (blk->scsi) { | |
347 | error_report("device is incompatible with x-data-plane, use scsi=off"); | |
348 | return false; | |
349 | } | |
350 | ||
351 | if (blk->config_wce) { | |
352 | error_report("device is incompatible with x-data-plane, " | |
353 | "use config-wce=off"); | |
354 | return false; | |
355 | } | |
356 | ||
357 | fd = raw_get_aio_fd(blk->conf.bs); | |
358 | if (fd < 0) { | |
359 | error_report("drive is incompatible with x-data-plane, " | |
360 | "use format=raw,cache=none,aio=native"); | |
361 | return false; | |
362 | } | |
363 | ||
364 | s = g_new0(VirtIOBlockDataPlane, 1); | |
365 | s->vdev = vdev; | |
366 | s->fd = fd; | |
367 | s->blk = blk; | |
368 | ||
369 | /* Prevent block operations that conflict with data plane thread */ | |
370 | bdrv_set_in_use(blk->conf.bs, 1); | |
371 | ||
372 | error_setg(&s->migration_blocker, | |
373 | "x-data-plane does not support migration"); | |
374 | migrate_add_blocker(s->migration_blocker); | |
375 | ||
376 | *dataplane = s; | |
377 | return true; | |
378 | } | |
379 | ||
380 | void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) | |
381 | { | |
382 | if (!s) { | |
383 | return; | |
384 | } | |
385 | ||
386 | virtio_blk_data_plane_stop(s); | |
387 | migrate_del_blocker(s->migration_blocker); | |
388 | error_free(s->migration_blocker); | |
389 | bdrv_set_in_use(s->blk->conf.bs, 0); | |
390 | g_free(s); | |
391 | } | |
392 | ||
393 | void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s) | |
394 | { | |
395 | VirtQueue *vq; | |
396 | int i; | |
397 | ||
398 | if (s->started) { | |
399 | return; | |
400 | } | |
401 | ||
402 | vq = virtio_get_queue(s->vdev, 0); | |
403 | if (!vring_setup(&s->vring, s->vdev, 0)) { | |
404 | return; | |
405 | } | |
406 | ||
407 | event_poll_init(&s->event_poll); | |
408 | ||
409 | /* Set up guest notifier (irq) */ | |
410 | if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, | |
411 | true) != 0) { | |
412 | fprintf(stderr, "virtio-blk failed to set guest notifier, " | |
413 | "ensure -enable-kvm is set\n"); | |
414 | exit(1); | |
415 | } | |
416 | s->guest_notifier = virtio_queue_get_guest_notifier(vq); | |
417 | ||
418 | /* Set up virtqueue notify */ | |
419 | if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, | |
420 | 0, true) != 0) { | |
421 | fprintf(stderr, "virtio-blk failed to set host notifier\n"); | |
422 | exit(1); | |
423 | } | |
424 | event_poll_add(&s->event_poll, &s->notify_handler, | |
425 | virtio_queue_get_host_notifier(vq), | |
426 | handle_notify); | |
427 | ||
428 | /* Set up ioqueue */ | |
429 | ioq_init(&s->ioqueue, s->fd, REQ_MAX); | |
430 | for (i = 0; i < ARRAY_SIZE(s->requests); i++) { | |
431 | ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb); | |
432 | } | |
433 | event_poll_add(&s->event_poll, &s->io_handler, | |
434 | ioq_get_notifier(&s->ioqueue), handle_io); | |
435 | ||
436 | s->started = true; | |
437 | trace_virtio_blk_data_plane_start(s); | |
438 | ||
439 | /* Kick right away to begin processing requests already in vring */ | |
440 | event_notifier_set(virtio_queue_get_host_notifier(vq)); | |
441 | ||
442 | /* Spawn thread in BH so it inherits iothread cpusets */ | |
443 | s->start_bh = qemu_bh_new(start_data_plane_bh, s); | |
444 | qemu_bh_schedule(s->start_bh); | |
445 | } | |
446 | ||
447 | void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s) | |
448 | { | |
449 | if (!s->started) { | |
450 | return; | |
451 | } | |
452 | s->started = false; | |
453 | trace_virtio_blk_data_plane_stop(s); | |
454 | ||
455 | /* Stop thread or cancel pending thread creation BH */ | |
456 | if (s->start_bh) { | |
457 | qemu_bh_delete(s->start_bh); | |
458 | s->start_bh = NULL; | |
459 | } else { | |
460 | event_poll_notify(&s->event_poll); | |
461 | qemu_thread_join(&s->thread); | |
462 | } | |
463 | ||
464 | ioq_cleanup(&s->ioqueue); | |
465 | ||
466 | s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false); | |
467 | ||
468 | event_poll_cleanup(&s->event_poll); | |
469 | ||
470 | /* Clean up guest notifier (irq) */ | |
471 | s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false); | |
472 | ||
473 | vring_teardown(&s->vring); | |
474 | } |