]> git.proxmox.com Git - mirror_qemu.git/blob - hw/virtio-blk.c
Merge remote-tracking branch 'kwolf/for-anthony' into staging
[mirror_qemu.git] / hw / virtio-blk.c
1 /*
2 * Virtio Block Device
3 *
4 * Copyright IBM, Corp. 2007
5 *
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 */
13
14 #include "qemu-common.h"
15 #include "qemu-error.h"
16 #include "trace.h"
17 #include "blockdev.h"
18 #include "virtio-blk.h"
19 #include "scsi-defs.h"
20 #ifdef __linux__
21 # include <scsi/sg.h>
22 #endif
23
/* Per-device state of one virtio block device. */
typedef struct VirtIOBlock
{
    /* Generic virtio device state.  Must remain the first member:
     * to_virtio_blk() casts a VirtIODevice pointer directly to VirtIOBlock. */
    VirtIODevice vdev;
    /* Block backend all guest I/O is issued against (set from blk->conf.bs). */
    BlockDriverState *bs;
    /* The request virtqueue created in virtio_blk_init(). */
    VirtQueue *vq;
    /* Head of a singly linked list (via VirtIOBlockReq.next) of requests that
     * were parked when an I/O error stopped the VM; stored as void *. */
    void *rq;
    /* Bottom half that replays the parked requests; NULL when none is
     * scheduled. */
    QEMUBH *bh;
    /* Points at blk->conf. */
    BlockConf *conf;
    /* Device configuration handed to virtio_blk_init(). */
    VirtIOBlkConf *blk;
    /* (logical_block_size / BDRV_SECTOR_SIZE) - 1; a request sector with any
     * of these bits set is rejected as misaligned. */
    unsigned short sector_mask;
    /* Owning qdev device, needed to unregister the savevm handlers. */
    DeviceState *qdev;
} VirtIOBlock;
36
37 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
38 {
39 return (VirtIOBlock *)vdev;
40 }
41
/* One in-flight guest request popped from the virtqueue. */
typedef struct VirtIOBlockReq
{
    /* Device this request belongs to. */
    VirtIOBlock *dev;
    /* The virtqueue element (guest scatter/gather lists) for the request;
     * this is the only part written to / read from the migration stream. */
    VirtQueueElement elem;
    /* Status header, located in the last input segment. */
    struct virtio_blk_inhdr *in;
    /* Request header, located in the first output segment. */
    struct virtio_blk_outhdr *out;
    /* SCSI result header, located in the second-to-last input segment
     * (only set for SCSI command requests). */
    struct virtio_scsi_inhdr *scsi;
    /* Data payload for read/write requests. */
    QEMUIOVector qiov;
    /* Link used while the request is parked on VirtIOBlock.rq. */
    struct VirtIOBlockReq *next;
    /* Block accounting cookie for this request. */
    BlockAcctCookie acct;
} VirtIOBlockReq;
53
54 static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
55 {
56 VirtIOBlock *s = req->dev;
57
58 trace_virtio_blk_req_complete(req, status);
59
60 stb_p(&req->in->status, status);
61 virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
62 virtio_notify(&s->vdev, s->vq);
63 }
64
65 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
66 int is_read)
67 {
68 BlockErrorAction action = bdrv_get_on_error(req->dev->bs, is_read);
69 VirtIOBlock *s = req->dev;
70
71 if (action == BLOCK_ERR_IGNORE) {
72 bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_IGNORE, is_read);
73 return 0;
74 }
75
76 if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
77 || action == BLOCK_ERR_STOP_ANY) {
78 req->next = s->rq;
79 s->rq = req;
80 bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_STOP, is_read);
81 vm_stop(RUN_STATE_IO_ERROR);
82 bdrv_iostatus_set_err(s->bs, error);
83 } else {
84 virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
85 bdrv_acct_done(s->bs, &req->acct);
86 g_free(req);
87 bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_REPORT, is_read);
88 }
89
90 return 1;
91 }
92
93 static void virtio_blk_rw_complete(void *opaque, int ret)
94 {
95 VirtIOBlockReq *req = opaque;
96
97 trace_virtio_blk_rw_complete(req, ret);
98
99 if (ret) {
100 int is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT);
101 if (virtio_blk_handle_rw_error(req, -ret, is_read))
102 return;
103 }
104
105 virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
106 bdrv_acct_done(req->dev->bs, &req->acct);
107 g_free(req);
108 }
109
110 static void virtio_blk_flush_complete(void *opaque, int ret)
111 {
112 VirtIOBlockReq *req = opaque;
113
114 if (ret) {
115 if (virtio_blk_handle_rw_error(req, -ret, 0)) {
116 return;
117 }
118 }
119
120 virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
121 bdrv_acct_done(req->dev->bs, &req->acct);
122 g_free(req);
123 }
124
125 static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
126 {
127 VirtIOBlockReq *req = g_malloc(sizeof(*req));
128 req->dev = s;
129 req->qiov.size = 0;
130 req->next = NULL;
131 return req;
132 }
133
134 static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
135 {
136 VirtIOBlockReq *req = virtio_blk_alloc_request(s);
137
138 if (req != NULL) {
139 if (!virtqueue_pop(s->vq, &req->elem)) {
140 g_free(req);
141 return NULL;
142 }
143 }
144
145 return req;
146 }
147
148 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
149 {
150 #ifdef __linux__
151 int ret;
152 int i;
153 #endif
154 int status = VIRTIO_BLK_S_OK;
155
156 /*
157 * We require at least one output segment each for the virtio_blk_outhdr
158 * and the SCSI command block.
159 *
160 * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
161 * and the sense buffer pointer in the input segments.
162 */
163 if (req->elem.out_num < 2 || req->elem.in_num < 3) {
164 virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
165 g_free(req);
166 return;
167 }
168
169 /*
170 * The scsi inhdr is placed in the second-to-last input segment, just
171 * before the regular inhdr.
172 */
173 req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;
174
175 if (!req->dev->blk->scsi) {
176 status = VIRTIO_BLK_S_UNSUPP;
177 goto fail;
178 }
179
180 /*
181 * No support for bidirection commands yet.
182 */
183 if (req->elem.out_num > 2 && req->elem.in_num > 3) {
184 status = VIRTIO_BLK_S_UNSUPP;
185 goto fail;
186 }
187
188 #ifdef __linux__
189 struct sg_io_hdr hdr;
190 memset(&hdr, 0, sizeof(struct sg_io_hdr));
191 hdr.interface_id = 'S';
192 hdr.cmd_len = req->elem.out_sg[1].iov_len;
193 hdr.cmdp = req->elem.out_sg[1].iov_base;
194 hdr.dxfer_len = 0;
195
196 if (req->elem.out_num > 2) {
197 /*
198 * If there are more than the minimally required 2 output segments
199 * there is write payload starting from the third iovec.
200 */
201 hdr.dxfer_direction = SG_DXFER_TO_DEV;
202 hdr.iovec_count = req->elem.out_num - 2;
203
204 for (i = 0; i < hdr.iovec_count; i++)
205 hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len;
206
207 hdr.dxferp = req->elem.out_sg + 2;
208
209 } else if (req->elem.in_num > 3) {
210 /*
211 * If we have more than 3 input segments the guest wants to actually
212 * read data.
213 */
214 hdr.dxfer_direction = SG_DXFER_FROM_DEV;
215 hdr.iovec_count = req->elem.in_num - 3;
216 for (i = 0; i < hdr.iovec_count; i++)
217 hdr.dxfer_len += req->elem.in_sg[i].iov_len;
218
219 hdr.dxferp = req->elem.in_sg;
220 } else {
221 /*
222 * Some SCSI commands don't actually transfer any data.
223 */
224 hdr.dxfer_direction = SG_DXFER_NONE;
225 }
226
227 hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base;
228 hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len;
229
230 ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
231 if (ret) {
232 status = VIRTIO_BLK_S_UNSUPP;
233 goto fail;
234 }
235
236 /*
237 * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
238 * clear the masked_status field [hence status gets cleared too, see
239 * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
240 * status has occurred. However they do set DRIVER_SENSE in driver_status
241 * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
242 */
243 if (hdr.status == 0 && hdr.sb_len_wr > 0) {
244 hdr.status = CHECK_CONDITION;
245 }
246
247 stl_p(&req->scsi->errors,
248 hdr.status | (hdr.msg_status << 8) |
249 (hdr.host_status << 16) | (hdr.driver_status << 24));
250 stl_p(&req->scsi->residual, hdr.resid);
251 stl_p(&req->scsi->sense_len, hdr.sb_len_wr);
252 stl_p(&req->scsi->data_len, hdr.dxfer_len);
253
254 virtio_blk_req_complete(req, status);
255 g_free(req);
256 #else
257 abort();
258 #endif
259
260 fail:
261 /* Just put anything nonzero so that the ioctl fails in the guest. */
262 stl_p(&req->scsi->errors, 255);
263 virtio_blk_req_complete(req, status);
264 g_free(req);
265 }
266
/*
 * Batch of write requests collected while draining the virtqueue and then
 * submitted together by virtio_submit_multiwrite().
 */
typedef struct MultiReqBuffer {
    /* Queued writes; only the first num_writes entries are valid. */
    BlockRequest blkreq[32];
    /* Number of entries currently queued in blkreq. */
    unsigned int num_writes;
} MultiReqBuffer;
271
272 static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb)
273 {
274 int i, ret;
275
276 if (!mrb->num_writes) {
277 return;
278 }
279
280 ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes);
281 if (ret != 0) {
282 for (i = 0; i < mrb->num_writes; i++) {
283 if (mrb->blkreq[i].error) {
284 virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
285 }
286 }
287 }
288
289 mrb->num_writes = 0;
290 }
291
/*
 * Handle a flush request: start flush accounting, push out any writes still
 * batched in @mrb, then issue an asynchronous flush that completes through
 * virtio_blk_flush_complete().
 */
static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    virtio_submit_multiwrite(req->dev->bs, mrb);
    bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
}
302
303 static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
304 {
305 BlockRequest *blkreq;
306 uint64_t sector;
307
308 sector = ldq_p(&req->out->sector);
309
310 bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
311
312 trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
313
314 if (sector & req->dev->sector_mask) {
315 virtio_blk_rw_complete(req, -EIO);
316 return;
317 }
318 if (req->qiov.size % req->dev->conf->logical_block_size) {
319 virtio_blk_rw_complete(req, -EIO);
320 return;
321 }
322
323 if (mrb->num_writes == 32) {
324 virtio_submit_multiwrite(req->dev->bs, mrb);
325 }
326
327 blkreq = &mrb->blkreq[mrb->num_writes];
328 blkreq->sector = sector;
329 blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
330 blkreq->qiov = &req->qiov;
331 blkreq->cb = virtio_blk_rw_complete;
332 blkreq->opaque = req;
333 blkreq->error = 0;
334
335 mrb->num_writes++;
336 }
337
338 static void virtio_blk_handle_read(VirtIOBlockReq *req)
339 {
340 uint64_t sector;
341
342 sector = ldq_p(&req->out->sector);
343
344 bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
345
346 trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
347
348 if (sector & req->dev->sector_mask) {
349 virtio_blk_rw_complete(req, -EIO);
350 return;
351 }
352 if (req->qiov.size % req->dev->conf->logical_block_size) {
353 virtio_blk_rw_complete(req, -EIO);
354 return;
355 }
356 bdrv_aio_readv(req->dev->bs, sector, &req->qiov,
357 req->qiov.size / BDRV_SECTOR_SIZE,
358 virtio_blk_rw_complete, req);
359 }
360
361 static void virtio_blk_handle_request(VirtIOBlockReq *req,
362 MultiReqBuffer *mrb)
363 {
364 uint32_t type;
365
366 if (req->elem.out_num < 1 || req->elem.in_num < 1) {
367 error_report("virtio-blk missing headers");
368 exit(1);
369 }
370
371 if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
372 req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
373 error_report("virtio-blk header not in correct element");
374 exit(1);
375 }
376
377 req->out = (void *)req->elem.out_sg[0].iov_base;
378 req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;
379
380 type = ldl_p(&req->out->type);
381
382 if (type & VIRTIO_BLK_T_FLUSH) {
383 virtio_blk_handle_flush(req, mrb);
384 } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
385 virtio_blk_handle_scsi(req);
386 } else if (type & VIRTIO_BLK_T_GET_ID) {
387 VirtIOBlock *s = req->dev;
388
389 /*
390 * NB: per existing s/n string convention the string is
391 * terminated by '\0' only when shorter than buffer.
392 */
393 strncpy(req->elem.in_sg[0].iov_base,
394 s->blk->serial ? s->blk->serial : "",
395 MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
396 virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
397 g_free(req);
398 } else if (type & VIRTIO_BLK_T_OUT) {
399 qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
400 req->elem.out_num - 1);
401 virtio_blk_handle_write(req, mrb);
402 } else {
403 qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
404 req->elem.in_num - 1);
405 virtio_blk_handle_read(req);
406 }
407 }
408
409 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
410 {
411 VirtIOBlock *s = to_virtio_blk(vdev);
412 VirtIOBlockReq *req;
413 MultiReqBuffer mrb = {
414 .num_writes = 0,
415 };
416
417 while ((req = virtio_blk_get_request(s))) {
418 virtio_blk_handle_request(req, &mrb);
419 }
420
421 virtio_submit_multiwrite(s->bs, &mrb);
422
423 /*
424 * FIXME: Want to check for completions before returning to guest mode,
425 * so cached reads and writes are reported as quickly as possible. But
426 * that should be done in the generic block layer.
427 */
428 }
429
430 static void virtio_blk_dma_restart_bh(void *opaque)
431 {
432 VirtIOBlock *s = opaque;
433 VirtIOBlockReq *req = s->rq;
434 MultiReqBuffer mrb = {
435 .num_writes = 0,
436 };
437
438 qemu_bh_delete(s->bh);
439 s->bh = NULL;
440
441 s->rq = NULL;
442
443 while (req) {
444 virtio_blk_handle_request(req, &mrb);
445 req = req->next;
446 }
447
448 virtio_submit_multiwrite(s->bs, &mrb);
449 }
450
451 static void virtio_blk_dma_restart_cb(void *opaque, int running,
452 RunState state)
453 {
454 VirtIOBlock *s = opaque;
455
456 if (!running)
457 return;
458
459 if (!s->bh) {
460 s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
461 qemu_bh_schedule(s->bh);
462 }
463 }
464
/* Device reset callback: waits for in-flight block I/O via bdrv_drain_all(). */
static void virtio_blk_reset(VirtIODevice *vdev)
{
    /*
     * This should cancel pending requests, but can't do nicely until there
     * are per-device request lists.
     */
    bdrv_drain_all();
}
473
/* Coalesce internal state and copy it to PCI I/O region 0.
 */
476 static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
477 {
478 VirtIOBlock *s = to_virtio_blk(vdev);
479 struct virtio_blk_config blkcfg;
480 uint64_t capacity;
481 int cylinders, heads, secs;
482 int blk_size = s->conf->logical_block_size;
483
484 bdrv_get_geometry(s->bs, &capacity);
485 bdrv_get_geometry_hint(s->bs, &cylinders, &heads, &secs);
486 memset(&blkcfg, 0, sizeof(blkcfg));
487 stq_raw(&blkcfg.capacity, capacity);
488 stl_raw(&blkcfg.seg_max, 128 - 2);
489 stw_raw(&blkcfg.cylinders, cylinders);
490 stl_raw(&blkcfg.blk_size, blk_size);
491 stw_raw(&blkcfg.min_io_size, s->conf->min_io_size / blk_size);
492 stw_raw(&blkcfg.opt_io_size, s->conf->opt_io_size / blk_size);
493 blkcfg.heads = heads;
494 /*
495 * We must ensure that the block device capacity is a multiple of
496 * the logical block size. If that is not the case, lets use
497 * sector_mask to adopt the geometry to have a correct picture.
498 * For those devices where the capacity is ok for the given geometry
499 * we dont touch the sector value of the geometry, since some devices
500 * (like s390 dasd) need a specific value. Here the capacity is already
501 * cyls*heads*secs*blk_size and the sector value is not block size
502 * divided by 512 - instead it is the amount of blk_size blocks
503 * per track (cylinder).
504 */
505 if (bdrv_getlength(s->bs) / heads / secs % blk_size) {
506 blkcfg.sectors = secs & ~s->sector_mask;
507 } else {
508 blkcfg.sectors = secs;
509 }
510 blkcfg.size_max = 0;
511 blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
512 blkcfg.alignment_offset = 0;
513 memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
514 }
515
516 static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
517 {
518 VirtIOBlock *s = to_virtio_blk(vdev);
519
520 features |= (1 << VIRTIO_BLK_F_SEG_MAX);
521 features |= (1 << VIRTIO_BLK_F_GEOMETRY);
522 features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
523 features |= (1 << VIRTIO_BLK_F_BLK_SIZE);
524 features |= (1 << VIRTIO_BLK_F_SCSI);
525
526 if (bdrv_enable_write_cache(s->bs))
527 features |= (1 << VIRTIO_BLK_F_WCACHE);
528
529 if (bdrv_is_read_only(s->bs))
530 features |= 1 << VIRTIO_BLK_F_RO;
531
532 return features;
533 }
534
535 static void virtio_blk_save(QEMUFile *f, void *opaque)
536 {
537 VirtIOBlock *s = opaque;
538 VirtIOBlockReq *req = s->rq;
539
540 virtio_save(&s->vdev, f);
541
542 while (req) {
543 qemu_put_sbyte(f, 1);
544 qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
545 req = req->next;
546 }
547 qemu_put_sbyte(f, 0);
548 }
549
550 static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
551 {
552 VirtIOBlock *s = opaque;
553 int ret;
554
555 if (version_id != 2)
556 return -EINVAL;
557
558 ret = virtio_load(&s->vdev, f);
559 if (ret) {
560 return ret;
561 }
562
563 while (qemu_get_sbyte(f)) {
564 VirtIOBlockReq *req = virtio_blk_alloc_request(s);
565 qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
566 req->next = s->rq;
567 s->rq = req;
568
569 virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr,
570 req->elem.in_num, 1);
571 virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr,
572 req->elem.out_num, 0);
573 }
574
575 return 0;
576 }
577
578 static void virtio_blk_resize(void *opaque)
579 {
580 VirtIOBlock *s = opaque;
581
582 virtio_notify_config(&s->vdev);
583 }
584
/* Block device callbacks installed on the backend by virtio_blk_init(). */
static const BlockDevOps virtio_block_ops = {
    .resize_cb = virtio_blk_resize,
};
588
589 VirtIODevice *virtio_blk_init(DeviceState *dev, VirtIOBlkConf *blk)
590 {
591 VirtIOBlock *s;
592 int cylinders, heads, secs;
593 static int virtio_blk_id;
594 DriveInfo *dinfo;
595
596 if (!blk->conf.bs) {
597 error_report("drive property not set");
598 return NULL;
599 }
600 if (!bdrv_is_inserted(blk->conf.bs)) {
601 error_report("Device needs media, but drive is empty");
602 return NULL;
603 }
604
605 if (!blk->serial) {
606 /* try to fall back to value set with legacy -drive serial=... */
607 dinfo = drive_get_by_blockdev(blk->conf.bs);
608 if (*dinfo->serial) {
609 blk->serial = strdup(dinfo->serial);
610 }
611 }
612
613 s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
614 sizeof(struct virtio_blk_config),
615 sizeof(VirtIOBlock));
616
617 s->vdev.get_config = virtio_blk_update_config;
618 s->vdev.get_features = virtio_blk_get_features;
619 s->vdev.reset = virtio_blk_reset;
620 s->bs = blk->conf.bs;
621 s->conf = &blk->conf;
622 s->blk = blk;
623 s->rq = NULL;
624 s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
625 bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
626
627 s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
628
629 qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
630 s->qdev = dev;
631 register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
632 virtio_blk_save, virtio_blk_load, s);
633 bdrv_set_dev_ops(s->bs, &virtio_block_ops, s);
634 bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size);
635
636 bdrv_iostatus_enable(s->bs);
637 add_boot_device_path(s->conf->bootindex, dev, "/disk@0,0");
638
639 return &s->vdev;
640 }
641
642 void virtio_blk_exit(VirtIODevice *vdev)
643 {
644 VirtIOBlock *s = to_virtio_blk(vdev);
645 unregister_savevm(s->qdev, "virtio-blk", s);
646 blockdev_mark_auto_del(s->bs);
647 virtio_cleanup(vdev);
648 }