/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

#include <qemu-common.h>
#include "qemu-error.h"
#include "blockdev.h"
#include "virtio-blk.h"
#ifdef __linux__
# include <scsi/sg.h>
#endif

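/*
 * Per-device state: the parent VirtIODevice, the backing
 * BlockDriverState, the single request virtqueue, and a list (rq) of
 * requests that failed under a "stop" error policy and are waiting to
 * be replayed when the VM resumes.
 */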
typedef struct VirtIOBlock
{
    VirtIODevice vdev;
    BlockDriverState *bs;
    VirtQueue *vq;
    void *rq;
    QEMUBH *bh;
    BlockConf *conf;
    unsigned short sector_mask;
    char sn[BLOCK_SERIAL_STRLEN];
    DeviceState *qdev;
} VirtIOBlock;

static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
{
    return (VirtIOBlock *)vdev;
}

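/*
 * One in-flight request: the virtqueue element it was popped from,
 * pointers into the guest-supplied header/footer segments, and a next
 * pointer so stopped requests can be chained on VirtIOBlock.rq.
 */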
typedef struct VirtIOBlockReq
{
    VirtIOBlock *dev;
    VirtQueueElement elem;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
    struct virtio_scsi_inhdr *scsi;
    QEMUIOVector qiov;
    struct VirtIOBlockReq *next;
} VirtIOBlockReq;

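/*
 * Write the status byte into the request's inhdr, push the element
 * back onto the virtqueue and notify the guest, then free the request.
 */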
static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
{
    VirtIOBlock *s = req->dev;

    req->in->status = status;
    virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
    virtio_notify(&s->vdev, s->vq);

    qemu_free(req);
}

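/*
 * Apply the block device's error policy to a failed read or write.
 * Returns 1 if the error was handled here (request requeued on s->rq
 * with the VM stopped, or completed with VIRTIO_BLK_S_IOERR), 0 if the
 * caller should complete the request normally.
 */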
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
    int is_read)
{
    BlockErrorAction action = bdrv_get_on_error(req->dev->bs, is_read);
    VirtIOBlock *s = req->dev;

    if (action == BLOCK_ERR_IGNORE) {
        bdrv_mon_event(s->bs, BDRV_ACTION_IGNORE, is_read);
        return 0;
    }

    if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
            || action == BLOCK_ERR_STOP_ANY) {
        req->next = s->rq;
        s->rq = req;
        bdrv_mon_event(s->bs, BDRV_ACTION_STOP, is_read);
        vm_stop(0);
    } else {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
    }

    return 1;
}

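/*
 * Common completion callback for reads and batched writes: on error,
 * defer to the error policy; otherwise report success to the guest.
 */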
static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    if (ret) {
        int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
        if (virtio_blk_handle_rw_error(req, -ret, is_read))
            return;
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
}

static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    virtio_blk_req_complete(req, ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK);
}

static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = qemu_malloc(sizeof(*req));
    req->dev = s;
    req->qiov.size = 0;
    req->next = NULL;
    return req;
}

static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = virtio_blk_alloc_request(s);

    if (req != NULL) {
        if (!virtqueue_pop(s->vq, &req->elem)) {
            qemu_free(req);
            return NULL;
        }
    }

    return req;
}

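/*
 * SCSI command passthrough (VIRTIO_BLK_T_SCSI_CMD): the guest's
 * segments are translated into a struct sg_io_hdr and handed to the
 * host device via the SG_IO ioctl. Only Linux provides this interface,
 * so other hosts report VIRTIO_BLK_S_UNSUPP below.
 */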
#ifdef __linux__
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    struct sg_io_hdr hdr;
    int ret;
    int status;
    int i;

    /*
     * We require at least one output segment each for the virtio_blk_outhdr
     * and the SCSI command block.
     *
     * We also require at least the virtio_blk_inhdr, the virtio_scsi_inhdr
     * and the sense buffer pointer in the input segments.
     */
    if (req->elem.out_num < 2 || req->elem.in_num < 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        return;
    }

    /*
     * No support for bidirectional commands yet.
     */
    if (req->elem.out_num > 2 && req->elem.in_num > 3) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        return;
    }

    /*
     * The scsi inhdr is placed in the second-to-last input segment, just
     * before the regular inhdr.
     */
    req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;

    memset(&hdr, 0, sizeof(struct sg_io_hdr));
    hdr.interface_id = 'S';
    hdr.cmd_len = req->elem.out_sg[1].iov_len;
    hdr.cmdp = req->elem.out_sg[1].iov_base;
    hdr.dxfer_len = 0;

    if (req->elem.out_num > 2) {
        /*
         * If there are more than the minimally required 2 output segments
         * there is write payload starting from the third iovec.
         */
        hdr.dxfer_direction = SG_DXFER_TO_DEV;
        hdr.iovec_count = req->elem.out_num - 2;

        for (i = 0; i < hdr.iovec_count; i++)
            hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len;

        hdr.dxferp = req->elem.out_sg + 2;

    } else if (req->elem.in_num > 3) {
        /*
         * If there are more than the minimally required 3 input segments
         * the guest wants to read data.
         */
        hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        hdr.iovec_count = req->elem.in_num - 3;
        for (i = 0; i < hdr.iovec_count; i++)
            hdr.dxfer_len += req->elem.in_sg[i].iov_len;

        hdr.dxferp = req->elem.in_sg;
    } else {
        /*
         * Some SCSI commands don't actually transfer any data.
         */
        hdr.dxfer_direction = SG_DXFER_NONE;
    }

    hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base;
    hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len;

    ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
    if (ret) {
        status = VIRTIO_BLK_S_UNSUPP;
        hdr.status = ret;
        hdr.resid = hdr.dxfer_len;
    } else if (hdr.status) {
        status = VIRTIO_BLK_S_IOERR;
    } else {
        status = VIRTIO_BLK_S_OK;
    }

    req->scsi->errors = hdr.status;
    req->scsi->residual = hdr.resid;
    req->scsi->sense_len = hdr.sb_len_wr;
    req->scsi->data_len = hdr.dxfer_len;

    virtio_blk_req_complete(req, status);
}
#else
static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
}
#endif /* __linux__ */

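/*
 * Guest writes are collected here and submitted in one batch with
 * bdrv_aio_multiwrite() so the block layer can merge adjacent requests
 * where possible; the array size bounds how many writes one batch holds.
 */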
typedef struct MultiReqBuffer {
    BlockRequest blkreq[32];
    unsigned int num_writes;
} MultiReqBuffer;

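/*
 * Submit the accumulated write batch via bdrv_aio_multiwrite(). On
 * failure, complete each request that was flagged with an error, then
 * reset the batch.
 */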
static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb)
{
    int i, ret;

    if (!mrb->num_writes) {
        return;
    }

    ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes);
    if (ret != 0) {
        for (i = 0; i < mrb->num_writes; i++) {
            if (mrb->blkreq[i].error) {
                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
            }
        }
    }

    mrb->num_writes = 0;
}

static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    BlockDriverAIOCB *acb;

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    virtio_submit_multiwrite(req->dev->bs, mrb);

    acb = bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
    if (!acb) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
    }
}

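/*
 * Queue a guest write into the current multiwrite batch. Requests not
 * aligned to the logical block size are rejected with -EIO, and a full
 * batch (32 entries) is submitted before the new request is added.
 */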
static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    BlockRequest *blkreq;

    if (req->out->sector & req->dev->sector_mask) {
        virtio_blk_rw_complete(req, -EIO);
        return;
    }

    if (mrb->num_writes == 32) {
        virtio_submit_multiwrite(req->dev->bs, mrb);
    }

    blkreq = &mrb->blkreq[mrb->num_writes];
    blkreq->sector = req->out->sector;
    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
    blkreq->qiov = &req->qiov;
    blkreq->cb = virtio_blk_rw_complete;
    blkreq->opaque = req;
    blkreq->error = 0;

    mrb->num_writes++;
}

static void virtio_blk_handle_read(VirtIOBlockReq *req)
{
    BlockDriverAIOCB *acb;

    if (req->out->sector & req->dev->sector_mask) {
        virtio_blk_rw_complete(req, -EIO);
        return;
    }

    acb = bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
                         req->qiov.size / BDRV_SECTOR_SIZE,
                         virtio_blk_rw_complete, req);
    if (!acb) {
        virtio_blk_rw_complete(req, -EIO);
    }
}

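/*
 * Decode one request: validate that the outhdr and inhdr occupy the
 * first output and last input segment respectively, then dispatch on
 * the request type (flush, SCSI passthrough, serial-number query,
 * write, or read).
 */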
static void virtio_blk_handle_request(VirtIOBlockReq *req,
    MultiReqBuffer *mrb)
{
    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        fprintf(stderr, "virtio-blk missing headers\n");
        exit(1);
    }

    if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
        req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
        fprintf(stderr, "virtio-blk header not in correct element\n");
        exit(1);
    }

    req->out = (void *)req->elem.out_sg[0].iov_base;
    req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;

    if (req->out->type & VIRTIO_BLK_T_FLUSH) {
        virtio_blk_handle_flush(req, mrb);
    } else if (req->out->type & VIRTIO_BLK_T_SCSI_CMD) {
        virtio_blk_handle_scsi(req);
    } else if (req->out->type & VIRTIO_BLK_T_GET_ID) {
        VirtIOBlock *s = req->dev;

        memcpy(req->elem.in_sg[0].iov_base, s->sn,
               MIN(req->elem.in_sg[0].iov_len, sizeof(s->sn)));
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
    } else if (req->out->type & VIRTIO_BLK_T_OUT) {
        qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
                                 req->elem.out_num - 1);
        virtio_blk_handle_write(req, mrb);
    } else {
        qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
                                 req->elem.in_num - 1);
        virtio_blk_handle_read(req);
    }
}

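/*
 * Virtqueue "output" callback: drain every available request from the
 * queue, then submit whatever writes were batched up along the way.
 */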
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {
        .num_writes = 0,
    };

    while ((req = virtio_blk_get_request(s))) {
        virtio_blk_handle_request(req, &mrb);
    }

    virtio_submit_multiwrite(s->bs, &mrb);

    /*
     * FIXME: Want to check for completions before returning to guest mode,
     * so cached reads and writes are reported as quickly as possible. But
     * that should be done in the generic block layer.
     */
}

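/*
 * Requests parked on s->rq after a "stop" error policy are replayed
 * from a bottom half once the VM is running again; the VM change state
 * handler below schedules that bottom half.
 */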
static void virtio_blk_dma_restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
    MultiReqBuffer mrb = {
        .num_writes = 0,
    };

    qemu_bh_delete(s->bh);
    s->bh = NULL;

    s->rq = NULL;

    while (req) {
        virtio_blk_handle_request(req, &mrb);
        req = req->next;
    }

    virtio_submit_multiwrite(s->bs, &mrb);
}

static void virtio_blk_dma_restart_cb(void *opaque, int running, int reason)
{
    VirtIOBlock *s = opaque;

    if (!running)
        return;

    if (!s->bh) {
        s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
        qemu_bh_schedule(s->bh);
    }
}

static void virtio_blk_reset(VirtIODevice *vdev)
{
    /*
     * This should cancel pending requests, but that can't be done
     * nicely until there are per-device request lists.
     */
    qemu_aio_flush();
}

/* coalesce internal state, copy to pci i/o region 0 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int cylinders, heads, secs;

    bdrv_get_geometry(s->bs, &capacity);
    bdrv_get_geometry_hint(s->bs, &cylinders, &heads, &secs);
    memset(&blkcfg, 0, sizeof(blkcfg));
    stq_raw(&blkcfg.capacity, capacity);
    stl_raw(&blkcfg.seg_max, 128 - 2);
    stw_raw(&blkcfg.cylinders, cylinders);
    blkcfg.heads = heads;
    blkcfg.sectors = secs & ~s->sector_mask;
    blkcfg.blk_size = s->conf->logical_block_size;
    blkcfg.size_max = 0;
    blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
    blkcfg.alignment_offset = 0;
    blkcfg.min_io_size = s->conf->min_io_size / blkcfg.blk_size;
    blkcfg.opt_io_size = s->conf->opt_io_size / blkcfg.blk_size;
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
{
    VirtIOBlock *s = to_virtio_blk(vdev);

    features |= (1 << VIRTIO_BLK_F_SEG_MAX);
    features |= (1 << VIRTIO_BLK_F_GEOMETRY);
    features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
    features |= (1 << VIRTIO_BLK_F_BLK_SIZE);

    if (bdrv_enable_write_cache(s->bs))
        features |= (1 << VIRTIO_BLK_F_WCACHE);

    if (bdrv_is_read_only(s->bs))
        features |= 1 << VIRTIO_BLK_F_RO;

    return features;
}

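/*
 * Migration: alongside the generic virtio state, save the chain of
 * stopped requests so they can be requeued on the destination. Each
 * saved request is preceded by a 1 flag byte; a 0 byte ends the list.
 */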
static void virtio_blk_save(QEMUFile *f, void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;

    virtio_save(&s->vdev, f);

    while (req) {
        qemu_put_sbyte(f, 1);
        qemu_put_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem));
        req = req->next;
    }
    qemu_put_sbyte(f, 0);
}

static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
{
    VirtIOBlock *s = opaque;

    if (version_id != 2)
        return -EINVAL;

    virtio_load(&s->vdev, f);
    while (qemu_get_sbyte(f)) {
        VirtIOBlockReq *req = virtio_blk_alloc_request(s);
        qemu_get_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem));
        req->next = s->rq;
        s->rq = req;
    }

    return 0;
}

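/*
 * Create and wire up the device: validate the backing drive, allocate
 * the common virtio state, register the virtqueue, the DMA-restart
 * handler and the savevm callbacks, and pick up the drive's serial
 * number.
 */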
VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf *conf)
{
    VirtIOBlock *s;
    int cylinders, heads, secs;
    static int virtio_blk_id;
    DriveInfo *dinfo;

    if (!conf->bs) {
        error_report("virtio-blk-pci: drive property not set");
        return NULL;
    }
    if (!bdrv_is_inserted(conf->bs)) {
        error_report("Device needs media, but drive is empty");
        return NULL;
    }

    s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
                                          sizeof(struct virtio_blk_config),
                                          sizeof(VirtIOBlock));

    s->vdev.get_config = virtio_blk_update_config;
    s->vdev.get_features = virtio_blk_get_features;
    s->vdev.reset = virtio_blk_reset;
    s->bs = conf->bs;
    s->conf = conf;
    s->rq = NULL;
    s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
    bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);

    /* NB: per existing s/n string convention the string is terminated
     * by '\0' only when less than sizeof (s->sn)
     */
    dinfo = drive_get_by_blockdev(s->bs);
    strncpy(s->sn, dinfo->serial, sizeof (s->sn));

    s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);

    qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
    s->qdev = dev;
    register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                    virtio_blk_save, virtio_blk_load, s);
    bdrv_set_removable(s->bs, 0);

    return &s->vdev;
}

void virtio_blk_exit(VirtIODevice *vdev)
{
    VirtIOBlock *s = to_virtio_blk(vdev);
    unregister_savevm(s->qdev, "virtio-blk", s);
}