/*
 * Sharing QEMU block devices via vhost-user protocol
 *
 * Parts of the code based on nbd/server.c.
 *
 * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
12 #include "qemu/osdep.h"
13 #include "block/block.h"
14 #include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */
15 #include "standard-headers/linux/virtio_blk.h"
16 #include "qemu/vhost-user-server.h"
17 #include "vhost-user-blk-server.h"
18 #include "qapi/error.h"
19 #include "qom/object_interfaces.h"
20 #include "sysemu/block-backend.h"
21 #include "util/block-helpers.h"
/*
 * Sector units are 512 bytes regardless of the
 * virtio_blk_config->blk_size value.
 */
#define VIRTIO_BLK_SECTOR_BITS 9
#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)

enum {
    /* Number of virtqueues when the export option does not specify one */
    VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
    /* Per-request limits advertised in virtio_blk_config */
    VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
    VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
};
35 struct virtio_blk_inhdr
{
39 typedef struct VuBlkReq
{
43 struct virtio_blk_inhdr
*in
;
44 struct virtio_blk_outhdr out
;
49 /* vhost user block device */
54 QIOChannelSocket
*sioc
;
55 struct virtio_blk_config blkcfg
;
59 static void vu_blk_req_complete(VuBlkReq
*req
)
61 VuDev
*vu_dev
= &req
->server
->vu_dev
;
63 /* IO size with 1 extra status byte */
64 vu_queue_push(vu_dev
, req
->vq
, &req
->elem
, req
->size
+ 1);
65 vu_queue_notify(vu_dev
, req
->vq
);
70 static bool vu_blk_sect_range_ok(VuBlkExport
*vexp
, uint64_t sector
,
73 uint64_t nb_sectors
= size
>> BDRV_SECTOR_BITS
;
74 uint64_t total_sectors
;
76 if (nb_sectors
> BDRV_REQUEST_MAX_SECTORS
) {
79 if ((sector
<< VIRTIO_BLK_SECTOR_BITS
) % vexp
->blk_size
) {
82 blk_get_geometry(vexp
->export
.blk
, &total_sectors
);
83 if (sector
> total_sectors
|| nb_sectors
> total_sectors
- sector
) {
89 static int coroutine_fn
90 vu_blk_discard_write_zeroes(VuBlkExport
*vexp
, struct iovec
*iov
,
91 uint32_t iovcnt
, uint32_t type
)
93 BlockBackend
*blk
= vexp
->export
.blk
;
94 struct virtio_blk_discard_write_zeroes desc
;
102 /* Only one desc is currently supported */
103 if (unlikely(iov_size(iov
, iovcnt
) > sizeof(desc
))) {
104 return VIRTIO_BLK_S_UNSUPP
;
107 size
= iov_to_buf(iov
, iovcnt
, 0, &desc
, sizeof(desc
));
108 if (unlikely(size
!= sizeof(desc
))) {
109 error_report("Invalid size %zd, expected %zu", size
, sizeof(desc
));
110 return VIRTIO_BLK_S_IOERR
;
113 sector
= le64_to_cpu(desc
.sector
);
114 num_sectors
= le32_to_cpu(desc
.num_sectors
);
115 flags
= le32_to_cpu(desc
.flags
);
116 max_sectors
= (type
== VIRTIO_BLK_T_WRITE_ZEROES
) ?
117 VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS
:
118 VHOST_USER_BLK_MAX_DISCARD_SECTORS
;
120 /* This check ensures that 'bytes' fits in an int */
121 if (unlikely(num_sectors
> max_sectors
)) {
122 return VIRTIO_BLK_S_IOERR
;
125 bytes
= num_sectors
<< VIRTIO_BLK_SECTOR_BITS
;
127 if (unlikely(!vu_blk_sect_range_ok(vexp
, sector
, bytes
))) {
128 return VIRTIO_BLK_S_IOERR
;
132 * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
133 * and write zeroes commands if any unknown flag is set.
135 if (unlikely(flags
& ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP
)) {
136 return VIRTIO_BLK_S_UNSUPP
;
139 if (type
== VIRTIO_BLK_T_WRITE_ZEROES
) {
142 if (flags
& VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP
) {
143 blk_flags
|= BDRV_REQ_MAY_UNMAP
;
146 if (blk_co_pwrite_zeroes(blk
, sector
<< VIRTIO_BLK_SECTOR_BITS
,
147 bytes
, blk_flags
) == 0) {
148 return VIRTIO_BLK_S_OK
;
150 } else if (type
== VIRTIO_BLK_T_DISCARD
) {
152 * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
153 * discard commands if the unmap flag is set.
155 if (unlikely(flags
& VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP
)) {
156 return VIRTIO_BLK_S_UNSUPP
;
159 if (blk_co_pdiscard(blk
, sector
<< VIRTIO_BLK_SECTOR_BITS
,
161 return VIRTIO_BLK_S_OK
;
165 return VIRTIO_BLK_S_IOERR
;
168 static void coroutine_fn
vu_blk_virtio_process_req(void *opaque
)
170 VuBlkReq
*req
= opaque
;
171 VuServer
*server
= req
->server
;
172 VuVirtqElement
*elem
= &req
->elem
;
175 VuBlkExport
*vexp
= container_of(server
, VuBlkExport
, vu_server
);
176 BlockBackend
*blk
= vexp
->export
.blk
;
178 struct iovec
*in_iov
= elem
->in_sg
;
179 struct iovec
*out_iov
= elem
->out_sg
;
180 unsigned in_num
= elem
->in_num
;
181 unsigned out_num
= elem
->out_num
;
183 /* refer to hw/block/virtio_blk.c */
184 if (elem
->out_num
< 1 || elem
->in_num
< 1) {
185 error_report("virtio-blk request missing headers");
189 if (unlikely(iov_to_buf(out_iov
, out_num
, 0, &req
->out
,
190 sizeof(req
->out
)) != sizeof(req
->out
))) {
191 error_report("virtio-blk request outhdr too short");
195 iov_discard_front(&out_iov
, &out_num
, sizeof(req
->out
));
197 if (in_iov
[in_num
- 1].iov_len
< sizeof(struct virtio_blk_inhdr
)) {
198 error_report("virtio-blk request inhdr too short");
202 /* We always touch the last byte, so just see how big in_iov is. */
203 req
->in
= (void *)in_iov
[in_num
- 1].iov_base
204 + in_iov
[in_num
- 1].iov_len
205 - sizeof(struct virtio_blk_inhdr
);
206 iov_discard_back(in_iov
, &in_num
, sizeof(struct virtio_blk_inhdr
));
208 type
= le32_to_cpu(req
->out
.type
);
209 switch (type
& ~VIRTIO_BLK_T_BARRIER
) {
210 case VIRTIO_BLK_T_IN
:
211 case VIRTIO_BLK_T_OUT
: {
215 bool is_write
= type
& VIRTIO_BLK_T_OUT
;
216 req
->sector_num
= le64_to_cpu(req
->out
.sector
);
218 if (is_write
&& !vexp
->writable
) {
219 req
->in
->status
= VIRTIO_BLK_S_IOERR
;
224 qemu_iovec_init_external(&qiov
, out_iov
, out_num
);
226 qemu_iovec_init_external(&qiov
, in_iov
, in_num
);
229 if (unlikely(!vu_blk_sect_range_ok(vexp
,
232 req
->in
->status
= VIRTIO_BLK_S_IOERR
;
236 offset
= req
->sector_num
<< VIRTIO_BLK_SECTOR_BITS
;
239 ret
= blk_co_pwritev(blk
, offset
, qiov
.size
, &qiov
, 0);
241 ret
= blk_co_preadv(blk
, offset
, qiov
.size
, &qiov
, 0);
244 req
->in
->status
= VIRTIO_BLK_S_OK
;
246 req
->in
->status
= VIRTIO_BLK_S_IOERR
;
250 case VIRTIO_BLK_T_FLUSH
:
251 if (blk_co_flush(blk
) == 0) {
252 req
->in
->status
= VIRTIO_BLK_S_OK
;
254 req
->in
->status
= VIRTIO_BLK_S_IOERR
;
257 case VIRTIO_BLK_T_GET_ID
: {
258 size_t size
= MIN(iov_size(&elem
->in_sg
[0], in_num
),
259 VIRTIO_BLK_ID_BYTES
);
260 snprintf(elem
->in_sg
[0].iov_base
, size
, "%s", "vhost_user_blk");
261 req
->in
->status
= VIRTIO_BLK_S_OK
;
262 req
->size
= elem
->in_sg
[0].iov_len
;
265 case VIRTIO_BLK_T_DISCARD
:
266 case VIRTIO_BLK_T_WRITE_ZEROES
: {
267 if (!vexp
->writable
) {
268 req
->in
->status
= VIRTIO_BLK_S_IOERR
;
272 req
->in
->status
= vu_blk_discard_write_zeroes(vexp
, out_iov
, out_num
,
277 req
->in
->status
= VIRTIO_BLK_S_UNSUPP
;
281 vu_blk_req_complete(req
);
288 static void vu_blk_process_vq(VuDev
*vu_dev
, int idx
)
290 VuServer
*server
= container_of(vu_dev
, VuServer
, vu_dev
);
291 VuVirtq
*vq
= vu_get_queue(vu_dev
, idx
);
296 req
= vu_queue_pop(vu_dev
, vq
, sizeof(VuBlkReq
));
301 req
->server
= server
;
305 qemu_coroutine_create(vu_blk_virtio_process_req
, req
);
306 qemu_coroutine_enter(co
);
310 static void vu_blk_queue_set_started(VuDev
*vu_dev
, int idx
, bool started
)
316 vq
= vu_get_queue(vu_dev
, idx
);
317 vu_set_queue_handler(vu_dev
, vq
, started
? vu_blk_process_vq
: NULL
);
320 static uint64_t vu_blk_get_features(VuDev
*dev
)
323 VuServer
*server
= container_of(dev
, VuServer
, vu_dev
);
324 VuBlkExport
*vexp
= container_of(server
, VuBlkExport
, vu_server
);
325 features
= 1ull << VIRTIO_BLK_F_SIZE_MAX
|
326 1ull << VIRTIO_BLK_F_SEG_MAX
|
327 1ull << VIRTIO_BLK_F_TOPOLOGY
|
328 1ull << VIRTIO_BLK_F_BLK_SIZE
|
329 1ull << VIRTIO_BLK_F_FLUSH
|
330 1ull << VIRTIO_BLK_F_DISCARD
|
331 1ull << VIRTIO_BLK_F_WRITE_ZEROES
|
332 1ull << VIRTIO_BLK_F_CONFIG_WCE
|
333 1ull << VIRTIO_BLK_F_MQ
|
334 1ull << VIRTIO_F_VERSION_1
|
335 1ull << VIRTIO_RING_F_INDIRECT_DESC
|
336 1ull << VIRTIO_RING_F_EVENT_IDX
|
337 1ull << VHOST_USER_F_PROTOCOL_FEATURES
;
339 if (!vexp
->writable
) {
340 features
|= 1ull << VIRTIO_BLK_F_RO
;
346 static uint64_t vu_blk_get_protocol_features(VuDev
*dev
)
348 return 1ull << VHOST_USER_PROTOCOL_F_CONFIG
|
349 1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
;
353 vu_blk_get_config(VuDev
*vu_dev
, uint8_t *config
, uint32_t len
)
355 VuServer
*server
= container_of(vu_dev
, VuServer
, vu_dev
);
356 VuBlkExport
*vexp
= container_of(server
, VuBlkExport
, vu_server
);
358 if (len
> sizeof(struct virtio_blk_config
)) {
362 memcpy(config
, &vexp
->blkcfg
, len
);
367 vu_blk_set_config(VuDev
*vu_dev
, const uint8_t *data
,
368 uint32_t offset
, uint32_t size
, uint32_t flags
)
370 VuServer
*server
= container_of(vu_dev
, VuServer
, vu_dev
);
371 VuBlkExport
*vexp
= container_of(server
, VuBlkExport
, vu_server
);
374 /* don't support live migration */
375 if (flags
!= VHOST_SET_CONFIG_TYPE_MASTER
) {
379 if (offset
!= offsetof(struct virtio_blk_config
, wce
) ||
385 vexp
->blkcfg
.wce
= wce
;
386 blk_set_enable_write_cache(vexp
->export
.blk
, wce
);
391 * When the client disconnects, it sends a VHOST_USER_NONE request
392 * and vu_process_message will simple call exit which cause the VM
394 * To avoid this issue, process VHOST_USER_NONE request ahead
395 * of vu_process_message.
398 static int vu_blk_process_msg(VuDev
*dev
, VhostUserMsg
*vmsg
, int *do_reply
)
400 if (vmsg
->request
== VHOST_USER_NONE
) {
401 dev
->panic(dev
, "disconnect");
407 static const VuDevIface vu_blk_iface
= {
408 .get_features
= vu_blk_get_features
,
409 .queue_set_started
= vu_blk_queue_set_started
,
410 .get_protocol_features
= vu_blk_get_protocol_features
,
411 .get_config
= vu_blk_get_config
,
412 .set_config
= vu_blk_set_config
,
413 .process_msg
= vu_blk_process_msg
,
416 static void blk_aio_attached(AioContext
*ctx
, void *opaque
)
418 VuBlkExport
*vexp
= opaque
;
420 vexp
->export
.ctx
= ctx
;
421 vhost_user_server_attach_aio_context(&vexp
->vu_server
, ctx
);
424 static void blk_aio_detach(void *opaque
)
426 VuBlkExport
*vexp
= opaque
;
428 vhost_user_server_detach_aio_context(&vexp
->vu_server
);
429 vexp
->export
.ctx
= NULL
;
433 vu_blk_initialize_config(BlockDriverState
*bs
,
434 struct virtio_blk_config
*config
,
439 cpu_to_le64(bdrv_getlength(bs
) >> VIRTIO_BLK_SECTOR_BITS
);
440 config
->blk_size
= cpu_to_le32(blk_size
);
441 config
->size_max
= cpu_to_le32(0);
442 config
->seg_max
= cpu_to_le32(128 - 2);
443 config
->min_io_size
= cpu_to_le16(1);
444 config
->opt_io_size
= cpu_to_le32(1);
445 config
->num_queues
= cpu_to_le16(num_queues
);
446 config
->max_discard_sectors
=
447 cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS
);
448 config
->max_discard_seg
= cpu_to_le32(1);
449 config
->discard_sector_alignment
=
450 cpu_to_le32(blk_size
>> VIRTIO_BLK_SECTOR_BITS
);
451 config
->max_write_zeroes_sectors
452 = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS
);
453 config
->max_write_zeroes_seg
= cpu_to_le32(1);
456 static void vu_blk_exp_request_shutdown(BlockExport
*exp
)
458 VuBlkExport
*vexp
= container_of(exp
, VuBlkExport
, export
);
460 vhost_user_server_stop(&vexp
->vu_server
);
463 static int vu_blk_exp_create(BlockExport
*exp
, BlockExportOptions
*opts
,
466 VuBlkExport
*vexp
= container_of(exp
, VuBlkExport
, export
);
467 BlockExportOptionsVhostUserBlk
*vu_opts
= &opts
->u
.vhost_user_blk
;
468 Error
*local_err
= NULL
;
469 uint64_t logical_block_size
;
470 uint16_t num_queues
= VHOST_USER_BLK_NUM_QUEUES_DEFAULT
;
472 vexp
->writable
= opts
->writable
;
473 vexp
->blkcfg
.wce
= 0;
475 if (vu_opts
->has_logical_block_size
) {
476 logical_block_size
= vu_opts
->logical_block_size
;
478 logical_block_size
= VIRTIO_BLK_SECTOR_SIZE
;
480 check_block_size(exp
->id
, "logical-block-size", logical_block_size
,
483 error_propagate(errp
, local_err
);
486 vexp
->blk_size
= logical_block_size
;
487 blk_set_guest_block_size(exp
->blk
, logical_block_size
);
489 if (vu_opts
->has_num_queues
) {
490 num_queues
= vu_opts
->num_queues
;
492 if (num_queues
== 0) {
493 error_setg(errp
, "num-queues must be greater than 0");
497 vu_blk_initialize_config(blk_bs(exp
->blk
), &vexp
->blkcfg
,
498 logical_block_size
, num_queues
);
500 blk_add_aio_context_notifier(exp
->blk
, blk_aio_attached
, blk_aio_detach
,
503 if (!vhost_user_server_start(&vexp
->vu_server
, vu_opts
->addr
, exp
->ctx
,
504 num_queues
, &vu_blk_iface
, errp
)) {
505 blk_remove_aio_context_notifier(exp
->blk
, blk_aio_attached
,
506 blk_aio_detach
, vexp
);
507 return -EADDRNOTAVAIL
;
513 static void vu_blk_exp_delete(BlockExport
*exp
)
515 VuBlkExport
*vexp
= container_of(exp
, VuBlkExport
, export
);
517 blk_remove_aio_context_notifier(exp
->blk
, blk_aio_attached
, blk_aio_detach
,
521 const BlockExportDriver blk_exp_vhost_user_blk
= {
522 .type
= BLOCK_EXPORT_TYPE_VHOST_USER_BLK
,
523 .instance_size
= sizeof(VuBlkExport
),
524 .create
= vu_blk_exp_create
,
525 .delete = vu_blk_exp_delete
,
526 .request_shutdown
= vu_blk_exp_request_shutdown
,