/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdbool.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "vhost.h"
#include "vhost_user.h"
#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535
static const char *vhost_message_str[VHOST_USER_MAX] = {
	[VHOST_USER_NONE] = "VHOST_USER_NONE",
	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
	[VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
	[VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
	[VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU",
	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
	[VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN",
	[VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL",
	[VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP",
	[VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP",
	[VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD"
};
static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
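/* Unmap every mapped region of the guest memory table and close its fd. */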
static void
free_mem_region(struct virtio_net *dev)
{
	uint32_t i;
	struct rte_vhost_mem_region *reg;

	if (!dev || !dev->mem)
		return;

	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (reg->host_user_addr) {
			munmap(reg->mmap_addr, reg->mmap_size);
			close(reg->fd);
		}
	}
}
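/*
 * Release everything the backend holds for a device: any unconsumed
 * mem table fds, the mapped guest memory, the guest page array and
 * the dirty log mapping.
 */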
static void
vhost_backend_cleanup(struct virtio_net *dev)
{
	uint32_t i;

	if (dev->has_new_mem_table) {
		for (i = 0; i < dev->mem->nregions; i++) {
			close(dev->mem_table_fds[i]);
		}
		dev->has_new_mem_table = 0;
	}
	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	free(dev->guest_pages);
	dev->guest_pages = NULL;

	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
		dev->log_addr = 0;
	}
}
/*
 * This function just returns success at the moment unless
 * the device hasn't been initialised.
 */
static int
vhost_user_set_owner(void)
{
	return 0;
}
static int
vhost_user_reset_owner(struct virtio_net *dev)
{
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	cleanup_device(dev, 0);
	reset_device(dev);
	return 0;
}
/*
 * The features that we support are requested.
 */
static uint64_t
vhost_user_get_features(struct virtio_net *dev)
{
	return dev->features;
}
/*
 * We receive the negotiated features supported by us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
	uint64_t vhost_features = 0;

	vhost_features = vhost_user_get_features(dev);
	if (features & ~vhost_features) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) received invalid negotiated features.\n",
			dev->vid);
		return -1;
	}

	if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) {
		if (dev->notify_ops->features_changed) {
			dev->notify_ops->features_changed(dev->vid, features);
		} else {
			dev->flags &= ~VIRTIO_DEV_RUNNING;
			dev->notify_ops->destroy_device(dev->vid);
		}
	}

	dev->negotiated_features = features;
	if (dev->negotiated_features &
	    ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	VHOST_LOG_DEBUG(VHOST_CONFIG,
		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
		dev->vid,
		(dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	return 0;
}
/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
			 VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	vq->size = msg->payload.state.num;

	if (dev->dequeue_zero_copy) {
		vq->nr_zmbuf = 0;
		vq->last_zmbuf_idx = 0;
		vq->zmbuf_size = vq->size;
		vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
					 sizeof(struct zcopy_mbuf), 0);
		if (vq->zmbufs == NULL) {
			RTE_LOG(WARNING, VHOST_CONFIG,
				"failed to allocate mem for zero copy; "
				"zero copy is force disabled\n");
			dev->dequeue_zero_copy = 0;
		}
	}

	vq->shadow_used_ring = rte_malloc(NULL,
				vq->size * sizeof(struct vring_used_elem),
				RTE_CACHE_LINE_SIZE);
	if (!vq->shadow_used_ring) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to allocate memory for shadow used ring.\n");
		return -1;
	}

	return 0;
}
/*
 * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
 * same numa node as the memory of vring descriptor.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
	int oldnode, newnode;
	struct virtio_net *old_dev;
	struct vhost_virtqueue *old_vq, *vq;
	int ret;

	old_dev = dev;
	vq = old_vq = dev->virtqueue[index];

	ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
			    MPOL_F_NODE | MPOL_F_ADDR);

	/* check if we need to reallocate vq */
	ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
			     MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get vq numa information.\n");
		return dev;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate vq from %d to %d node\n", oldnode, newnode);
		vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
		if (!vq)
			return dev;

		memcpy(vq, old_vq, sizeof(*vq));
		rte_free(old_vq);
	}

	/* check if we need to reallocate dev */
	ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get dev numa information.\n");
		goto out;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate dev from %d to %d node\n",
			oldnode, newnode);
		dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
		if (!dev) {
			dev = old_dev;
			goto out;
		}

		memcpy(dev, old_dev, sizeof(*dev));
		rte_free(old_dev);
	}

out:
	dev->virtqueue[index] = vq;
	vhost_devices[dev->vid] = dev;

	return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
	return dev;
}
#endif
/*
 * Converts QEMU virtual address to Vhost virtual address. This function is
 * used to convert the ring addresses to our address space.
 */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	/* Find the region where the address lives. */
	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];

		if (qva >= reg->guest_user_addr &&
		    qva <  reg->guest_user_addr + reg->size) {

			if (unlikely(*len > reg->guest_user_addr + reg->size - qva))
				*len = reg->guest_user_addr + reg->size - qva;

			return qva - reg->guest_user_addr +
			       reg->host_user_addr;
		}
	}
	*len = 0;

	return 0;
}

static int vhost_setup_mem_table(struct virtio_net *dev);
/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq;
	uint64_t len;

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	if (dev->has_new_mem_table) {
		vhost_setup_mem_table(dev);
		dev->has_new_mem_table = 0;
	}

	if (dev->mem == NULL)
		return -1;

	/* addr->index refers to the queue index. The txq is 1, the rxq is 0. */
	vq = dev->virtqueue[msg->payload.addr.index];

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	len = sizeof(struct vring_desc) * vq->size;
	vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.desc_user_addr, &len);
	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to map desc ring.\n",
			dev->vid);
		return -1;
	}

	dev = numa_realloc(dev, msg->payload.addr.index);
	vq = dev->virtqueue[msg->payload.addr.index];

	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
	vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.avail_user_addr, &len);
	if (vq->avail == 0 ||
	    len != sizeof(struct vring_avail)
		   + sizeof(uint16_t) * vq->size) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find avail ring address.\n",
			dev->vid);
		return -1;
	}

	len = sizeof(struct vring_used) +
		sizeof(struct vring_used_elem) * vq->size;
	vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.used_user_addr, &len);
	if (vq->used == 0 || len != sizeof(struct vring_used) +
			sizeof(struct vring_used_elem) * vq->size) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find used ring address.\n",
			dev->vid);
		return -1;
	}

	if (vq->last_used_idx != vq->used->idx) {
		RTE_LOG(WARNING, VHOST_CONFIG,
			"last_used_idx (%u) and vq->used->idx (%u) mismatches; "
			"some packets maybe resent for Tx and dropped for Rx\n",
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx  = vq->used->idx;
		vq->last_avail_idx = vq->used->idx;
	}

	vq->log_guest_addr = msg->payload.addr.log_guest_addr;

	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
			dev->vid, vq->desc);
	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
			dev->vid, vq->avail);
	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
			dev->vid, vq->used);
	VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
			dev->vid, vq->log_guest_addr);

	return 0;
}
/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	dev->virtqueue[msg->payload.state.index]->last_used_idx  = msg->payload.state.num;
	dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num;

	return 0;
}
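/*
 * Record one guest-physical to host-physical page mapping, growing the
 * page array on demand and merging with the previous entry when the
 * two pages are physically contiguous.
 */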
static void
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
		   uint64_t host_phys_addr, uint64_t size)
{
	struct guest_page *page, *last_page;

	if (dev->nr_guest_pages == dev->max_guest_pages) {
		dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2);
		dev->guest_pages = realloc(dev->guest_pages,
					dev->max_guest_pages * sizeof(*page));
	}

	if (dev->nr_guest_pages > 0) {
		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
		/* merge if the two pages are continuous */
		if (host_phys_addr == last_page->host_phys_addr +
				      last_page->size) {
			last_page->size += size;
			return;
		}
	}

	page = &dev->guest_pages[dev->nr_guest_pages++];
	page->guest_phys_addr = guest_phys_addr;
	page->host_phys_addr  = host_phys_addr;
	page->size = size;
}
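/*
 * Walk a memory region page by page and record each guest-physical to
 * host-physical translation; only called when dequeue zero copy is
 * enabled.
 */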
static void
add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
		uint64_t page_size)
{
	uint64_t reg_size = reg->size;
	uint64_t host_user_addr  = reg->host_user_addr;
	uint64_t guest_phys_addr = reg->guest_phys_addr;
	uint64_t host_phys_addr;
	uint64_t size;

	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
	size = page_size - (guest_phys_addr & (page_size - 1));
	size = RTE_MIN(size, reg_size);

	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
	host_user_addr  += size;
	guest_phys_addr += size;
	reg_size -= size;

	while (reg_size > 0) {
		size = RTE_MIN(reg_size, page_size);
		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
						  host_user_addr);
		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);

		host_user_addr  += size;
		guest_phys_addr += size;
		reg_size -= size;
	}
}
#ifdef RTE_LIBRTE_VHOST_DEBUG
/* TODO: enable it only in debug mode? */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size           : %" PRIx64 "\n",
			i,
			page->guest_phys_addr,
			page->host_phys_addr,
			page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif
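/*
 * Stash the memory table and its fds. The actual mapping is deferred
 * until the vring addresses arrive, except for vhost-user-nvme devices,
 * which consume the table immediately.
 */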
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	uint32_t i;

	if (dev->has_new_mem_table) {
		/*
		 * The previous mem table was not consumed, so close the
		 * file descriptors from that mem table before copying
		 * the new one.
		 */
		for (i = 0; i < dev->mem_table.nregions; i++) {
			close(dev->mem_table_fds[i]);
		}
	}

	memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table));
	memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds));
	dev->has_new_mem_table = 1;
	/* vhost-user-nvme does not send a set vring addr message,
	 * so enable the memory address table right away.
	 */
	if (dev->has_new_mem_table && dev->is_nvme) {
		vhost_setup_mem_table(dev);
		dev->has_new_mem_table = 0;
	}

	return 0;
}
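/*
 * mmap() the regions described by the stashed memory table into our
 * address space and rebuild dev->mem, dropping any previously mapped
 * regions and the now-stale vring pointers first.
 */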
static int
vhost_setup_mem_table(struct virtio_net *dev)
{
	struct VhostUserMemory memory = dev->mem_table;
	struct rte_vhost_mem_region *reg;
	struct vhost_virtqueue *vq;
	void *mmap_addr;
	uint64_t mmap_size;
	uint64_t mmap_offset;
	uint64_t alignment;
	uint32_t i;
	int fd;

	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	for (i = 0; i < dev->nr_vring; i++) {
		vq = dev->virtqueue[i];
		/* Those addresses won't be valid anymore in host address space
		 * after setting the new mem table. The initiator needs to
		 * resend these addresses.
		 */
		vq->desc = NULL;
		vq->avail = NULL;
		vq->used = NULL;
	}

	dev->nr_guest_pages = 0;
	if (!dev->guest_pages) {
		dev->max_guest_pages = 8;
		dev->guest_pages = malloc(dev->max_guest_pages *
					sizeof(struct guest_page));
	}

	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
	if (dev->mem == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to allocate memory for dev->mem\n",
			dev->vid);
		return -1;
	}
	dev->mem->nregions = memory.nregions;

	for (i = 0; i < memory.nregions; i++) {
		fd  = dev->mem_table_fds[i];
		reg = &dev->mem->regions[i];

		reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
		reg->guest_user_addr = memory.regions[i].userspace_addr;
		reg->size            = memory.regions[i].memory_size;
		reg->fd              = fd;

		mmap_offset = memory.regions[i].mmap_offset;
		mmap_size   = reg->size + mmap_offset;

		/* mmap() without the MAP_ANONYMOUS flag must be called with a
		 * length argument aligned to the hugepage size on older
		 * long-term Linux kernels, like 2.6.32 and 3.2.72, or
		 * mmap() will fail with EINVAL.
		 *
		 * To avoid failure, make sure to keep the length aligned
		 * here.
		 */
		alignment = get_blk_size(fd);
		if (alignment == (uint64_t)-1) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"couldn't get hugepage size through fstat\n");
			goto err_mmap;
		}
		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);

		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_POPULATE, fd, 0);

		if (mmap_addr == MAP_FAILED) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"mmap region %u failed.\n", i);
			goto err_mmap;
		}

		if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
			RTE_LOG(INFO, VHOST_CONFIG,
				"MADV_DONTDUMP advice setting failed.\n");
		}

		reg->mmap_addr = mmap_addr;
		reg->mmap_size = mmap_size;
		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
				      mmap_offset;

		if (dev->dequeue_zero_copy)
			add_guest_pages(dev, reg, alignment);

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest memory region %u, size: 0x%" PRIx64 "\n"
			"\t guest physical addr: 0x%" PRIx64 "\n"
			"\t guest virtual addr: 0x%" PRIx64 "\n"
			"\t host virtual addr: 0x%" PRIx64 "\n"
			"\t mmap addr : 0x%" PRIx64 "\n"
			"\t mmap size : 0x%" PRIx64 "\n"
			"\t mmap align: 0x%" PRIx64 "\n"
			"\t mmap off  : 0x%" PRIx64 "\n",
			i, reg->size,
			reg->guest_phys_addr,
			reg->guest_user_addr,
			reg->host_user_addr,
			(uint64_t)(uintptr_t)mmap_addr,
			mmap_size,
			alignment,
			mmap_offset);
	}

	dump_guest_pages(dev);

	return 0;

err_mmap:
	free_mem_region(dev);
	rte_free(dev->mem);
	dev->mem = NULL;
	return -1;
}
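/* A vring is usable once it has descriptors and valid kick/call fds. */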
static int
vq_is_ready(struct vhost_virtqueue *vq)
{
	return vq && vq->desc &&
	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->kickfd != VIRTIO_INVALID_EVENTFD &&
	       vq->callfd != VIRTIO_INVALID_EVENTFD;
}

static int
virtio_is_ready(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq;
	uint32_t i;

	if (dev->nr_vring == 0)
		return 0;

	for (i = 0; i < dev->nr_vring; i++) {
		vq = dev->virtqueue[i];

		if (vq_is_ready(vq)) {
			RTE_LOG(INFO, VHOST_CONFIG,
				"virtio is now ready for processing.\n");
			return 1;
		}
	}

	return 0;
}
static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring call idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = file.fd;
}
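/* Install the eventfd the guest kicks to notify us of new buffers. */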
static void
vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring kick idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->kickfd >= 0)
		close(vq->kickfd);
	vq->kickfd = file.fd;
}
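/* Free every outstanding zero-copy mbuf still attached to the vring. */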
static void
free_zmbufs(struct vhost_virtqueue *vq)
{
	struct zcopy_mbuf *zmbuf, *next;

	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
	     zmbuf != NULL; zmbuf = next) {
		next = TAILQ_NEXT(zmbuf, next);

		rte_pktmbuf_free(zmbuf->mbuf);
		TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
	}

	rte_free(vq->zmbufs);
}
/*
 * When virtio is stopped, qemu will send us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	/* We have to stop the queue (virtio) if it is running. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	dev->flags &= ~VIRTIO_DEV_READY;

	/* Here we are safe to get the last used index */
	msg->payload.state.num = vq->last_used_idx;

	RTE_LOG(INFO, VHOST_CONFIG,
		"vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num);
	/*
	 * Based on the current qemu vhost-user implementation, this message is
	 * sent and only sent in vhost_vring_stop.
	 * TODO: clean up the vring; it isn't usable from this point on.
	 */
	if (vq->kickfd >= 0)
		close(vq->kickfd);

	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;

	if (dev->dequeue_zero_copy)
		free_zmbufs(vq);

	rte_free(vq->shadow_used_ring);
	vq->shadow_used_ring = NULL;

	return 0;
}
/*
 * When the virtio queues are ready to work, qemu sends us a message
 * to enable the virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
			    VhostUserMsg *msg)
{
	int enable = (int)msg->payload.state.num;

	RTE_LOG(INFO, VHOST_CONFIG,
		"set queue enable: %d to qp idx: %d\n",
		enable, msg->payload.state.index);

	if (dev->notify_ops->vring_state_changed)
		dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable);

	dev->virtqueue[msg->payload.state.index]->enabled = enable;

	return 0;
}
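/* Accept the protocol feature bits, ignoring requests with unknown bits set. */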
static void
vhost_user_set_protocol_features(struct virtio_net *dev,
				 uint64_t protocol_features)
{
	if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
		return;

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	dev->protocol_features = protocol_features;
}
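/* Map the dirty-page logging area shared by the master for live migration. */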
static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	int fd = msg->fds[0];
	uint64_t size, off;
	void *addr;

	if (fd < 0) {
		RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
		return -1;
	}

	if (msg->size != sizeof(VhostUserLog)) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid log base msg size: %"PRId32" != %d\n",
			msg->size, (int)sizeof(VhostUserLog));
		return -1;
	}

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	size = msg->payload.log.mmap_size;
	off  = msg->payload.log.mmap_offset;
	RTE_LOG(INFO, VHOST_CONFIG,
		"log mmap size: %"PRId64", offset: %"PRId64"\n",
		size, off);

	/*
	 * mmap from 0 to workaround a hugepage mmap bug: mmap will
	 * fail when offset is not page size aligned.
	 */
	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	if (addr == MAP_FAILED) {
		RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
		return -1;
	}

	/*
	 * Free the previously mapped log memory in case of repeated
	 * VHOST_USER_SET_LOG_BASE messages.
	 */
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
	}
	dev->log_addr = (uint64_t)(uintptr_t)addr;
	dev->log_base = dev->log_addr + off;
	dev->log_size = size;

	return 0;
}
/*
 * A RARP packet is constructed and broadcast to notify switches about
 * the new location of the migrated VM, so that packets from outside will
 * not be lost after migration.
 *
 * However, we don't actually "send" a rarp packet here, instead, we set
 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	uint8_t *mac = (uint8_t *)&msg->payload.u64;

	RTE_LOG(DEBUG, VHOST_CONFIG,
		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	memcpy(dev->mac.addr_bytes, mac, 6);

	/*
	 * Set the flag to inject a RARP broadcast packet at
	 * rte_vhost_dequeue_burst().
	 *
	 * rte_smp_wmb() is for making sure the mac is copied
	 * before the flag is set.
	 */
	rte_smp_wmb();
	rte_atomic16_set(&dev->broadcast_rarp, 1);

	return 0;
}
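/* Validate and store the MTU announced by the master. */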
static int
vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	if (msg->payload.u64 < VIRTIO_MIN_MTU ||
			msg->payload.u64 > VIRTIO_MAX_MTU) {
		RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
				msg->payload.u64);

		return -1;
	}

	dev->mtu = msg->payload.u64;

	return 0;
}
/* Return the number of bytes read on success, or a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
	if (ret <= 0)
		return ret;

	if (msg && msg->size) {
		if (msg->size > sizeof(msg->payload)) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"invalid msg size: %d\n", msg->size);
			return -1;
		}
		ret = read(sockfd, &msg->payload, msg->size);
		if (ret <= 0)
			return ret;
		if (ret != (int)msg->size) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"read control message failed\n");
			return -1;
		}
	}

	return ret;
}
static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	if (!msg)
		return 0;

	msg->flags &= ~VHOST_USER_VERSION_MASK;
	msg->flags &= ~VHOST_USER_NEED_REPLY;
	msg->flags |= VHOST_USER_VERSION;
	msg->flags |= VHOST_USER_REPLY_MASK;

	ret = send_fd_message(sockfd, (char *)msg,
		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);

	return ret;
}
/*
 * Allocate a queue pair if it hasn't been allocated yet
 */
static int
vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
{
	uint16_t vring_idx;

	switch (msg->request) {
	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
		break;
	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		vring_idx = msg->payload.state.index;
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vring_idx = msg->payload.addr.index;
		break;
	default:
		return 0;
	}

	if (vring_idx >= VHOST_MAX_VRING) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid vring index: %u\n", vring_idx);
		return -1;
	}

	if (dev->virtqueue[vring_idx])
		return 0;

	return alloc_vring_queue(dev, vring_idx);
}
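/*
 * vhost-user-nvme extensions: admin command passthrough, completion
 * queue eventfd setup and controller capability queries are forwarded
 * to the backend through optional notify_ops callbacks.
 */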
static int
vhost_user_nvme_io_request_passthrough(struct virtio_net *dev,
				       uint16_t qid, uint16_t tail_head,
				       bool is_submission_queue)
{
	return -1;
}

static int
vhost_user_nvme_admin_passthrough(struct virtio_net *dev,
				  void *cmd, void *cqe, void *buf)
{
	if (dev->notify_ops->vhost_nvme_admin_passthrough) {
		return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf);
	}

	return -1;
}

static int
vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd)
{
	if (dev->notify_ops->vhost_nvme_set_cq_call) {
		return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd);
	}

	return -1;
}

static int
vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap)
{
	if (dev->notify_ops->vhost_nvme_get_cap) {
		return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap);
	}

	return -1;
}
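/*
 * Central dispatcher: read one vhost-user message from the socket,
 * handle it, send a reply where the protocol requires one, and start
 * the device once its vrings become ready.
 */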
int
vhost_user_msg_handler(int vid, int fd)
{
	struct virtio_net *dev;
	struct VhostUserMsg msg;
	struct vhost_vring_file file;
	int ret;
	uint64_t cap;
	uint64_t enable;
	uint8_t cqe[16];
	uint8_t cmd[64];
	uint8_t buf[4096];
	uint16_t qid, tail_head;
	bool is_submission_queue;

	dev = get_device(vid);
	if (dev == NULL)
		return -1;

	if (!dev->notify_ops) {
		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
		if (!dev->notify_ops) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"failed to get callback ops for driver %s\n",
				dev->ifname);
			return -1;
		}
	}

	ret = read_vhost_message(fd, &msg);
	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
		if (ret < 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read message failed\n");
		else if (ret == 0)
			RTE_LOG(INFO, VHOST_CONFIG,
				"vhost peer closed\n");
		else
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read incorrect message\n");

		return -1;
	}

	RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n",
		dev->ifname, vhost_message_str[msg.request]);

	ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
	if (ret < 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to alloc queue\n");
		return -1;
	}

	switch (msg.request) {
	case VHOST_USER_GET_CONFIG:
		if (dev->notify_ops->get_config(dev->vid,
						msg.payload.config.region,
						msg.payload.config.size) != 0) {
			msg.size = sizeof(uint64_t);
		}
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_CONFIG:
		if ((dev->notify_ops->set_config(dev->vid,
						 msg.payload.config.region,
						 msg.payload.config.offset,
						 msg.payload.config.size,
						 msg.payload.config.flags)) != 0) {
			ret = -1;
		} else {
			ret = 0;
		}
		break;
	case VHOST_USER_NVME_ADMIN:
		if (!dev->is_nvme) {
			dev->is_nvme = 1;
		}
		memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd));
		ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf);
		memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe));
		msg.size = sizeof(cqe);
		/* NVMe Identify Command */
		if (cmd[0] == 0x06) {
			memcpy(msg.payload.nvme.buf, &buf, 4096);
			msg.size += 4096;
		}
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_NVME_SET_CQ_CALL:
		file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
		file.fd = msg.fds[0];
		ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd);
		break;
	case VHOST_USER_NVME_GET_CAP:
		ret = vhost_user_nvme_get_cap(dev, &cap);
		if (!ret)
			msg.payload.u64 = cap;
		else
			msg.payload.u64 = 0;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_NVME_START_STOP:
		enable = msg.payload.u64;
		/* device must be started before set cq call */
		if (enable) {
			if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
				if (dev->notify_ops->new_device(dev->vid) == 0)
					dev->flags |= VIRTIO_DEV_RUNNING;
			}
		} else {
			if (dev->flags & VIRTIO_DEV_RUNNING) {
				dev->flags &= ~VIRTIO_DEV_RUNNING;
				dev->notify_ops->destroy_device(dev->vid);
			}
		}
		break;
	case VHOST_USER_NVME_IO_CMD:
		qid = msg.payload.nvme_io.qid;
		tail_head = msg.payload.nvme_io.tail_head;
		is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? true : false;
		vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue);
		break;
	case VHOST_USER_GET_FEATURES:
		msg.payload.u64 = vhost_user_get_features(dev);
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_FEATURES:
		vhost_user_set_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_GET_PROTOCOL_FEATURES:
		msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		vhost_user_set_protocol_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
		vhost_user_set_owner();
		break;
	case VHOST_USER_RESET_OWNER:
		vhost_user_reset_owner(dev);
		break;

	case VHOST_USER_SET_MEM_TABLE:
		ret = vhost_user_set_mem_table(dev, &msg);
		break;

	case VHOST_USER_SET_LOG_BASE:
		vhost_user_set_log_base(dev, &msg);

		/* it needs a reply */
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_LOG_FD:
		close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
		break;

	case VHOST_USER_SET_VRING_NUM:
		vhost_user_set_vring_num(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vhost_user_set_vring_addr(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_BASE:
		vhost_user_set_vring_base(dev, &msg);
		break;

	case VHOST_USER_GET_VRING_BASE:
		vhost_user_get_vring_base(dev, &msg);
		msg.size = sizeof(msg.payload.state);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_KICK:
		vhost_user_set_vring_kick(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_CALL:
		vhost_user_set_vring_call(dev, &msg);
		break;

	case VHOST_USER_SET_VRING_ERR:
		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
			close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
		break;

	case VHOST_USER_GET_QUEUE_NUM:
		msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_ENABLE:
		vhost_user_set_vring_enable(dev, &msg);
		break;
	case VHOST_USER_SEND_RARP:
		vhost_user_send_rarp(dev, &msg);
		break;

	case VHOST_USER_NET_SET_MTU:
		ret = vhost_user_net_set_mtu(dev, &msg);
		break;

	default:
		ret = -1;
		break;
	}

	if (msg.flags & VHOST_USER_NEED_REPLY) {
		msg.payload.u64 = !!ret;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
	}

	if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
		dev->flags |= VIRTIO_DEV_READY;

		if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
			if (dev->dequeue_zero_copy) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"dequeue zero copy is enabled\n");
			}

			if (dev->notify_ops->new_device(dev->vid) == 0)
				dev->flags |= VIRTIO_DEV_RUNNING;
		}
	}

	return 0;
}