4 * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <sys/types.h>
43 #ifdef RTE_LIBRTE_VHOST_NUMA
47 #include <rte_common.h>
48 #include <rte_malloc.h>
52 #include "vhost_user.h"
54 static const char *vhost_message_str
[VHOST_USER_MAX
] = {
55 [VHOST_USER_NONE
] = "VHOST_USER_NONE",
56 [VHOST_USER_GET_FEATURES
] = "VHOST_USER_GET_FEATURES",
57 [VHOST_USER_SET_FEATURES
] = "VHOST_USER_SET_FEATURES",
58 [VHOST_USER_SET_OWNER
] = "VHOST_USER_SET_OWNER",
59 [VHOST_USER_RESET_OWNER
] = "VHOST_USER_RESET_OWNER",
60 [VHOST_USER_SET_MEM_TABLE
] = "VHOST_USER_SET_MEM_TABLE",
61 [VHOST_USER_SET_LOG_BASE
] = "VHOST_USER_SET_LOG_BASE",
62 [VHOST_USER_SET_LOG_FD
] = "VHOST_USER_SET_LOG_FD",
63 [VHOST_USER_SET_VRING_NUM
] = "VHOST_USER_SET_VRING_NUM",
64 [VHOST_USER_SET_VRING_ADDR
] = "VHOST_USER_SET_VRING_ADDR",
65 [VHOST_USER_SET_VRING_BASE
] = "VHOST_USER_SET_VRING_BASE",
66 [VHOST_USER_GET_VRING_BASE
] = "VHOST_USER_GET_VRING_BASE",
67 [VHOST_USER_SET_VRING_KICK
] = "VHOST_USER_SET_VRING_KICK",
68 [VHOST_USER_SET_VRING_CALL
] = "VHOST_USER_SET_VRING_CALL",
69 [VHOST_USER_SET_VRING_ERR
] = "VHOST_USER_SET_VRING_ERR",
70 [VHOST_USER_GET_PROTOCOL_FEATURES
] = "VHOST_USER_GET_PROTOCOL_FEATURES",
71 [VHOST_USER_SET_PROTOCOL_FEATURES
] = "VHOST_USER_SET_PROTOCOL_FEATURES",
72 [VHOST_USER_GET_QUEUE_NUM
] = "VHOST_USER_GET_QUEUE_NUM",
73 [VHOST_USER_SET_VRING_ENABLE
] = "VHOST_USER_SET_VRING_ENABLE",
74 [VHOST_USER_SEND_RARP
] = "VHOST_USER_SEND_RARP",
/*
 * Return the block size of the file behind @fd via fstat(), or
 * (uint64_t)-1 when fstat() fails.  Used to learn the hugepage size
 * backing a guest memory region fd.
 */
static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
88 free_mem_region(struct virtio_net
*dev
)
91 struct virtio_memory_region
*reg
;
93 if (!dev
|| !dev
->mem
)
96 for (i
= 0; i
< dev
->mem
->nregions
; i
++) {
97 reg
= &dev
->mem
->regions
[i
];
98 if (reg
->host_user_addr
) {
99 munmap(reg
->mmap_addr
, reg
->mmap_size
);
106 vhost_backend_cleanup(struct virtio_net
*dev
)
109 free_mem_region(dev
);
114 munmap((void *)(uintptr_t)dev
->log_addr
, dev
->log_size
);
/*
 * This function just returns success at the moment unless
 * the device hasn't been initialised.
 */
static int
vhost_user_set_owner(void)
{
	return 0;
}
130 vhost_user_reset_owner(struct virtio_net
*dev
)
132 if (dev
->flags
& VIRTIO_DEV_RUNNING
) {
133 dev
->flags
&= ~VIRTIO_DEV_RUNNING
;
134 notify_ops
->destroy_device(dev
->vid
);
137 cleanup_device(dev
, 0);
143 * The features that we support are requested.
146 vhost_user_get_features(void)
148 return VHOST_FEATURES
;
152 * We receive the negotiated features supported by us and the virtio device.
155 vhost_user_set_features(struct virtio_net
*dev
, uint64_t features
)
157 if (features
& ~VHOST_FEATURES
)
160 dev
->features
= features
;
162 ((1 << VIRTIO_NET_F_MRG_RXBUF
) | (1ULL << VIRTIO_F_VERSION_1
))) {
163 dev
->vhost_hlen
= sizeof(struct virtio_net_hdr_mrg_rxbuf
);
165 dev
->vhost_hlen
= sizeof(struct virtio_net_hdr
);
167 LOG_DEBUG(VHOST_CONFIG
,
168 "(%d) mergeable RX buffers %s, virtio 1 %s\n",
170 (dev
->features
& (1 << VIRTIO_NET_F_MRG_RXBUF
)) ? "on" : "off",
171 (dev
->features
& (1ULL << VIRTIO_F_VERSION_1
)) ? "on" : "off");
177 * The virtio device sends us the size of the descriptor ring.
180 vhost_user_set_vring_num(struct virtio_net
*dev
,
183 struct vhost_virtqueue
*vq
= dev
->virtqueue
[msg
->payload
.state
.index
];
185 vq
->size
= msg
->payload
.state
.num
;
187 if (dev
->dequeue_zero_copy
) {
189 vq
->last_zmbuf_idx
= 0;
190 vq
->zmbuf_size
= vq
->size
;
191 vq
->zmbufs
= rte_zmalloc(NULL
, vq
->zmbuf_size
*
192 sizeof(struct zcopy_mbuf
), 0);
193 if (vq
->zmbufs
== NULL
) {
194 RTE_LOG(WARNING
, VHOST_CONFIG
,
195 "failed to allocate mem for zero copy; "
196 "zero copy is force disabled\n");
197 dev
->dequeue_zero_copy
= 0;
201 vq
->shadow_used_ring
= rte_malloc(NULL
,
202 vq
->size
* sizeof(struct vring_used_elem
),
203 RTE_CACHE_LINE_SIZE
);
204 if (!vq
->shadow_used_ring
) {
205 RTE_LOG(ERR
, VHOST_CONFIG
,
206 "failed to allocate memory for shadow used ring.\n");
214 * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
215 * same numa node as the memory of vring descriptor.
217 #ifdef RTE_LIBRTE_VHOST_NUMA
218 static struct virtio_net
*
219 numa_realloc(struct virtio_net
*dev
, int index
)
221 int oldnode
, newnode
;
222 struct virtio_net
*old_dev
;
223 struct vhost_virtqueue
*old_vq
, *vq
;
227 * vq is allocated on pairs, we should try to do realloc
228 * on first queue of one queue pair only.
230 if (index
% VIRTIO_QNUM
!= 0)
234 vq
= old_vq
= dev
->virtqueue
[index
];
236 ret
= get_mempolicy(&newnode
, NULL
, 0, old_vq
->desc
,
237 MPOL_F_NODE
| MPOL_F_ADDR
);
239 /* check if we need to reallocate vq */
240 ret
|= get_mempolicy(&oldnode
, NULL
, 0, old_vq
,
241 MPOL_F_NODE
| MPOL_F_ADDR
);
243 RTE_LOG(ERR
, VHOST_CONFIG
,
244 "Unable to get vq numa information.\n");
247 if (oldnode
!= newnode
) {
248 RTE_LOG(INFO
, VHOST_CONFIG
,
249 "reallocate vq from %d to %d node\n", oldnode
, newnode
);
250 vq
= rte_malloc_socket(NULL
, sizeof(*vq
) * VIRTIO_QNUM
, 0,
255 memcpy(vq
, old_vq
, sizeof(*vq
) * VIRTIO_QNUM
);
259 /* check if we need to reallocate dev */
260 ret
= get_mempolicy(&oldnode
, NULL
, 0, old_dev
,
261 MPOL_F_NODE
| MPOL_F_ADDR
);
263 RTE_LOG(ERR
, VHOST_CONFIG
,
264 "Unable to get dev numa information.\n");
267 if (oldnode
!= newnode
) {
268 RTE_LOG(INFO
, VHOST_CONFIG
,
269 "reallocate dev from %d to %d node\n",
271 dev
= rte_malloc_socket(NULL
, sizeof(*dev
), 0, newnode
);
277 memcpy(dev
, old_dev
, sizeof(*dev
));
282 dev
->virtqueue
[index
] = vq
;
283 dev
->virtqueue
[index
+ 1] = vq
+ 1;
284 vhost_devices
[dev
->vid
] = dev
;
289 static struct virtio_net
*
290 numa_realloc(struct virtio_net
*dev
, int index __rte_unused
)
297 * Converts QEMU virtual address to Vhost virtual address. This function is
298 * used to convert the ring addresses to our address space.
301 qva_to_vva(struct virtio_net
*dev
, uint64_t qva
)
303 struct virtio_memory_region
*reg
;
306 /* Find the region where the address lives. */
307 for (i
= 0; i
< dev
->mem
->nregions
; i
++) {
308 reg
= &dev
->mem
->regions
[i
];
310 if (qva
>= reg
->guest_user_addr
&&
311 qva
< reg
->guest_user_addr
+ reg
->size
) {
312 return qva
- reg
->guest_user_addr
+
320 static int vhost_setup_mem_table(struct virtio_net
*dev
);
323 * The virtio device sends us the desc, used and avail ring addresses.
324 * This function then converts these to our address space.
327 vhost_user_set_vring_addr(struct virtio_net
*dev
, VhostUserMsg
*msg
)
329 struct vhost_virtqueue
*vq
;
331 if (dev
->has_new_mem_table
) {
332 vhost_setup_mem_table(dev
);
333 dev
->has_new_mem_table
= 0;
337 if (dev
->mem
== NULL
)
340 /* addr->index refers to the queue index. The txq 1, rxq is 0. */
341 vq
= dev
->virtqueue
[msg
->payload
.addr
.index
];
343 /* The addresses are converted from QEMU virtual to Vhost virtual. */
344 vq
->desc
= (struct vring_desc
*)(uintptr_t)qva_to_vva(dev
,
345 msg
->payload
.addr
.desc_user_addr
);
347 RTE_LOG(ERR
, VHOST_CONFIG
,
348 "(%d) failed to find desc ring address.\n",
353 dev
= numa_realloc(dev
, msg
->payload
.addr
.index
);
354 vq
= dev
->virtqueue
[msg
->payload
.addr
.index
];
356 vq
->avail
= (struct vring_avail
*)(uintptr_t)qva_to_vva(dev
,
357 msg
->payload
.addr
.avail_user_addr
);
358 if (vq
->avail
== 0) {
359 RTE_LOG(ERR
, VHOST_CONFIG
,
360 "(%d) failed to find avail ring address.\n",
365 vq
->used
= (struct vring_used
*)(uintptr_t)qva_to_vva(dev
,
366 msg
->payload
.addr
.used_user_addr
);
368 RTE_LOG(ERR
, VHOST_CONFIG
,
369 "(%d) failed to find used ring address.\n",
374 if (vq
->last_used_idx
!= vq
->used
->idx
) {
375 RTE_LOG(WARNING
, VHOST_CONFIG
,
376 "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
377 "some packets maybe resent for Tx and dropped for Rx\n",
378 vq
->last_used_idx
, vq
->used
->idx
);
379 vq
->last_used_idx
= vq
->used
->idx
;
380 vq
->last_avail_idx
= vq
->used
->idx
;
383 vq
->log_guest_addr
= msg
->payload
.addr
.log_guest_addr
;
385 LOG_DEBUG(VHOST_CONFIG
, "(%d) mapped address desc: %p\n",
387 LOG_DEBUG(VHOST_CONFIG
, "(%d) mapped address avail: %p\n",
388 dev
->vid
, vq
->avail
);
389 LOG_DEBUG(VHOST_CONFIG
, "(%d) mapped address used: %p\n",
391 LOG_DEBUG(VHOST_CONFIG
, "(%d) log_guest_addr: %" PRIx64
"\n",
392 dev
->vid
, vq
->log_guest_addr
);
398 * The virtio device sends us the available ring last used index.
401 vhost_user_set_vring_base(struct virtio_net
*dev
,
404 dev
->virtqueue
[msg
->payload
.state
.index
]->last_used_idx
= msg
->payload
.state
.num
;
405 dev
->virtqueue
[msg
->payload
.state
.index
]->last_avail_idx
= msg
->payload
.state
.num
;
411 add_one_guest_page(struct virtio_net
*dev
, uint64_t guest_phys_addr
,
412 uint64_t host_phys_addr
, uint64_t size
)
414 struct guest_page
*page
, *last_page
;
416 if (dev
->nr_guest_pages
== dev
->max_guest_pages
&&
417 dev
->nr_guest_pages
> 0) {
418 dev
->max_guest_pages
*= 2;
419 dev
->guest_pages
= realloc(dev
->guest_pages
,
420 dev
->max_guest_pages
* sizeof(*page
));
421 if (!dev
->guest_pages
) {
422 RTE_LOG(ERR
, VHOST_CONFIG
, "cannot realloc guest_pages\n");
427 if (dev
->nr_guest_pages
> 0) {
428 last_page
= &dev
->guest_pages
[dev
->nr_guest_pages
- 1];
429 /* merge if the two pages are continuous */
430 if (host_phys_addr
== last_page
->host_phys_addr
+
432 last_page
->size
+= size
;
437 page
= &dev
->guest_pages
[dev
->nr_guest_pages
++];
438 page
->guest_phys_addr
= guest_phys_addr
;
439 page
->host_phys_addr
= host_phys_addr
;
444 add_guest_pages(struct virtio_net
*dev
, struct virtio_memory_region
*reg
,
447 uint64_t reg_size
= reg
->size
;
448 uint64_t host_user_addr
= reg
->host_user_addr
;
449 uint64_t guest_phys_addr
= reg
->guest_phys_addr
;
450 uint64_t host_phys_addr
;
453 host_phys_addr
= rte_mem_virt2phy((void *)(uintptr_t)host_user_addr
);
454 size
= page_size
- (guest_phys_addr
& (page_size
- 1));
455 size
= RTE_MIN(size
, reg_size
);
457 add_one_guest_page(dev
, guest_phys_addr
, host_phys_addr
, size
);
458 host_user_addr
+= size
;
459 guest_phys_addr
+= size
;
462 while (reg_size
> 0) {
463 host_phys_addr
= rte_mem_virt2phy((void *)(uintptr_t)
465 add_one_guest_page(dev
, guest_phys_addr
, host_phys_addr
,
468 host_user_addr
+= page_size
;
469 guest_phys_addr
+= page_size
;
470 reg_size
-= page_size
;
#ifdef RTE_LIBRTE_VHOST_DEBUG
/* TODO: enable it only in debug mode? */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size           : %" PRIx64 "\n",
			i,
			page->guest_phys_addr,
			page->host_phys_addr,
			page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif
501 vhost_user_set_mem_table(struct virtio_net
*dev
, struct VhostUserMsg
*pmsg
)
505 if (dev
->has_new_mem_table
) {
507 * The previous mem table was not consumed, so close the
508 * file descriptors from that mem table before copying
511 for (i
= 0; i
< dev
->mem_table
.nregions
; i
++) {
512 close(dev
->mem_table_fds
[i
]);
516 memcpy(&dev
->mem_table
, &pmsg
->payload
.memory
, sizeof(dev
->mem_table
));
517 memcpy(dev
->mem_table_fds
, pmsg
->fds
, sizeof(dev
->mem_table_fds
));
518 dev
->has_new_mem_table
= 1;
524 vhost_setup_mem_table(struct virtio_net
*dev
)
526 struct VhostUserMemory memory
= dev
->mem_table
;
527 struct virtio_memory_region
*reg
;
530 uint64_t mmap_offset
;
536 free_mem_region(dev
);
541 dev
->nr_guest_pages
= 0;
542 if (!dev
->guest_pages
) {
543 dev
->max_guest_pages
= 8;
544 dev
->guest_pages
= malloc(dev
->max_guest_pages
*
545 sizeof(struct guest_page
));
548 dev
->mem
= rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory
) +
549 sizeof(struct virtio_memory_region
) * memory
.nregions
, 0);
550 if (dev
->mem
== NULL
) {
551 RTE_LOG(ERR
, VHOST_CONFIG
,
552 "(%d) failed to allocate memory for dev->mem\n",
556 dev
->mem
->nregions
= memory
.nregions
;
558 for (i
= 0; i
< memory
.nregions
; i
++) {
559 fd
= dev
->mem_table_fds
[i
];
560 reg
= &dev
->mem
->regions
[i
];
562 reg
->guest_phys_addr
= memory
.regions
[i
].guest_phys_addr
;
563 reg
->guest_user_addr
= memory
.regions
[i
].userspace_addr
;
564 reg
->size
= memory
.regions
[i
].memory_size
;
567 mmap_offset
= memory
.regions
[i
].mmap_offset
;
568 mmap_size
= reg
->size
+ mmap_offset
;
570 /* mmap() without flag of MAP_ANONYMOUS, should be called
571 * with length argument aligned with hugepagesz at older
572 * longterm version Linux, like 2.6.32 and 3.2.72, or
573 * mmap() will fail with EINVAL.
575 * to avoid failure, make sure in caller to keep length
578 alignment
= get_blk_size(fd
);
579 if (alignment
== (uint64_t)-1) {
580 RTE_LOG(ERR
, VHOST_CONFIG
,
581 "couldn't get hugepage size through fstat\n");
584 mmap_size
= RTE_ALIGN_CEIL(mmap_size
, alignment
);
586 mmap_addr
= mmap(NULL
, mmap_size
, PROT_READ
| PROT_WRITE
,
587 MAP_SHARED
| MAP_POPULATE
, fd
, 0);
589 if (mmap_addr
== MAP_FAILED
) {
590 RTE_LOG(ERR
, VHOST_CONFIG
,
591 "mmap region %u failed.\n", i
);
595 reg
->mmap_addr
= mmap_addr
;
596 reg
->mmap_size
= mmap_size
;
597 reg
->host_user_addr
= (uint64_t)(uintptr_t)mmap_addr
+
600 add_guest_pages(dev
, reg
, alignment
);
602 RTE_LOG(INFO
, VHOST_CONFIG
,
603 "guest memory region %u, size: 0x%" PRIx64
"\n"
604 "\t guest physical addr: 0x%" PRIx64
"\n"
605 "\t guest virtual addr: 0x%" PRIx64
"\n"
606 "\t host virtual addr: 0x%" PRIx64
"\n"
607 "\t mmap addr : 0x%" PRIx64
"\n"
608 "\t mmap size : 0x%" PRIx64
"\n"
609 "\t mmap align: 0x%" PRIx64
"\n"
610 "\t mmap off : 0x%" PRIx64
"\n",
612 reg
->guest_phys_addr
,
613 reg
->guest_user_addr
,
615 (uint64_t)(uintptr_t)mmap_addr
,
621 dump_guest_pages(dev
);
626 free_mem_region(dev
);
633 vq_is_ready(struct vhost_virtqueue
*vq
)
635 return vq
&& vq
->desc
&&
636 vq
->kickfd
!= VIRTIO_UNINITIALIZED_EVENTFD
&&
637 vq
->callfd
!= VIRTIO_UNINITIALIZED_EVENTFD
;
641 virtio_is_ready(struct virtio_net
*dev
)
643 struct vhost_virtqueue
*vq
;
646 for (i
= 0; i
< dev
->num_queues
; i
++) {
647 vq
= dev
->virtqueue
[i
];
649 if (!vq_is_ready(vq
)) {
650 RTE_LOG(INFO
, VHOST_CONFIG
,
651 "virtio is not ready for processing.\n");
656 RTE_LOG(INFO
, VHOST_CONFIG
,
657 "virtio is now ready for processing.\n");
662 vhost_user_set_vring_call(struct virtio_net
*dev
, struct VhostUserMsg
*pmsg
)
664 struct vhost_vring_file file
;
665 struct vhost_virtqueue
*vq
;
668 file
.index
= pmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
669 if (pmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
)
670 file
.fd
= VIRTIO_INVALID_EVENTFD
;
672 file
.fd
= pmsg
->fds
[0];
673 RTE_LOG(INFO
, VHOST_CONFIG
,
674 "vring call idx:%d file:%d\n", file
.index
, file
.fd
);
676 if (file
.index
+ 1 > dev
->num_queues
) {
677 dev
->num_queues
= file
.index
+ 1;
681 * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
682 * we get, so we do vring queue pair allocation here.
684 cur_qp_idx
= file
.index
/ VIRTIO_QNUM
;
685 if (cur_qp_idx
+ 1 > dev
->virt_qp_nb
) {
686 if (alloc_vring_queue_pair(dev
, cur_qp_idx
) < 0)
690 vq
= dev
->virtqueue
[file
.index
];
696 vq
->callfd
= file
.fd
;
698 if (virtio_is_ready(dev
) && !(dev
->flags
& VIRTIO_DEV_RUNNING
)) {
699 notify_ops
->new_device(dev
->vid
);
704 * In vhost-user, when we receive kick message, will test whether virtio
705 * device is ready for packet processing.
708 vhost_user_set_vring_kick(struct virtio_net
*dev
, struct VhostUserMsg
*pmsg
)
710 struct vhost_vring_file file
;
711 struct vhost_virtqueue
*vq
;
713 file
.index
= pmsg
->payload
.u64
& VHOST_USER_VRING_IDX_MASK
;
714 if (pmsg
->payload
.u64
& VHOST_USER_VRING_NOFD_MASK
)
715 file
.fd
= VIRTIO_INVALID_EVENTFD
;
717 file
.fd
= pmsg
->fds
[0];
718 RTE_LOG(INFO
, VHOST_CONFIG
,
719 "vring kick idx:%d file:%d\n", file
.index
, file
.fd
);
721 vq
= dev
->virtqueue
[file
.index
];
724 vq
->kickfd
= file
.fd
;
726 if (virtio_is_ready(dev
) && !(dev
->flags
& VIRTIO_DEV_RUNNING
)) {
727 if (dev
->dequeue_zero_copy
) {
728 RTE_LOG(INFO
, VHOST_CONFIG
,
729 "dequeue zero copy is enabled\n");
732 if (notify_ops
->new_device(dev
->vid
) == 0)
733 dev
->flags
|= VIRTIO_DEV_RUNNING
;
738 free_zmbufs(struct vhost_virtqueue
*vq
)
740 struct zcopy_mbuf
*zmbuf
, *next
;
742 for (zmbuf
= TAILQ_FIRST(&vq
->zmbuf_list
);
743 zmbuf
!= NULL
; zmbuf
= next
) {
744 next
= TAILQ_NEXT(zmbuf
, next
);
746 rte_pktmbuf_free(zmbuf
->mbuf
);
747 TAILQ_REMOVE(&vq
->zmbuf_list
, zmbuf
, next
);
750 rte_free(vq
->zmbufs
);
754 * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
757 vhost_user_get_vring_base(struct virtio_net
*dev
,
760 struct vhost_virtqueue
*vq
= dev
->virtqueue
[msg
->payload
.state
.index
];
762 /* We have to stop the queue (virtio) if it is running. */
763 if (dev
->flags
& VIRTIO_DEV_RUNNING
) {
764 dev
->flags
&= ~VIRTIO_DEV_RUNNING
;
765 notify_ops
->destroy_device(dev
->vid
);
768 /* Here we are safe to get the last used index */
769 msg
->payload
.state
.num
= vq
->last_used_idx
;
771 RTE_LOG(INFO
, VHOST_CONFIG
,
772 "vring base idx:%d file:%d\n", msg
->payload
.state
.index
, msg
->payload
.state
.num
);
774 * Based on current qemu vhost-user implementation, this message is
775 * sent and only sent in vhost_vring_stop.
776 * TODO: cleanup the vring, it isn't usable since here.
781 vq
->kickfd
= VIRTIO_UNINITIALIZED_EVENTFD
;
782 vq
->callfd
= VIRTIO_UNINITIALIZED_EVENTFD
;
784 if (dev
->dequeue_zero_copy
)
786 rte_free(vq
->shadow_used_ring
);
787 vq
->shadow_used_ring
= NULL
;
793 * when virtio queues are ready to work, qemu will send us to
794 * enable the virtio queue pair.
797 vhost_user_set_vring_enable(struct virtio_net
*dev
,
800 int enable
= (int)msg
->payload
.state
.num
;
802 RTE_LOG(INFO
, VHOST_CONFIG
,
803 "set queue enable: %d to qp idx: %d\n",
804 enable
, msg
->payload
.state
.index
);
806 if (notify_ops
->vring_state_changed
)
807 notify_ops
->vring_state_changed(dev
->vid
, msg
->payload
.state
.index
, enable
);
809 dev
->virtqueue
[msg
->payload
.state
.index
]->enabled
= enable
;
815 vhost_user_set_protocol_features(struct virtio_net
*dev
,
816 uint64_t protocol_features
)
818 if (protocol_features
& ~VHOST_USER_PROTOCOL_FEATURES
)
821 dev
->protocol_features
= protocol_features
;
825 vhost_user_set_log_base(struct virtio_net
*dev
, struct VhostUserMsg
*msg
)
827 int fd
= msg
->fds
[0];
832 RTE_LOG(ERR
, VHOST_CONFIG
, "invalid log fd: %d\n", fd
);
836 if (msg
->size
!= sizeof(VhostUserLog
)) {
837 RTE_LOG(ERR
, VHOST_CONFIG
,
838 "invalid log base msg size: %"PRId32
" != %d\n",
839 msg
->size
, (int)sizeof(VhostUserLog
));
843 size
= msg
->payload
.log
.mmap_size
;
844 off
= msg
->payload
.log
.mmap_offset
;
845 RTE_LOG(INFO
, VHOST_CONFIG
,
846 "log mmap size: %"PRId64
", offset: %"PRId64
"\n",
850 * mmap from 0 to workaround a hugepage mmap bug: mmap will
851 * fail when offset is not page size aligned.
853 addr
= mmap(0, size
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, fd
, 0);
855 if (addr
== MAP_FAILED
) {
856 RTE_LOG(ERR
, VHOST_CONFIG
, "mmap log base failed!\n");
861 * Free previously mapped log memory on occasionally
862 * multiple VHOST_USER_SET_LOG_BASE.
865 munmap((void *)(uintptr_t)dev
->log_addr
, dev
->log_size
);
867 dev
->log_addr
= (uint64_t)(uintptr_t)addr
;
868 dev
->log_base
= dev
->log_addr
+ off
;
869 dev
->log_size
= size
;
875 * An rarp packet is constructed and broadcasted to notify switches about
876 * the new location of the migrated VM, so that packets from outside will
877 * not be lost after migration.
879 * However, we don't actually "send" a rarp packet here, instead, we set
880 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
883 vhost_user_send_rarp(struct virtio_net
*dev
, struct VhostUserMsg
*msg
)
885 uint8_t *mac
= (uint8_t *)&msg
->payload
.u64
;
887 RTE_LOG(DEBUG
, VHOST_CONFIG
,
888 ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
889 mac
[0], mac
[1], mac
[2], mac
[3], mac
[4], mac
[5]);
890 memcpy(dev
->mac
.addr_bytes
, mac
, 6);
893 * Set the flag to inject a RARP broadcast packet at
894 * rte_vhost_dequeue_burst().
896 * rte_smp_wmb() is for making sure the mac is copied
897 * before the flag is set.
900 rte_atomic16_set(&dev
->broadcast_rarp
, 1);
905 /* return bytes# of read on success or negative val on failure. */
907 read_vhost_message(int sockfd
, struct VhostUserMsg
*msg
)
911 ret
= read_fd_message(sockfd
, (char *)msg
, VHOST_USER_HDR_SIZE
,
912 msg
->fds
, VHOST_MEMORY_MAX_NREGIONS
);
916 if (msg
&& msg
->size
) {
917 if (msg
->size
> sizeof(msg
->payload
)) {
918 RTE_LOG(ERR
, VHOST_CONFIG
,
919 "invalid msg size: %d\n", msg
->size
);
922 ret
= read(sockfd
, &msg
->payload
, msg
->size
);
925 if (ret
!= (int)msg
->size
) {
926 RTE_LOG(ERR
, VHOST_CONFIG
,
927 "read control message failed\n");
936 send_vhost_message(int sockfd
, struct VhostUserMsg
*msg
)
943 msg
->flags
&= ~VHOST_USER_VERSION_MASK
;
944 msg
->flags
|= VHOST_USER_VERSION
;
945 msg
->flags
|= VHOST_USER_REPLY_MASK
;
947 ret
= send_fd_message(sockfd
, (char *)msg
,
948 VHOST_USER_HDR_SIZE
+ msg
->size
, NULL
, 0);
954 vhost_user_msg_handler(int vid
, int fd
)
956 struct virtio_net
*dev
;
957 struct VhostUserMsg msg
;
960 dev
= get_device(vid
);
964 ret
= read_vhost_message(fd
, &msg
);
965 if (ret
<= 0 || msg
.request
>= VHOST_USER_MAX
) {
967 RTE_LOG(ERR
, VHOST_CONFIG
,
968 "vhost read message failed\n");
970 RTE_LOG(INFO
, VHOST_CONFIG
,
971 "vhost peer closed\n");
973 RTE_LOG(ERR
, VHOST_CONFIG
,
974 "vhost read incorrect message\n");
979 RTE_LOG(INFO
, VHOST_CONFIG
, "read message %s\n",
980 vhost_message_str
[msg
.request
]);
981 switch (msg
.request
) {
982 case VHOST_USER_GET_FEATURES
:
983 msg
.payload
.u64
= vhost_user_get_features();
984 msg
.size
= sizeof(msg
.payload
.u64
);
985 send_vhost_message(fd
, &msg
);
987 case VHOST_USER_SET_FEATURES
:
988 vhost_user_set_features(dev
, msg
.payload
.u64
);
991 case VHOST_USER_GET_PROTOCOL_FEATURES
:
992 msg
.payload
.u64
= VHOST_USER_PROTOCOL_FEATURES
;
993 msg
.size
= sizeof(msg
.payload
.u64
);
994 send_vhost_message(fd
, &msg
);
996 case VHOST_USER_SET_PROTOCOL_FEATURES
:
997 vhost_user_set_protocol_features(dev
, msg
.payload
.u64
);
1000 case VHOST_USER_SET_OWNER
:
1001 vhost_user_set_owner();
1003 case VHOST_USER_RESET_OWNER
:
1004 vhost_user_reset_owner(dev
);
1007 case VHOST_USER_SET_MEM_TABLE
:
1008 vhost_user_set_mem_table(dev
, &msg
);
1011 case VHOST_USER_SET_LOG_BASE
:
1012 vhost_user_set_log_base(dev
, &msg
);
1014 /* it needs a reply */
1015 msg
.size
= sizeof(msg
.payload
.u64
);
1016 send_vhost_message(fd
, &msg
);
1018 case VHOST_USER_SET_LOG_FD
:
1020 RTE_LOG(INFO
, VHOST_CONFIG
, "not implemented.\n");
1023 case VHOST_USER_SET_VRING_NUM
:
1024 vhost_user_set_vring_num(dev
, &msg
);
1026 case VHOST_USER_SET_VRING_ADDR
:
1027 vhost_user_set_vring_addr(dev
, &msg
);
1029 case VHOST_USER_SET_VRING_BASE
:
1030 vhost_user_set_vring_base(dev
, &msg
);
1033 case VHOST_USER_GET_VRING_BASE
:
1034 vhost_user_get_vring_base(dev
, &msg
);
1035 msg
.size
= sizeof(msg
.payload
.state
);
1036 send_vhost_message(fd
, &msg
);
1039 case VHOST_USER_SET_VRING_KICK
:
1040 vhost_user_set_vring_kick(dev
, &msg
);
1042 case VHOST_USER_SET_VRING_CALL
:
1043 vhost_user_set_vring_call(dev
, &msg
);
1046 case VHOST_USER_SET_VRING_ERR
:
1047 if (!(msg
.payload
.u64
& VHOST_USER_VRING_NOFD_MASK
))
1049 RTE_LOG(INFO
, VHOST_CONFIG
, "not implemented\n");
1052 case VHOST_USER_GET_QUEUE_NUM
:
1053 msg
.payload
.u64
= VHOST_MAX_QUEUE_PAIRS
;
1054 msg
.size
= sizeof(msg
.payload
.u64
);
1055 send_vhost_message(fd
, &msg
);
1058 case VHOST_USER_SET_VRING_ENABLE
:
1059 vhost_user_set_vring_enable(dev
, &msg
);
1061 case VHOST_USER_SEND_RARP
:
1062 vhost_user_send_rarp(dev
, &msg
);