/*-
 * BSD LICENSE
 *
 * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "vhost.h"
#include "vhost_user.h"

static const char *vhost_message_str[VHOST_USER_MAX] = {
	[VHOST_USER_NONE] = "VHOST_USER_NONE",
	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
	[VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
	[VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
};

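/*
 * Return the block size of the file behind fd; for hugetlbfs-backed guest
 * memory this is the hugepage size, which is later used as the mmap
 * alignment. Returns (uint64_t)-1 if fstat() fails.
 */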
static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

static void
free_mem_region(struct virtio_net *dev)
{
	uint32_t i;
	struct virtio_memory_region *reg;

	if (!dev || !dev->mem)
		return;

	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (reg->host_user_addr) {
			munmap(reg->mmap_addr, reg->mmap_size);
			close(reg->fd);
		}
	}
}

void
vhost_backend_cleanup(struct virtio_net *dev)
{
	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
		dev->log_addr = 0;
	}
}

/*
 * This function just returns success at the moment; nothing else needs
 * to be done when the master claims ownership of the device.
 */
static int
vhost_user_set_owner(void)
{
	return 0;
}

static int
vhost_user_reset_owner(struct virtio_net *dev)
{
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		notify_ops->destroy_device(dev->vid);
	}

	cleanup_device(dev, 0);
	reset_device(dev);
	return 0;
}

/*
 * The features that we support are requested.
 */
static uint64_t
vhost_user_get_features(void)
{
	return VHOST_FEATURES;
}

/*
 * We receive the negotiated features supported by us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
	if (features & ~VHOST_FEATURES)
		return -1;

	dev->features = features;
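	/*
	 * With mergeable RX buffers or virtio 1.0, the virtio-net header
	 * carries the extra num_buffers field, so the larger header size
	 * is used.
	 */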
	if (dev->features &
	    ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	LOG_DEBUG(VHOST_CONFIG,
		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
		dev->vid,
		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	return 0;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
			 VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	vq->size = msg->payload.state.num;

	if (dev->dequeue_zero_copy) {
		vq->nr_zmbuf = 0;
		vq->last_zmbuf_idx = 0;
		vq->zmbuf_size = vq->size;
		vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
					 sizeof(struct zcopy_mbuf), 0);
		if (vq->zmbufs == NULL) {
			RTE_LOG(WARNING, VHOST_CONFIG,
				"failed to allocate mem for zero copy; "
				"zero copy is force disabled\n");
			dev->dequeue_zero_copy = 0;
		}
	}

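	/*
	 * Per-queue shadow copy of the used ring, sized to match the vring;
	 * the datapath stages used-ring updates here before flushing them
	 * to the guest-visible used ring.
	 */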
	vq->shadow_used_ring = rte_malloc(NULL,
				vq->size * sizeof(struct vring_used_elem),
				RTE_CACHE_LINE_SIZE);
	if (!vq->shadow_used_ring) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to allocate memory for shadow used ring.\n");
		return -1;
	}

	return 0;
}

/*
 * Reallocate the virtio_dev and vhost_virtqueue data structures so that they
 * are on the same NUMA node as the memory backing the vring descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
	int oldnode, newnode;
	struct virtio_net *old_dev;
	struct vhost_virtqueue *old_vq, *vq;
	int ret;

	/*
	 * vqs are allocated in pairs, so only attempt the realloc on the
	 * first queue of each queue pair.
	 */
	if (index % VIRTIO_QNUM != 0)
		return dev;

	old_dev = dev;
	vq = old_vq = dev->virtqueue[index];

	ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
			    MPOL_F_NODE | MPOL_F_ADDR);

	/* check if we need to reallocate vq */
	ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
			     MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get vq numa information.\n");
		return dev;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate vq from %d to %d node\n", oldnode, newnode);
		vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
				       newnode);
		if (!vq)
			return dev;

		memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
		rte_free(old_vq);
	}

	/* check if we need to reallocate dev */
	ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get dev numa information.\n");
		goto out;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate dev from %d to %d node\n",
			oldnode, newnode);
		dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
		if (!dev) {
			dev = old_dev;
			goto out;
		}

		memcpy(dev, old_dev, sizeof(*dev));
		rte_free(old_dev);
	}

out:
	dev->virtqueue[index] = vq;
	dev->virtqueue[index + 1] = vq + 1;
	vhost_devices[dev->vid] = dev;

	return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
	return dev;
}
#endif

/*
 * Converts QEMU virtual address to Vhost virtual address. This function is
 * used to convert the ring addresses to our address space.
 */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qva)
{
	struct virtio_memory_region *reg;
	uint32_t i;

	/* Find the region where the address lives. */
	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];

		if (qva >= reg->guest_user_addr &&
		    qva < reg->guest_user_addr + reg->size) {
			return qva - reg->guest_user_addr +
			       reg->host_user_addr;
		}
	}

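	/* No region covers the address; callers treat 0 as a failure. */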
	return 0;
}

static int vhost_setup_mem_table(struct virtio_net *dev);

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq;

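	/*
	 * If a new memory table arrived since the last ring setup, map it
	 * now so the ring addresses below are translated against the
	 * current regions.
	 */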
	if (dev->has_new_mem_table) {
		vhost_setup_mem_table(dev);
		dev->has_new_mem_table = 0;
	}

	if (dev->mem == NULL)
		return -1;

	/* addr->index refers to the queue index. The txq is 1, the rxq is 0. */
	vq = dev->virtqueue[msg->payload.addr.index];

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.desc_user_addr);
	if (vq->desc == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find desc ring address.\n",
			dev->vid);
		return -1;
	}

	dev = numa_realloc(dev, msg->payload.addr.index);
	vq = dev->virtqueue[msg->payload.addr.index];

	vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.avail_user_addr);
	if (vq->avail == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find avail ring address.\n",
			dev->vid);
		return -1;
	}

	vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.used_user_addr);
	if (vq->used == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find used ring address.\n",
			dev->vid);
		return -1;
	}

	if (vq->last_used_idx != vq->used->idx) {
		RTE_LOG(WARNING, VHOST_CONFIG,
			"last_used_idx (%u) and vq->used->idx (%u) mismatch; "
			"some packets may be resent for Tx and dropped for Rx\n",
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx = vq->used->idx;
		vq->last_avail_idx = vq->used->idx;
	}

	vq->log_guest_addr = msg->payload.addr.log_guest_addr;

	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
			dev->vid, vq->desc);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
			dev->vid, vq->avail);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
			dev->vid, vq->used);
	LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
			dev->vid, vq->log_guest_addr);

	return 0;
}

/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num;
	dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num;

	return 0;
}

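/*
 * Record one guest-physical to host-physical page mapping. The array grows
 * by doubling, and a page that is physically contiguous with the previous
 * entry is merged into it instead of adding a new entry.
 */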
static void
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
		   uint64_t host_phys_addr, uint64_t size)
{
	struct guest_page *page, *last_page;

	if (dev->nr_guest_pages == dev->max_guest_pages &&
	    dev->nr_guest_pages > 0) {
		dev->max_guest_pages *= 2;
		dev->guest_pages = realloc(dev->guest_pages,
					dev->max_guest_pages * sizeof(*page));
		if (!dev->guest_pages) {
			RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n");
			abort();
		}
	}

	if (dev->nr_guest_pages > 0) {
		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
		/* merge if the two pages are contiguous */
		if (host_phys_addr == last_page->host_phys_addr +
				      last_page->size) {
			last_page->size += size;
			return;
		}
	}

	page = &dev->guest_pages[dev->nr_guest_pages++];
	page->guest_phys_addr = guest_phys_addr;
	page->host_phys_addr = host_phys_addr;
	page->size = size;
}

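/*
 * Walk a memory region page by page and record the host physical address of
 * each page, starting with the (possibly partial) page that contains the
 * start of the region.
 */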
static void
add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
		uint64_t page_size)
{
	uint64_t reg_size = reg->size;
	uint64_t host_user_addr = reg->host_user_addr;
	uint64_t guest_phys_addr = reg->guest_phys_addr;
	uint64_t host_phys_addr;
	uint64_t size;

	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
	size = page_size - (guest_phys_addr & (page_size - 1));
	size = RTE_MIN(size, reg_size);

	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
	host_user_addr += size;
	guest_phys_addr += size;
	reg_size -= size;

	while (reg_size > 0) {
		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
						  host_user_addr);
		add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
				   page_size);

		host_user_addr += page_size;
		guest_phys_addr += page_size;
		reg_size -= page_size;
	}
}

#ifdef RTE_LIBRTE_VHOST_DEBUG
/* TODO: enable it only in debug mode? */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size : %" PRIx64 "\n",
			i,
			page->guest_phys_addr,
			page->host_phys_addr,
			page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif

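/*
 * The memory table and its region fds are only recorded here; the actual
 * mmap of the regions is deferred to vhost_setup_mem_table(), which runs
 * when the next vring address message arrives.
 */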
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	uint32_t i;

	if (dev->has_new_mem_table) {
		/*
		 * The previous mem table was not consumed, so close the
		 * file descriptors from that mem table before copying
		 * the new one.
		 */
		for (i = 0; i < dev->mem_table.nregions; i++) {
			close(dev->mem_table_fds[i]);
		}
	}

	memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table));
	memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds));
	dev->has_new_mem_table = 1;

	return 0;
}

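/*
 * Map the pending memory table: one mmap() per region fd, with the length
 * aligned to the hugepage size reported by fstat(), then record the guest
 * page translations for each region.
 */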
static int
vhost_setup_mem_table(struct virtio_net *dev)
{
	struct VhostUserMemory memory = dev->mem_table;
	struct virtio_memory_region *reg;
	void *mmap_addr;
	uint64_t mmap_size;
	uint64_t mmap_offset;
	uint64_t alignment;
	uint32_t i;
	int fd;

	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	dev->nr_guest_pages = 0;
	if (!dev->guest_pages) {
		dev->max_guest_pages = 8;
		dev->guest_pages = malloc(dev->max_guest_pages *
					sizeof(struct guest_page));
	}

	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
		sizeof(struct virtio_memory_region) * memory.nregions, 0);
	if (dev->mem == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to allocate memory for dev->mem\n",
			dev->vid);
		return -1;
	}
	dev->mem->nregions = memory.nregions;

	for (i = 0; i < memory.nregions; i++) {
		fd = dev->mem_table_fds[i];
		reg = &dev->mem->regions[i];

		reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
		reg->guest_user_addr = memory.regions[i].userspace_addr;
		reg->size = memory.regions[i].memory_size;
		reg->fd = fd;

		mmap_offset = memory.regions[i].mmap_offset;
		mmap_size = reg->size + mmap_offset;

		/*
		 * On older long-term kernels (e.g. 2.6.32 and 3.2.72),
		 * mmap() without MAP_ANONYMOUS must be called with a length
		 * aligned to the hugepage size, otherwise it fails with
		 * EINVAL. To avoid that failure, keep the length aligned
		 * here.
		 */
		alignment = get_blk_size(fd);
		if (alignment == (uint64_t)-1) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"couldn't get hugepage size through fstat\n");
			goto err_mmap;
		}
		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);

		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_POPULATE, fd, 0);

		if (mmap_addr == MAP_FAILED) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"mmap region %u failed.\n", i);
			goto err_mmap;
		}

		reg->mmap_addr = mmap_addr;
		reg->mmap_size = mmap_size;
		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
				      mmap_offset;

		add_guest_pages(dev, reg, alignment);

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest memory region %u, size: 0x%" PRIx64 "\n"
			"\t guest physical addr: 0x%" PRIx64 "\n"
			"\t guest virtual addr: 0x%" PRIx64 "\n"
			"\t host virtual addr: 0x%" PRIx64 "\n"
			"\t mmap addr : 0x%" PRIx64 "\n"
			"\t mmap size : 0x%" PRIx64 "\n"
			"\t mmap align: 0x%" PRIx64 "\n"
			"\t mmap off : 0x%" PRIx64 "\n",
			i, reg->size,
			reg->guest_phys_addr,
			reg->guest_user_addr,
			reg->host_user_addr,
			(uint64_t)(uintptr_t)mmap_addr,
			mmap_size,
			alignment,
			mmap_offset);
	}

	dump_guest_pages(dev);

	return 0;

err_mmap:
	free_mem_region(dev);
	rte_free(dev->mem);
	dev->mem = NULL;
	return -1;
}

static int
vq_is_ready(struct vhost_virtqueue *vq)
{
	return vq && vq->desc &&
	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
}

static int
virtio_is_ready(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq;
	uint32_t i;

	for (i = 0; i < dev->num_queues; i++) {
		vq = dev->virtqueue[i];

		if (!vq_is_ready(vq)) {
			RTE_LOG(INFO, VHOST_CONFIG,
				"virtio is not ready for processing.\n");
			return 0;
		}
	}

	RTE_LOG(INFO, VHOST_CONFIG,
		"virtio is now ready for processing.\n");
	return 1;
}

static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;
	uint32_t cur_qp_idx;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring call idx:%d file:%d\n", file.index, file.fd);

	if (file.index + 1 > dev->num_queues) {
		dev->num_queues = file.index + 1;
	}

	/*
	 * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
	 * we get, so we do vring queue pair allocation here.
	 */
	cur_qp_idx = file.index / VIRTIO_QNUM;
	if (cur_qp_idx + 1 > dev->virt_qp_nb) {
		if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
			return;
	}

	vq = dev->virtqueue[file.index];
	assert(vq != NULL);

	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = file.fd;

	if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
		notify_ops->new_device(dev->vid);
	}
}

/*
 * In vhost-user, when we receive the kick message, we test whether the
 * virtio device is ready for packet processing.
 */
static void
vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring kick idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->kickfd >= 0)
		close(vq->kickfd);
	vq->kickfd = file.fd;

	if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
		if (dev->dequeue_zero_copy) {
			RTE_LOG(INFO, VHOST_CONFIG,
				"dequeue zero copy is enabled\n");
		}

		if (notify_ops->new_device(dev->vid) == 0)
			dev->flags |= VIRTIO_DEV_RUNNING;
	}
}

static void
free_zmbufs(struct vhost_virtqueue *vq)
{
	struct zcopy_mbuf *zmbuf, *next;

	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
	     zmbuf != NULL; zmbuf = next) {
		next = TAILQ_NEXT(zmbuf, next);

		rte_pktmbuf_free(zmbuf->mbuf);
		TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
	}

	rte_free(vq->zmbufs);
}

/*
 * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	/* We have to stop the queue (virtio) if it is running. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		notify_ops->destroy_device(dev->vid);
	}

	/* Here we are safe to get the last used index */
	msg->payload.state.num = vq->last_used_idx;

	RTE_LOG(INFO, VHOST_CONFIG,
		"vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num);
	/*
	 * Based on the current QEMU vhost-user implementation, this message
	 * is sent only in vhost_vring_stop.
	 * TODO: clean up the vring; it isn't usable from this point on.
	 */
	if (vq->kickfd >= 0)
		close(vq->kickfd);

	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;

	if (dev->dequeue_zero_copy)
		free_zmbufs(vq);
	rte_free(vq->shadow_used_ring);
	vq->shadow_used_ring = NULL;

	return 0;
}

/*
 * When the virtio queues are ready to work, QEMU will send us a message
 * to enable the virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
			    VhostUserMsg *msg)
{
	int enable = (int)msg->payload.state.num;

	RTE_LOG(INFO, VHOST_CONFIG,
		"set queue enable: %d to qp idx: %d\n",
		enable, msg->payload.state.index);

	if (notify_ops->vring_state_changed)
		notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable);

	dev->virtqueue[msg->payload.state.index]->enabled = enable;

	return 0;
}

static void
vhost_user_set_protocol_features(struct virtio_net *dev,
				 uint64_t protocol_features)
{
	if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
		return;

	dev->protocol_features = protocol_features;
}

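/*
 * Map the dirty-page log region supplied by QEMU. The region fd is closed
 * once it has been mapped, and any previously mapped log is released before
 * the new addresses are recorded.
 */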
static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	int fd = msg->fds[0];
	uint64_t size, off;
	void *addr;

	if (fd < 0) {
		RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
		return -1;
	}

	if (msg->size != sizeof(VhostUserLog)) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid log base msg size: %"PRId32" != %d\n",
			msg->size, (int)sizeof(VhostUserLog));
		return -1;
	}

	size = msg->payload.log.mmap_size;
	off = msg->payload.log.mmap_offset;
	RTE_LOG(INFO, VHOST_CONFIG,
		"log mmap size: %"PRId64", offset: %"PRId64"\n",
		size, off);

	/*
	 * mmap from offset 0 to work around a hugepage mmap bug: mmap will
	 * fail when the offset is not page-size aligned.
	 */
	addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	if (addr == MAP_FAILED) {
		RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
		return -1;
	}

	/*
	 * Free the previously mapped log memory in case multiple
	 * VHOST_USER_SET_LOG_BASE messages are received.
	 */
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
	}
	dev->log_addr = (uint64_t)(uintptr_t)addr;
	dev->log_base = dev->log_addr + off;
	dev->log_size = size;

	return 0;
}

/*
 * A RARP packet is constructed and broadcast to notify switches about
 * the new location of the migrated VM, so that packets from outside will
 * not be lost after migration.
 *
 * However, we don't actually "send" a RARP packet here; instead, we set
 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	uint8_t *mac = (uint8_t *)&msg->payload.u64;

	RTE_LOG(DEBUG, VHOST_CONFIG,
		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	memcpy(dev->mac.addr_bytes, mac, 6);

	/*
	 * Set the flag to inject a RARP broadcast packet at
	 * rte_vhost_dequeue_burst().
	 *
	 * rte_smp_wmb() makes sure the mac is copied before the flag
	 * is set.
	 */
	rte_smp_wmb();
	rte_atomic16_set(&dev->broadcast_rarp, 1);

	return 0;
}

/* Return the number of bytes read on success, or a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
	if (ret <= 0)
		return ret;

	if (msg && msg->size) {
		if (msg->size > sizeof(msg->payload)) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"invalid msg size: %d\n", msg->size);
			return -1;
		}
		ret = read(sockfd, &msg->payload, msg->size);
		if (ret <= 0)
			return ret;
		if (ret != (int)msg->size) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"read control message failed\n");
			return -1;
		}
	}

	return ret;
}

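/*
 * Send a reply on the vhost-user socket: the version and REPLY flags are
 * set in the header, and the header plus payload are written back without
 * any attached file descriptors.
 */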
static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	if (!msg)
		return 0;

	msg->flags &= ~VHOST_USER_VERSION_MASK;
	msg->flags |= VHOST_USER_VERSION;
	msg->flags |= VHOST_USER_REPLY_MASK;

	ret = send_fd_message(sockfd, (char *)msg,
		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);

	return ret;
}

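/*
 * Handle one vhost-user message for device 'vid': read it from the socket,
 * dispatch on msg.request, and reply on the same fd for GET-style requests.
 */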
int
vhost_user_msg_handler(int vid, int fd)
{
	struct virtio_net *dev;
	struct VhostUserMsg msg;
	int ret;

	dev = get_device(vid);
	if (dev == NULL)
		return -1;

	ret = read_vhost_message(fd, &msg);
	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
		if (ret < 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read message failed\n");
		else if (ret == 0)
			RTE_LOG(INFO, VHOST_CONFIG,
				"vhost peer closed\n");
		else
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read incorrect message\n");

		return -1;
	}

	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
		vhost_message_str[msg.request]);
	switch (msg.request) {
	case VHOST_USER_GET_FEATURES:
		msg.payload.u64 = vhost_user_get_features();
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_FEATURES:
		vhost_user_set_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_GET_PROTOCOL_FEATURES:
		msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		vhost_user_set_protocol_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
		vhost_user_set_owner();
		break;
	case VHOST_USER_RESET_OWNER:
		vhost_user_reset_owner(dev);
		break;

	case VHOST_USER_SET_MEM_TABLE:
		vhost_user_set_mem_table(dev, &msg);
		break;

	case VHOST_USER_SET_LOG_BASE:
		vhost_user_set_log_base(dev, &msg);

		/* it needs a reply */
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_LOG_FD:
		close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
		break;

	case VHOST_USER_SET_VRING_NUM:
		vhost_user_set_vring_num(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vhost_user_set_vring_addr(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_BASE:
		vhost_user_set_vring_base(dev, &msg);
		break;

	case VHOST_USER_GET_VRING_BASE:
		vhost_user_get_vring_base(dev, &msg);
		msg.size = sizeof(msg.payload.state);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_KICK:
		vhost_user_set_vring_kick(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_CALL:
		vhost_user_set_vring_call(dev, &msg);
		break;

	case VHOST_USER_SET_VRING_ERR:
		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
			close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
		break;

	case VHOST_USER_GET_QUEUE_NUM:
		msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_ENABLE:
		vhost_user_set_vring_enable(dev, &msg);
		break;
	case VHOST_USER_SEND_RARP:
		vhost_user_send_rarp(dev, &msg);
		break;

	default:
		break;

	}

	return 0;
}