]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*- |
2 | * BSD LICENSE | |
3 | * | |
4 | * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. | |
5 | * All rights reserved. | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * * Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * * Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * * Neither the name of Intel Corporation nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
24 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
25 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
26 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
27 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
28 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
29 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
31 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
32 | */ | |
33 | ||
34 | #include <stdint.h> | |
35 | #include <stdio.h> | |
36 | #include <stdlib.h> | |
37 | #include <string.h> | |
38 | #include <unistd.h> | |
39 | #include <sys/mman.h> | |
40 | #include <sys/types.h> | |
41 | #include <sys/stat.h> | |
42 | #include <assert.h> | |
43 | #ifdef RTE_LIBRTE_VHOST_NUMA | |
44 | #include <numaif.h> | |
45 | #endif | |
46 | ||
47 | #include <rte_common.h> | |
48 | #include <rte_malloc.h> | |
49 | #include <rte_log.h> | |
50 | ||
51 | #include "vhost.h" | |
52 | #include "vhost_user.h" | |
53 | ||
54 | static const char *vhost_message_str[VHOST_USER_MAX] = { | |
55 | [VHOST_USER_NONE] = "VHOST_USER_NONE", | |
56 | [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", | |
57 | [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", | |
58 | [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", | |
59 | [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", | |
60 | [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", | |
61 | [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", | |
62 | [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", | |
63 | [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", | |
64 | [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", | |
65 | [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", | |
66 | [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", | |
67 | [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", | |
68 | [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", | |
69 | [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", | |
70 | [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", | |
71 | [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", | |
72 | [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", | |
73 | [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", | |
74 | [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", | |
75 | }; | |
76 | ||
/*
 * Return the st_blksize value fstat() reports for fd, or (uint64_t)-1
 * when fstat() fails.
 */
static uint64_t
get_blk_size(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) == -1)
		return (uint64_t)-1;

	return (uint64_t)sb.st_blksize;
}
86 | ||
87 | static void | |
88 | free_mem_region(struct virtio_net *dev) | |
89 | { | |
90 | uint32_t i; | |
91 | struct virtio_memory_region *reg; | |
92 | ||
93 | if (!dev || !dev->mem) | |
94 | return; | |
95 | ||
96 | for (i = 0; i < dev->mem->nregions; i++) { | |
97 | reg = &dev->mem->regions[i]; | |
98 | if (reg->host_user_addr) { | |
99 | munmap(reg->mmap_addr, reg->mmap_size); | |
100 | close(reg->fd); | |
101 | } | |
102 | } | |
103 | } | |
104 | ||
105 | void | |
106 | vhost_backend_cleanup(struct virtio_net *dev) | |
107 | { | |
108 | if (dev->mem) { | |
109 | free_mem_region(dev); | |
110 | rte_free(dev->mem); | |
111 | dev->mem = NULL; | |
112 | } | |
113 | if (dev->log_addr) { | |
114 | munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); | |
115 | dev->log_addr = 0; | |
116 | } | |
117 | } | |
118 | ||
/*
 * VHOST_USER_SET_OWNER handler. There is nothing to do for this request
 * at the moment, so success (0) is always reported.
 */
static int
vhost_user_set_owner(void)
{
	const int success = 0;

	return success;
}
128 | ||
129 | static int | |
130 | vhost_user_reset_owner(struct virtio_net *dev) | |
131 | { | |
132 | if (dev->flags & VIRTIO_DEV_RUNNING) { | |
133 | dev->flags &= ~VIRTIO_DEV_RUNNING; | |
134 | notify_ops->destroy_device(dev->vid); | |
135 | } | |
136 | ||
137 | cleanup_device(dev, 0); | |
138 | reset_device(dev); | |
139 | return 0; | |
140 | } | |
141 | ||
142 | /* | |
143 | * The features that we support are requested. | |
144 | */ | |
145 | static uint64_t | |
146 | vhost_user_get_features(void) | |
147 | { | |
148 | return VHOST_FEATURES; | |
149 | } | |
150 | ||
151 | /* | |
152 | * We receive the negotiated features supported by us and the virtio device. | |
153 | */ | |
154 | static int | |
155 | vhost_user_set_features(struct virtio_net *dev, uint64_t features) | |
156 | { | |
157 | if (features & ~VHOST_FEATURES) | |
158 | return -1; | |
159 | ||
160 | dev->features = features; | |
161 | if (dev->features & | |
162 | ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { | |
163 | dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); | |
164 | } else { | |
165 | dev->vhost_hlen = sizeof(struct virtio_net_hdr); | |
166 | } | |
167 | LOG_DEBUG(VHOST_CONFIG, | |
168 | "(%d) mergeable RX buffers %s, virtio 1 %s\n", | |
169 | dev->vid, | |
170 | (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", | |
171 | (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); | |
172 | ||
173 | return 0; | |
174 | } | |
175 | ||
176 | /* | |
177 | * The virtio device sends us the size of the descriptor ring. | |
178 | */ | |
179 | static int | |
180 | vhost_user_set_vring_num(struct virtio_net *dev, | |
181 | struct vhost_vring_state *state) | |
182 | { | |
183 | struct vhost_virtqueue *vq = dev->virtqueue[state->index]; | |
184 | ||
185 | vq->size = state->num; | |
186 | ||
187 | if (dev->dequeue_zero_copy) { | |
188 | vq->nr_zmbuf = 0; | |
189 | vq->last_zmbuf_idx = 0; | |
190 | vq->zmbuf_size = vq->size; | |
191 | vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * | |
192 | sizeof(struct zcopy_mbuf), 0); | |
193 | if (vq->zmbufs == NULL) { | |
194 | RTE_LOG(WARNING, VHOST_CONFIG, | |
195 | "failed to allocate mem for zero copy; " | |
196 | "zero copy is force disabled\n"); | |
197 | dev->dequeue_zero_copy = 0; | |
198 | } | |
199 | } | |
200 | ||
201 | vq->shadow_used_ring = rte_malloc(NULL, | |
202 | vq->size * sizeof(struct vring_used_elem), | |
203 | RTE_CACHE_LINE_SIZE); | |
204 | if (!vq->shadow_used_ring) { | |
205 | RTE_LOG(ERR, VHOST_CONFIG, | |
206 | "failed to allocate memory for shadow used ring.\n"); | |
207 | return -1; | |
208 | } | |
209 | ||
210 | return 0; | |
211 | } | |
212 | ||
213 | /* | |
214 | * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the | |
215 | * same numa node as the memory of vring descriptor. | |
216 | */ | |
217 | #ifdef RTE_LIBRTE_VHOST_NUMA | |
218 | static struct virtio_net* | |
219 | numa_realloc(struct virtio_net *dev, int index) | |
220 | { | |
221 | int oldnode, newnode; | |
222 | struct virtio_net *old_dev; | |
223 | struct vhost_virtqueue *old_vq, *vq; | |
224 | int ret; | |
225 | ||
226 | /* | |
227 | * vq is allocated on pairs, we should try to do realloc | |
228 | * on first queue of one queue pair only. | |
229 | */ | |
230 | if (index % VIRTIO_QNUM != 0) | |
231 | return dev; | |
232 | ||
233 | old_dev = dev; | |
234 | vq = old_vq = dev->virtqueue[index]; | |
235 | ||
236 | ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, | |
237 | MPOL_F_NODE | MPOL_F_ADDR); | |
238 | ||
239 | /* check if we need to reallocate vq */ | |
240 | ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, | |
241 | MPOL_F_NODE | MPOL_F_ADDR); | |
242 | if (ret) { | |
243 | RTE_LOG(ERR, VHOST_CONFIG, | |
244 | "Unable to get vq numa information.\n"); | |
245 | return dev; | |
246 | } | |
247 | if (oldnode != newnode) { | |
248 | RTE_LOG(INFO, VHOST_CONFIG, | |
249 | "reallocate vq from %d to %d node\n", oldnode, newnode); | |
250 | vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, | |
251 | newnode); | |
252 | if (!vq) | |
253 | return dev; | |
254 | ||
255 | memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); | |
256 | rte_free(old_vq); | |
257 | } | |
258 | ||
259 | /* check if we need to reallocate dev */ | |
260 | ret = get_mempolicy(&oldnode, NULL, 0, old_dev, | |
261 | MPOL_F_NODE | MPOL_F_ADDR); | |
262 | if (ret) { | |
263 | RTE_LOG(ERR, VHOST_CONFIG, | |
264 | "Unable to get dev numa information.\n"); | |
265 | goto out; | |
266 | } | |
267 | if (oldnode != newnode) { | |
268 | RTE_LOG(INFO, VHOST_CONFIG, | |
269 | "reallocate dev from %d to %d node\n", | |
270 | oldnode, newnode); | |
271 | dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); | |
272 | if (!dev) { | |
273 | dev = old_dev; | |
274 | goto out; | |
275 | } | |
276 | ||
277 | memcpy(dev, old_dev, sizeof(*dev)); | |
278 | rte_free(old_dev); | |
279 | } | |
280 | ||
281 | out: | |
282 | dev->virtqueue[index] = vq; | |
283 | dev->virtqueue[index + 1] = vq + 1; | |
284 | vhost_devices[dev->vid] = dev; | |
285 | ||
286 | return dev; | |
287 | } | |
288 | #else | |
289 | static struct virtio_net* | |
290 | numa_realloc(struct virtio_net *dev, int index __rte_unused) | |
291 | { | |
292 | return dev; | |
293 | } | |
294 | #endif | |
295 | ||
296 | /* | |
297 | * Converts QEMU virtual address to Vhost virtual address. This function is | |
298 | * used to convert the ring addresses to our address space. | |
299 | */ | |
300 | static uint64_t | |
301 | qva_to_vva(struct virtio_net *dev, uint64_t qva) | |
302 | { | |
303 | struct virtio_memory_region *reg; | |
304 | uint32_t i; | |
305 | ||
306 | /* Find the region where the address lives. */ | |
307 | for (i = 0; i < dev->mem->nregions; i++) { | |
308 | reg = &dev->mem->regions[i]; | |
309 | ||
310 | if (qva >= reg->guest_user_addr && | |
311 | qva < reg->guest_user_addr + reg->size) { | |
312 | return qva - reg->guest_user_addr + | |
313 | reg->host_user_addr; | |
314 | } | |
315 | } | |
316 | ||
317 | return 0; | |
318 | } | |
319 | ||
320 | /* | |
321 | * The virtio device sends us the desc, used and avail ring addresses. | |
322 | * This function then converts these to our address space. | |
323 | */ | |
324 | static int | |
325 | vhost_user_set_vring_addr(struct virtio_net *dev, struct vhost_vring_addr *addr) | |
326 | { | |
327 | struct vhost_virtqueue *vq; | |
328 | ||
329 | if (dev->mem == NULL) | |
330 | return -1; | |
331 | ||
332 | /* addr->index refers to the queue index. The txq 1, rxq is 0. */ | |
333 | vq = dev->virtqueue[addr->index]; | |
334 | ||
335 | /* The addresses are converted from QEMU virtual to Vhost virtual. */ | |
336 | vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, | |
337 | addr->desc_user_addr); | |
338 | if (vq->desc == 0) { | |
339 | RTE_LOG(ERR, VHOST_CONFIG, | |
340 | "(%d) failed to find desc ring address.\n", | |
341 | dev->vid); | |
342 | return -1; | |
343 | } | |
344 | ||
345 | dev = numa_realloc(dev, addr->index); | |
346 | vq = dev->virtqueue[addr->index]; | |
347 | ||
348 | vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, | |
349 | addr->avail_user_addr); | |
350 | if (vq->avail == 0) { | |
351 | RTE_LOG(ERR, VHOST_CONFIG, | |
352 | "(%d) failed to find avail ring address.\n", | |
353 | dev->vid); | |
354 | return -1; | |
355 | } | |
356 | ||
357 | vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, | |
358 | addr->used_user_addr); | |
359 | if (vq->used == 0) { | |
360 | RTE_LOG(ERR, VHOST_CONFIG, | |
361 | "(%d) failed to find used ring address.\n", | |
362 | dev->vid); | |
363 | return -1; | |
364 | } | |
365 | ||
366 | if (vq->last_used_idx != vq->used->idx) { | |
367 | RTE_LOG(WARNING, VHOST_CONFIG, | |
368 | "last_used_idx (%u) and vq->used->idx (%u) mismatches; " | |
369 | "some packets maybe resent for Tx and dropped for Rx\n", | |
370 | vq->last_used_idx, vq->used->idx); | |
371 | vq->last_used_idx = vq->used->idx; | |
372 | vq->last_avail_idx = vq->used->idx; | |
373 | } | |
374 | ||
375 | vq->log_guest_addr = addr->log_guest_addr; | |
376 | ||
377 | LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", | |
378 | dev->vid, vq->desc); | |
379 | LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", | |
380 | dev->vid, vq->avail); | |
381 | LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", | |
382 | dev->vid, vq->used); | |
383 | LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", | |
384 | dev->vid, vq->log_guest_addr); | |
385 | ||
386 | return 0; | |
387 | } | |
388 | ||
389 | /* | |
390 | * The virtio device sends us the available ring last used index. | |
391 | */ | |
392 | static int | |
393 | vhost_user_set_vring_base(struct virtio_net *dev, | |
394 | struct vhost_vring_state *state) | |
395 | { | |
396 | dev->virtqueue[state->index]->last_used_idx = state->num; | |
397 | dev->virtqueue[state->index]->last_avail_idx = state->num; | |
398 | ||
399 | return 0; | |
400 | } | |
401 | ||
402 | static void | |
403 | add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, | |
404 | uint64_t host_phys_addr, uint64_t size) | |
405 | { | |
406 | struct guest_page *page, *last_page; | |
407 | ||
408 | if (dev->nr_guest_pages == dev->max_guest_pages) { | |
409 | dev->max_guest_pages *= 2; | |
410 | dev->guest_pages = realloc(dev->guest_pages, | |
411 | dev->max_guest_pages * sizeof(*page)); | |
412 | } | |
413 | ||
414 | if (dev->nr_guest_pages > 0) { | |
415 | last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; | |
416 | /* merge if the two pages are continuous */ | |
417 | if (host_phys_addr == last_page->host_phys_addr + | |
418 | last_page->size) { | |
419 | last_page->size += size; | |
420 | return; | |
421 | } | |
422 | } | |
423 | ||
424 | page = &dev->guest_pages[dev->nr_guest_pages++]; | |
425 | page->guest_phys_addr = guest_phys_addr; | |
426 | page->host_phys_addr = host_phys_addr; | |
427 | page->size = size; | |
428 | } | |
429 | ||
430 | static void | |
431 | add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, | |
432 | uint64_t page_size) | |
433 | { | |
434 | uint64_t reg_size = reg->size; | |
435 | uint64_t host_user_addr = reg->host_user_addr; | |
436 | uint64_t guest_phys_addr = reg->guest_phys_addr; | |
437 | uint64_t host_phys_addr; | |
438 | uint64_t size; | |
439 | ||
440 | host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); | |
441 | size = page_size - (guest_phys_addr & (page_size - 1)); | |
442 | size = RTE_MIN(size, reg_size); | |
443 | ||
444 | add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); | |
445 | host_user_addr += size; | |
446 | guest_phys_addr += size; | |
447 | reg_size -= size; | |
448 | ||
449 | while (reg_size > 0) { | |
450 | host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) | |
451 | host_user_addr); | |
452 | add_one_guest_page(dev, guest_phys_addr, host_phys_addr, | |
453 | page_size); | |
454 | ||
455 | host_user_addr += page_size; | |
456 | guest_phys_addr += page_size; | |
457 | reg_size -= page_size; | |
458 | } | |
459 | } | |
460 | ||
#ifdef RTE_LIBRTE_VHOST_DEBUG
/*
 * Log every entry of dev->guest_pages. Only built when
 * RTE_LIBRTE_VHOST_DEBUG is defined; otherwise the call expands to
 * nothing.
 */
static void
dump_guest_pages(struct virtio_net *dev)
{
	struct guest_page *entry;
	uint32_t idx = 0;

	while (idx < dev->nr_guest_pages) {
		entry = &dev->guest_pages[idx];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size : %" PRIx64 "\n",
			idx,
			entry->guest_phys_addr,
			entry->host_phys_addr,
			entry->size);
		idx++;
	}
}
#else
#define dump_guest_pages(dev)
#endif
486 | ||
487 | static int | |
488 | vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) | |
489 | { | |
490 | struct VhostUserMemory memory = pmsg->payload.memory; | |
491 | struct virtio_memory_region *reg; | |
492 | void *mmap_addr; | |
493 | uint64_t mmap_size; | |
494 | uint64_t mmap_offset; | |
495 | uint64_t alignment; | |
496 | uint32_t i; | |
497 | int fd; | |
498 | ||
499 | /* Remove from the data plane. */ | |
500 | if (dev->flags & VIRTIO_DEV_RUNNING) { | |
501 | dev->flags &= ~VIRTIO_DEV_RUNNING; | |
502 | notify_ops->destroy_device(dev->vid); | |
503 | } | |
504 | ||
505 | if (dev->mem) { | |
506 | free_mem_region(dev); | |
507 | rte_free(dev->mem); | |
508 | dev->mem = NULL; | |
509 | } | |
510 | ||
511 | dev->nr_guest_pages = 0; | |
512 | if (!dev->guest_pages) { | |
513 | dev->max_guest_pages = 8; | |
514 | dev->guest_pages = malloc(dev->max_guest_pages * | |
515 | sizeof(struct guest_page)); | |
516 | } | |
517 | ||
518 | dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) + | |
519 | sizeof(struct virtio_memory_region) * memory.nregions, 0); | |
520 | if (dev->mem == NULL) { | |
521 | RTE_LOG(ERR, VHOST_CONFIG, | |
522 | "(%d) failed to allocate memory for dev->mem\n", | |
523 | dev->vid); | |
524 | return -1; | |
525 | } | |
526 | dev->mem->nregions = memory.nregions; | |
527 | ||
528 | for (i = 0; i < memory.nregions; i++) { | |
529 | fd = pmsg->fds[i]; | |
530 | reg = &dev->mem->regions[i]; | |
531 | ||
532 | reg->guest_phys_addr = memory.regions[i].guest_phys_addr; | |
533 | reg->guest_user_addr = memory.regions[i].userspace_addr; | |
534 | reg->size = memory.regions[i].memory_size; | |
535 | reg->fd = fd; | |
536 | ||
537 | mmap_offset = memory.regions[i].mmap_offset; | |
538 | mmap_size = reg->size + mmap_offset; | |
539 | ||
540 | /* mmap() without flag of MAP_ANONYMOUS, should be called | |
541 | * with length argument aligned with hugepagesz at older | |
542 | * longterm version Linux, like 2.6.32 and 3.2.72, or | |
543 | * mmap() will fail with EINVAL. | |
544 | * | |
545 | * to avoid failure, make sure in caller to keep length | |
546 | * aligned. | |
547 | */ | |
548 | alignment = get_blk_size(fd); | |
549 | if (alignment == (uint64_t)-1) { | |
550 | RTE_LOG(ERR, VHOST_CONFIG, | |
551 | "couldn't get hugepage size through fstat\n"); | |
552 | goto err_mmap; | |
553 | } | |
554 | mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); | |
555 | ||
556 | mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, | |
557 | MAP_SHARED | MAP_POPULATE, fd, 0); | |
558 | ||
559 | if (mmap_addr == MAP_FAILED) { | |
560 | RTE_LOG(ERR, VHOST_CONFIG, | |
561 | "mmap region %u failed.\n", i); | |
562 | goto err_mmap; | |
563 | } | |
564 | ||
565 | reg->mmap_addr = mmap_addr; | |
566 | reg->mmap_size = mmap_size; | |
567 | reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + | |
568 | mmap_offset; | |
569 | ||
570 | add_guest_pages(dev, reg, alignment); | |
571 | ||
572 | RTE_LOG(INFO, VHOST_CONFIG, | |
573 | "guest memory region %u, size: 0x%" PRIx64 "\n" | |
574 | "\t guest physical addr: 0x%" PRIx64 "\n" | |
575 | "\t guest virtual addr: 0x%" PRIx64 "\n" | |
576 | "\t host virtual addr: 0x%" PRIx64 "\n" | |
577 | "\t mmap addr : 0x%" PRIx64 "\n" | |
578 | "\t mmap size : 0x%" PRIx64 "\n" | |
579 | "\t mmap align: 0x%" PRIx64 "\n" | |
580 | "\t mmap off : 0x%" PRIx64 "\n", | |
581 | i, reg->size, | |
582 | reg->guest_phys_addr, | |
583 | reg->guest_user_addr, | |
584 | reg->host_user_addr, | |
585 | (uint64_t)(uintptr_t)mmap_addr, | |
586 | mmap_size, | |
587 | alignment, | |
588 | mmap_offset); | |
589 | } | |
590 | ||
591 | dump_guest_pages(dev); | |
592 | ||
593 | return 0; | |
594 | ||
595 | err_mmap: | |
596 | free_mem_region(dev); | |
597 | rte_free(dev->mem); | |
598 | dev->mem = NULL; | |
599 | return -1; | |
600 | } | |
601 | ||
602 | static int | |
603 | vq_is_ready(struct vhost_virtqueue *vq) | |
604 | { | |
605 | return vq && vq->desc && | |
606 | vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && | |
607 | vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; | |
608 | } | |
609 | ||
610 | static int | |
611 | virtio_is_ready(struct virtio_net *dev) | |
612 | { | |
613 | struct vhost_virtqueue *rvq, *tvq; | |
614 | uint32_t i; | |
615 | ||
616 | for (i = 0; i < dev->virt_qp_nb; i++) { | |
617 | rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; | |
618 | tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; | |
619 | ||
620 | if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) { | |
621 | RTE_LOG(INFO, VHOST_CONFIG, | |
622 | "virtio is not ready for processing.\n"); | |
623 | return 0; | |
624 | } | |
625 | } | |
626 | ||
627 | RTE_LOG(INFO, VHOST_CONFIG, | |
628 | "virtio is now ready for processing.\n"); | |
629 | return 1; | |
630 | } | |
631 | ||
632 | static void | |
633 | vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) | |
634 | { | |
635 | struct vhost_vring_file file; | |
636 | struct vhost_virtqueue *vq; | |
637 | uint32_t cur_qp_idx; | |
638 | ||
639 | file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; | |
640 | if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) | |
641 | file.fd = VIRTIO_INVALID_EVENTFD; | |
642 | else | |
643 | file.fd = pmsg->fds[0]; | |
644 | RTE_LOG(INFO, VHOST_CONFIG, | |
645 | "vring call idx:%d file:%d\n", file.index, file.fd); | |
646 | ||
647 | /* | |
648 | * FIXME: VHOST_SET_VRING_CALL is the first per-vring message | |
649 | * we get, so we do vring queue pair allocation here. | |
650 | */ | |
651 | cur_qp_idx = file.index / VIRTIO_QNUM; | |
652 | if (cur_qp_idx + 1 > dev->virt_qp_nb) { | |
653 | if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) | |
654 | return; | |
655 | } | |
656 | ||
657 | vq = dev->virtqueue[file.index]; | |
658 | assert(vq != NULL); | |
659 | ||
660 | if (vq->callfd >= 0) | |
661 | close(vq->callfd); | |
662 | ||
663 | vq->callfd = file.fd; | |
664 | } | |
665 | ||
666 | /* | |
667 | * In vhost-user, when we receive kick message, will test whether virtio | |
668 | * device is ready for packet processing. | |
669 | */ | |
670 | static void | |
671 | vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) | |
672 | { | |
673 | struct vhost_vring_file file; | |
674 | struct vhost_virtqueue *vq; | |
675 | ||
676 | file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; | |
677 | if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) | |
678 | file.fd = VIRTIO_INVALID_EVENTFD; | |
679 | else | |
680 | file.fd = pmsg->fds[0]; | |
681 | RTE_LOG(INFO, VHOST_CONFIG, | |
682 | "vring kick idx:%d file:%d\n", file.index, file.fd); | |
683 | ||
684 | vq = dev->virtqueue[file.index]; | |
685 | if (vq->kickfd >= 0) | |
686 | close(vq->kickfd); | |
687 | vq->kickfd = file.fd; | |
688 | ||
689 | if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { | |
690 | if (dev->dequeue_zero_copy) { | |
691 | RTE_LOG(INFO, VHOST_CONFIG, | |
692 | "dequeue zero copy is enabled\n"); | |
693 | } | |
694 | ||
695 | if (notify_ops->new_device(dev->vid) == 0) | |
696 | dev->flags |= VIRTIO_DEV_RUNNING; | |
697 | } | |
698 | } | |
699 | ||
700 | static void | |
701 | free_zmbufs(struct vhost_virtqueue *vq) | |
702 | { | |
703 | struct zcopy_mbuf *zmbuf, *next; | |
704 | ||
705 | for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); | |
706 | zmbuf != NULL; zmbuf = next) { | |
707 | next = TAILQ_NEXT(zmbuf, next); | |
708 | ||
709 | rte_pktmbuf_free(zmbuf->mbuf); | |
710 | TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); | |
711 | } | |
712 | ||
713 | rte_free(vq->zmbufs); | |
714 | } | |
715 | ||
716 | /* | |
717 | * when virtio is stopped, qemu will send us the GET_VRING_BASE message. | |
718 | */ | |
719 | static int | |
720 | vhost_user_get_vring_base(struct virtio_net *dev, | |
721 | struct vhost_vring_state *state) | |
722 | { | |
723 | struct vhost_virtqueue *vq = dev->virtqueue[state->index]; | |
724 | ||
725 | /* We have to stop the queue (virtio) if it is running. */ | |
726 | if (dev->flags & VIRTIO_DEV_RUNNING) { | |
727 | dev->flags &= ~VIRTIO_DEV_RUNNING; | |
728 | notify_ops->destroy_device(dev->vid); | |
729 | } | |
730 | ||
731 | /* Here we are safe to get the last used index */ | |
732 | state->num = vq->last_used_idx; | |
733 | ||
734 | RTE_LOG(INFO, VHOST_CONFIG, | |
735 | "vring base idx:%d file:%d\n", state->index, state->num); | |
736 | /* | |
737 | * Based on current qemu vhost-user implementation, this message is | |
738 | * sent and only sent in vhost_vring_stop. | |
739 | * TODO: cleanup the vring, it isn't usable since here. | |
740 | */ | |
741 | if (vq->kickfd >= 0) | |
742 | close(vq->kickfd); | |
743 | ||
744 | vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; | |
745 | ||
746 | if (dev->dequeue_zero_copy) | |
747 | free_zmbufs(vq); | |
748 | rte_free(vq->shadow_used_ring); | |
749 | vq->shadow_used_ring = NULL; | |
750 | ||
751 | return 0; | |
752 | } | |
753 | ||
754 | /* | |
755 | * when virtio queues are ready to work, qemu will send us to | |
756 | * enable the virtio queue pair. | |
757 | */ | |
758 | static int | |
759 | vhost_user_set_vring_enable(struct virtio_net *dev, | |
760 | struct vhost_vring_state *state) | |
761 | { | |
762 | int enable = (int)state->num; | |
763 | ||
764 | RTE_LOG(INFO, VHOST_CONFIG, | |
765 | "set queue enable: %d to qp idx: %d\n", | |
766 | enable, state->index); | |
767 | ||
768 | if (notify_ops->vring_state_changed) | |
769 | notify_ops->vring_state_changed(dev->vid, state->index, enable); | |
770 | ||
771 | dev->virtqueue[state->index]->enabled = enable; | |
772 | ||
773 | return 0; | |
774 | } | |
775 | ||
776 | static void | |
777 | vhost_user_set_protocol_features(struct virtio_net *dev, | |
778 | uint64_t protocol_features) | |
779 | { | |
780 | if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) | |
781 | return; | |
782 | ||
783 | dev->protocol_features = protocol_features; | |
784 | } | |
785 | ||
786 | static int | |
787 | vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) | |
788 | { | |
789 | int fd = msg->fds[0]; | |
790 | uint64_t size, off; | |
791 | void *addr; | |
792 | ||
793 | if (fd < 0) { | |
794 | RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); | |
795 | return -1; | |
796 | } | |
797 | ||
798 | if (msg->size != sizeof(VhostUserLog)) { | |
799 | RTE_LOG(ERR, VHOST_CONFIG, | |
800 | "invalid log base msg size: %"PRId32" != %d\n", | |
801 | msg->size, (int)sizeof(VhostUserLog)); | |
802 | return -1; | |
803 | } | |
804 | ||
805 | size = msg->payload.log.mmap_size; | |
806 | off = msg->payload.log.mmap_offset; | |
807 | RTE_LOG(INFO, VHOST_CONFIG, | |
808 | "log mmap size: %"PRId64", offset: %"PRId64"\n", | |
809 | size, off); | |
810 | ||
811 | /* | |
812 | * mmap from 0 to workaround a hugepage mmap bug: mmap will | |
813 | * fail when offset is not page size aligned. | |
814 | */ | |
815 | addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | |
816 | close(fd); | |
817 | if (addr == MAP_FAILED) { | |
818 | RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); | |
819 | return -1; | |
820 | } | |
821 | ||
822 | /* | |
823 | * Free previously mapped log memory on occasionally | |
824 | * multiple VHOST_USER_SET_LOG_BASE. | |
825 | */ | |
826 | if (dev->log_addr) { | |
827 | munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); | |
828 | } | |
829 | dev->log_addr = (uint64_t)(uintptr_t)addr; | |
830 | dev->log_base = dev->log_addr + off; | |
831 | dev->log_size = size; | |
832 | ||
833 | return 0; | |
834 | } | |
835 | ||
836 | /* | |
837 | * An rarp packet is constructed and broadcasted to notify switches about | |
838 | * the new location of the migrated VM, so that packets from outside will | |
839 | * not be lost after migration. | |
840 | * | |
841 | * However, we don't actually "send" a rarp packet here, instead, we set | |
842 | * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. | |
843 | */ | |
844 | static int | |
845 | vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) | |
846 | { | |
847 | uint8_t *mac = (uint8_t *)&msg->payload.u64; | |
848 | ||
849 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
850 | ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", | |
851 | mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); | |
852 | memcpy(dev->mac.addr_bytes, mac, 6); | |
853 | ||
854 | /* | |
855 | * Set the flag to inject a RARP broadcast packet at | |
856 | * rte_vhost_dequeue_burst(). | |
857 | * | |
858 | * rte_smp_wmb() is for making sure the mac is copied | |
859 | * before the flag is set. | |
860 | */ | |
861 | rte_smp_wmb(); | |
862 | rte_atomic16_set(&dev->broadcast_rarp, 1); | |
863 | ||
864 | return 0; | |
865 | } | |
866 | ||
867 | /* return bytes# of read on success or negative val on failure. */ | |
868 | static int | |
869 | read_vhost_message(int sockfd, struct VhostUserMsg *msg) | |
870 | { | |
871 | int ret; | |
872 | ||
873 | ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, | |
874 | msg->fds, VHOST_MEMORY_MAX_NREGIONS); | |
875 | if (ret <= 0) | |
876 | return ret; | |
877 | ||
878 | if (msg && msg->size) { | |
879 | if (msg->size > sizeof(msg->payload)) { | |
880 | RTE_LOG(ERR, VHOST_CONFIG, | |
881 | "invalid msg size: %d\n", msg->size); | |
882 | return -1; | |
883 | } | |
884 | ret = read(sockfd, &msg->payload, msg->size); | |
885 | if (ret <= 0) | |
886 | return ret; | |
887 | if (ret != (int)msg->size) { | |
888 | RTE_LOG(ERR, VHOST_CONFIG, | |
889 | "read control message failed\n"); | |
890 | return -1; | |
891 | } | |
892 | } | |
893 | ||
894 | return ret; | |
895 | } | |
896 | ||
897 | static int | |
898 | send_vhost_message(int sockfd, struct VhostUserMsg *msg) | |
899 | { | |
900 | int ret; | |
901 | ||
902 | if (!msg) | |
903 | return 0; | |
904 | ||
905 | msg->flags &= ~VHOST_USER_VERSION_MASK; | |
906 | msg->flags |= VHOST_USER_VERSION; | |
907 | msg->flags |= VHOST_USER_REPLY_MASK; | |
908 | ||
909 | ret = send_fd_message(sockfd, (char *)msg, | |
910 | VHOST_USER_HDR_SIZE + msg->size, NULL, 0); | |
911 | ||
912 | return ret; | |
913 | } | |
914 | ||
915 | int | |
916 | vhost_user_msg_handler(int vid, int fd) | |
917 | { | |
918 | struct virtio_net *dev; | |
919 | struct VhostUserMsg msg; | |
920 | int ret; | |
921 | ||
922 | dev = get_device(vid); | |
923 | if (dev == NULL) | |
924 | return -1; | |
925 | ||
926 | ret = read_vhost_message(fd, &msg); | |
927 | if (ret <= 0 || msg.request >= VHOST_USER_MAX) { | |
928 | if (ret < 0) | |
929 | RTE_LOG(ERR, VHOST_CONFIG, | |
930 | "vhost read message failed\n"); | |
931 | else if (ret == 0) | |
932 | RTE_LOG(INFO, VHOST_CONFIG, | |
933 | "vhost peer closed\n"); | |
934 | else | |
935 | RTE_LOG(ERR, VHOST_CONFIG, | |
936 | "vhost read incorrect message\n"); | |
937 | ||
938 | return -1; | |
939 | } | |
940 | ||
941 | RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", | |
942 | vhost_message_str[msg.request]); | |
943 | switch (msg.request) { | |
944 | case VHOST_USER_GET_FEATURES: | |
945 | msg.payload.u64 = vhost_user_get_features(); | |
946 | msg.size = sizeof(msg.payload.u64); | |
947 | send_vhost_message(fd, &msg); | |
948 | break; | |
949 | case VHOST_USER_SET_FEATURES: | |
950 | vhost_user_set_features(dev, msg.payload.u64); | |
951 | break; | |
952 | ||
953 | case VHOST_USER_GET_PROTOCOL_FEATURES: | |
954 | msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; | |
955 | msg.size = sizeof(msg.payload.u64); | |
956 | send_vhost_message(fd, &msg); | |
957 | break; | |
958 | case VHOST_USER_SET_PROTOCOL_FEATURES: | |
959 | vhost_user_set_protocol_features(dev, msg.payload.u64); | |
960 | break; | |
961 | ||
962 | case VHOST_USER_SET_OWNER: | |
963 | vhost_user_set_owner(); | |
964 | break; | |
965 | case VHOST_USER_RESET_OWNER: | |
966 | vhost_user_reset_owner(dev); | |
967 | break; | |
968 | ||
969 | case VHOST_USER_SET_MEM_TABLE: | |
970 | vhost_user_set_mem_table(dev, &msg); | |
971 | break; | |
972 | ||
973 | case VHOST_USER_SET_LOG_BASE: | |
974 | vhost_user_set_log_base(dev, &msg); | |
975 | ||
976 | /* it needs a reply */ | |
977 | msg.size = sizeof(msg.payload.u64); | |
978 | send_vhost_message(fd, &msg); | |
979 | break; | |
980 | case VHOST_USER_SET_LOG_FD: | |
981 | close(msg.fds[0]); | |
982 | RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); | |
983 | break; | |
984 | ||
985 | case VHOST_USER_SET_VRING_NUM: | |
986 | vhost_user_set_vring_num(dev, &msg.payload.state); | |
987 | break; | |
988 | case VHOST_USER_SET_VRING_ADDR: | |
989 | vhost_user_set_vring_addr(dev, &msg.payload.addr); | |
990 | break; | |
991 | case VHOST_USER_SET_VRING_BASE: | |
992 | vhost_user_set_vring_base(dev, &msg.payload.state); | |
993 | break; | |
994 | ||
995 | case VHOST_USER_GET_VRING_BASE: | |
996 | ret = vhost_user_get_vring_base(dev, &msg.payload.state); | |
997 | msg.size = sizeof(msg.payload.state); | |
998 | send_vhost_message(fd, &msg); | |
999 | break; | |
1000 | ||
1001 | case VHOST_USER_SET_VRING_KICK: | |
1002 | vhost_user_set_vring_kick(dev, &msg); | |
1003 | break; | |
1004 | case VHOST_USER_SET_VRING_CALL: | |
1005 | vhost_user_set_vring_call(dev, &msg); | |
1006 | break; | |
1007 | ||
1008 | case VHOST_USER_SET_VRING_ERR: | |
1009 | if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) | |
1010 | close(msg.fds[0]); | |
1011 | RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); | |
1012 | break; | |
1013 | ||
1014 | case VHOST_USER_GET_QUEUE_NUM: | |
1015 | msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; | |
1016 | msg.size = sizeof(msg.payload.u64); | |
1017 | send_vhost_message(fd, &msg); | |
1018 | break; | |
1019 | ||
1020 | case VHOST_USER_SET_VRING_ENABLE: | |
1021 | vhost_user_set_vring_enable(dev, &msg.payload.state); | |
1022 | break; | |
1023 | case VHOST_USER_SEND_RARP: | |
1024 | vhost_user_send_rarp(dev, &msg); | |
1025 | break; | |
1026 | ||
1027 | default: | |
1028 | break; | |
1029 | ||
1030 | } | |
1031 | ||
1032 | return 0; | |
1033 | } |