]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause |
2 | * Copyright(c) 2010-2018 Intel Corporation | |
3 | */ | |
4 | ||
5 | /* Security model | |
6 | * -------------- | |
7 | * The vhost-user protocol connection is an external interface, so it must be | |
8 | * robust against invalid inputs. | |
9 | * | |
10 | * This is important because the vhost-user master is only one step removed | |
11 | * from the guest. Malicious guests that have escaped will then launch further | |
12 | * attacks from the vhost-user master. | |
13 | * | |
14 | * Even in deployments where guests are trusted, a bug in the vhost-user master | |
15 | * can still cause invalid messages to be sent. Such messages must not | |
16 | * compromise the stability of the DPDK application by causing crashes, memory | |
17 | * corruption, or other problematic behavior. | |
18 | * | |
19 | * Do not assume received VhostUserMsg fields contain sensible values! | |
20 | */ | |
21 | ||
22 | #include <stdint.h> | |
23 | #include <stdio.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include <unistd.h> | |
9f95a23c TL |
27 | #include <fcntl.h> |
28 | #include <sys/ioctl.h> | |
11fdf7f2 TL |
29 | #include <sys/mman.h> |
30 | #include <sys/types.h> | |
31 | #include <sys/stat.h> | |
9f95a23c | 32 | #include <sys/syscall.h> |
11fdf7f2 TL |
33 | #include <assert.h> |
34 | #ifdef RTE_LIBRTE_VHOST_NUMA | |
35 | #include <numaif.h> | |
36 | #endif | |
9f95a23c TL |
37 | #ifdef RTE_LIBRTE_VHOST_POSTCOPY |
38 | #include <linux/userfaultfd.h> | |
39 | #endif | |
11fdf7f2 TL |
40 | |
41 | #include <rte_common.h> | |
42 | #include <rte_malloc.h> | |
43 | #include <rte_log.h> | |
44 | ||
45 | #include "iotlb.h" | |
46 | #include "vhost.h" | |
47 | #include "vhost_user.h" | |
48 | ||
49 | #define VIRTIO_MIN_MTU 68 | |
50 | #define VIRTIO_MAX_MTU 65535 | |
51 | ||
52 | static const char *vhost_message_str[VHOST_USER_MAX] = { | |
53 | [VHOST_USER_NONE] = "VHOST_USER_NONE", | |
54 | [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", | |
55 | [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", | |
56 | [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", | |
57 | [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", | |
58 | [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", | |
59 | [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", | |
60 | [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", | |
61 | [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", | |
62 | [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", | |
63 | [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", | |
64 | [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", | |
65 | [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", | |
66 | [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", | |
67 | [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", | |
68 | [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", | |
69 | [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", | |
70 | [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", | |
71 | [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", | |
72 | [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", | |
73 | [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", | |
74 | [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", | |
75 | [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", | |
76 | [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", | |
77 | [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", | |
9f95a23c TL |
78 | [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE", |
79 | [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN", | |
80 | [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END", | |
11fdf7f2 TL |
81 | }; |
82 | ||
9f95a23c TL |
83 | static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg); |
84 | static int read_vhost_message(int sockfd, struct VhostUserMsg *msg); | |
85 | ||
11fdf7f2 TL |
/*
 * Report the st_blksize value fstat() returns for the file behind fd.
 *
 * Returns (uint64_t)-1 when fstat() fails.
 */
static uint64_t
get_blk_size(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) == -1)
		return (uint64_t)-1;

	return (uint64_t)sb.st_blksize;
}
95 | ||
9f95a23c TL |
96 | /* |
97 | * Reclaim all the outstanding zmbufs for a virtqueue. | |
98 | */ | |
99 | static void | |
100 | drain_zmbuf_list(struct vhost_virtqueue *vq) | |
101 | { | |
102 | struct zcopy_mbuf *zmbuf, *next; | |
103 | ||
104 | for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); | |
105 | zmbuf != NULL; zmbuf = next) { | |
106 | next = TAILQ_NEXT(zmbuf, next); | |
107 | ||
108 | while (!mbuf_is_consumed(zmbuf->mbuf)) | |
109 | usleep(1000); | |
110 | ||
111 | TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); | |
112 | restore_mbuf(zmbuf->mbuf); | |
113 | rte_pktmbuf_free(zmbuf->mbuf); | |
114 | put_zmbuf(zmbuf); | |
115 | vq->nr_zmbuf -= 1; | |
116 | } | |
117 | } | |
118 | ||
11fdf7f2 TL |
119 | static void |
120 | free_mem_region(struct virtio_net *dev) | |
121 | { | |
122 | uint32_t i; | |
123 | struct rte_vhost_mem_region *reg; | |
9f95a23c | 124 | struct vhost_virtqueue *vq; |
11fdf7f2 TL |
125 | |
126 | if (!dev || !dev->mem) | |
127 | return; | |
128 | ||
9f95a23c TL |
129 | if (dev->dequeue_zero_copy) { |
130 | for (i = 0; i < dev->nr_vring; i++) { | |
131 | vq = dev->virtqueue[i]; | |
132 | if (vq) | |
133 | drain_zmbuf_list(vq); | |
134 | } | |
135 | } | |
136 | ||
11fdf7f2 TL |
137 | for (i = 0; i < dev->mem->nregions; i++) { |
138 | reg = &dev->mem->regions[i]; | |
139 | if (reg->host_user_addr) { | |
140 | munmap(reg->mmap_addr, reg->mmap_size); | |
141 | close(reg->fd); | |
142 | } | |
143 | } | |
144 | } | |
145 | ||
146 | void | |
147 | vhost_backend_cleanup(struct virtio_net *dev) | |
148 | { | |
149 | if (dev->mem) { | |
150 | free_mem_region(dev); | |
151 | rte_free(dev->mem); | |
152 | dev->mem = NULL; | |
153 | } | |
154 | ||
155 | free(dev->guest_pages); | |
156 | dev->guest_pages = NULL; | |
157 | ||
158 | if (dev->log_addr) { | |
159 | munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); | |
160 | dev->log_addr = 0; | |
161 | } | |
162 | ||
163 | if (dev->slave_req_fd >= 0) { | |
164 | close(dev->slave_req_fd); | |
165 | dev->slave_req_fd = -1; | |
166 | } | |
9f95a23c TL |
167 | |
168 | if (dev->postcopy_ufd >= 0) { | |
169 | close(dev->postcopy_ufd); | |
170 | dev->postcopy_ufd = -1; | |
171 | } | |
172 | ||
173 | dev->postcopy_listening = 0; | |
11fdf7f2 TL |
174 | } |
175 | ||
176 | /* | |
177 | * This function just returns success at the moment unless | |
178 | * the device hasn't been initialised. | |
179 | */ | |
180 | static int | |
9f95a23c TL |
181 | vhost_user_set_owner(struct virtio_net **pdev __rte_unused, |
182 | struct VhostUserMsg *msg __rte_unused, | |
183 | int main_fd __rte_unused) | |
11fdf7f2 | 184 | { |
9f95a23c | 185 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
186 | } |
187 | ||
188 | static int | |
9f95a23c TL |
189 | vhost_user_reset_owner(struct virtio_net **pdev, |
190 | struct VhostUserMsg *msg __rte_unused, | |
191 | int main_fd __rte_unused) | |
11fdf7f2 | 192 | { |
9f95a23c | 193 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
194 | vhost_destroy_device_notify(dev); |
195 | ||
196 | cleanup_device(dev, 0); | |
197 | reset_device(dev); | |
9f95a23c | 198 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
199 | } |
200 | ||
201 | /* | |
202 | * The features that we support are requested. | |
203 | */ | |
9f95a23c TL |
204 | static int |
205 | vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg, | |
206 | int main_fd __rte_unused) | |
11fdf7f2 | 207 | { |
9f95a23c | 208 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
209 | uint64_t features = 0; |
210 | ||
211 | rte_vhost_driver_get_features(dev->ifname, &features); | |
9f95a23c TL |
212 | |
213 | msg->payload.u64 = features; | |
214 | msg->size = sizeof(msg->payload.u64); | |
215 | msg->fd_num = 0; | |
216 | ||
217 | return RTE_VHOST_MSG_RESULT_REPLY; | |
11fdf7f2 TL |
218 | } |
219 | ||
220 | /* | |
221 | * The queue number that we support are requested. | |
222 | */ | |
9f95a23c TL |
223 | static int |
224 | vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg, | |
225 | int main_fd __rte_unused) | |
11fdf7f2 | 226 | { |
9f95a23c | 227 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
228 | uint32_t queue_num = 0; |
229 | ||
230 | rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); | |
9f95a23c TL |
231 | |
232 | msg->payload.u64 = (uint64_t)queue_num; | |
233 | msg->size = sizeof(msg->payload.u64); | |
234 | msg->fd_num = 0; | |
235 | ||
236 | return RTE_VHOST_MSG_RESULT_REPLY; | |
11fdf7f2 TL |
237 | } |
238 | ||
239 | /* | |
240 | * We receive the negotiated features supported by us and the virtio device. | |
241 | */ | |
242 | static int | |
9f95a23c TL |
243 | vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg, |
244 | int main_fd __rte_unused) | |
11fdf7f2 | 245 | { |
9f95a23c TL |
246 | struct virtio_net *dev = *pdev; |
247 | uint64_t features = msg->payload.u64; | |
11fdf7f2 TL |
248 | uint64_t vhost_features = 0; |
249 | struct rte_vdpa_device *vdpa_dev; | |
250 | int did = -1; | |
251 | ||
252 | rte_vhost_driver_get_features(dev->ifname, &vhost_features); | |
253 | if (features & ~vhost_features) { | |
254 | RTE_LOG(ERR, VHOST_CONFIG, | |
255 | "(%d) received invalid negotiated features.\n", | |
256 | dev->vid); | |
9f95a23c | 257 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
258 | } |
259 | ||
260 | if (dev->flags & VIRTIO_DEV_RUNNING) { | |
261 | if (dev->features == features) | |
9f95a23c | 262 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
263 | |
264 | /* | |
265 | * Error out if master tries to change features while device is | |
266 | * in running state. The exception being VHOST_F_LOG_ALL, which | |
267 | * is enabled when the live-migration starts. | |
268 | */ | |
269 | if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) { | |
270 | RTE_LOG(ERR, VHOST_CONFIG, | |
271 | "(%d) features changed while device is running.\n", | |
272 | dev->vid); | |
9f95a23c | 273 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
274 | } |
275 | ||
276 | if (dev->notify_ops->features_changed) | |
277 | dev->notify_ops->features_changed(dev->vid, features); | |
278 | } | |
279 | ||
280 | dev->features = features; | |
281 | if (dev->features & | |
282 | ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { | |
283 | dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); | |
284 | } else { | |
285 | dev->vhost_hlen = sizeof(struct virtio_net_hdr); | |
286 | } | |
287 | VHOST_LOG_DEBUG(VHOST_CONFIG, | |
288 | "(%d) mergeable RX buffers %s, virtio 1 %s\n", | |
289 | dev->vid, | |
290 | (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", | |
291 | (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); | |
292 | ||
293 | if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) && | |
294 | !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) { | |
295 | /* | |
296 | * Remove all but first queue pair if MQ hasn't been | |
297 | * negotiated. This is safe because the device is not | |
298 | * running at this stage. | |
299 | */ | |
300 | while (dev->nr_vring > 2) { | |
301 | struct vhost_virtqueue *vq; | |
302 | ||
303 | vq = dev->virtqueue[--dev->nr_vring]; | |
304 | if (!vq) | |
305 | continue; | |
306 | ||
307 | dev->virtqueue[dev->nr_vring] = NULL; | |
308 | cleanup_vq(vq, 1); | |
309 | free_vq(dev, vq); | |
310 | } | |
311 | } | |
312 | ||
313 | did = dev->vdpa_dev_id; | |
314 | vdpa_dev = rte_vdpa_get_device(did); | |
315 | if (vdpa_dev && vdpa_dev->ops->set_features) | |
316 | vdpa_dev->ops->set_features(dev->vid); | |
317 | ||
9f95a23c | 318 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
319 | } |
320 | ||
321 | /* | |
322 | * The virtio device sends us the size of the descriptor ring. | |
323 | */ | |
324 | static int | |
9f95a23c TL |
325 | vhost_user_set_vring_num(struct virtio_net **pdev, |
326 | struct VhostUserMsg *msg, | |
327 | int main_fd __rte_unused) | |
11fdf7f2 | 328 | { |
9f95a23c | 329 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
330 | struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; |
331 | ||
332 | vq->size = msg->payload.state.num; | |
333 | ||
334 | /* VIRTIO 1.0, 2.4 Virtqueues says: | |
335 | * | |
336 | * Queue Size value is always a power of 2. The maximum Queue Size | |
337 | * value is 32768. | |
338 | */ | |
339 | if ((vq->size & (vq->size - 1)) || vq->size > 32768) { | |
340 | RTE_LOG(ERR, VHOST_CONFIG, | |
341 | "invalid virtqueue size %u\n", vq->size); | |
9f95a23c | 342 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
343 | } |
344 | ||
345 | if (dev->dequeue_zero_copy) { | |
346 | vq->nr_zmbuf = 0; | |
347 | vq->last_zmbuf_idx = 0; | |
348 | vq->zmbuf_size = vq->size; | |
349 | vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * | |
350 | sizeof(struct zcopy_mbuf), 0); | |
351 | if (vq->zmbufs == NULL) { | |
352 | RTE_LOG(WARNING, VHOST_CONFIG, | |
353 | "failed to allocate mem for zero copy; " | |
354 | "zero copy is force disabled\n"); | |
355 | dev->dequeue_zero_copy = 0; | |
356 | } | |
357 | TAILQ_INIT(&vq->zmbuf_list); | |
358 | } | |
359 | ||
360 | if (vq_is_packed(dev)) { | |
361 | vq->shadow_used_packed = rte_malloc(NULL, | |
362 | vq->size * | |
363 | sizeof(struct vring_used_elem_packed), | |
364 | RTE_CACHE_LINE_SIZE); | |
365 | if (!vq->shadow_used_packed) { | |
366 | RTE_LOG(ERR, VHOST_CONFIG, | |
367 | "failed to allocate memory for shadow used ring.\n"); | |
9f95a23c | 368 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
369 | } |
370 | ||
371 | } else { | |
372 | vq->shadow_used_split = rte_malloc(NULL, | |
373 | vq->size * sizeof(struct vring_used_elem), | |
374 | RTE_CACHE_LINE_SIZE); | |
375 | if (!vq->shadow_used_split) { | |
376 | RTE_LOG(ERR, VHOST_CONFIG, | |
377 | "failed to allocate memory for shadow used ring.\n"); | |
9f95a23c | 378 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
379 | } |
380 | } | |
381 | ||
382 | vq->batch_copy_elems = rte_malloc(NULL, | |
383 | vq->size * sizeof(struct batch_copy_elem), | |
384 | RTE_CACHE_LINE_SIZE); | |
385 | if (!vq->batch_copy_elems) { | |
386 | RTE_LOG(ERR, VHOST_CONFIG, | |
387 | "failed to allocate memory for batching copy.\n"); | |
9f95a23c | 388 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
389 | } |
390 | ||
9f95a23c | 391 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
392 | } |
393 | ||
394 | /* | |
395 | * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the | |
396 | * same numa node as the memory of vring descriptor. | |
397 | */ | |
398 | #ifdef RTE_LIBRTE_VHOST_NUMA | |
399 | static struct virtio_net* | |
400 | numa_realloc(struct virtio_net *dev, int index) | |
401 | { | |
402 | int oldnode, newnode; | |
403 | struct virtio_net *old_dev; | |
404 | struct vhost_virtqueue *old_vq, *vq; | |
405 | struct zcopy_mbuf *new_zmbuf; | |
406 | struct vring_used_elem *new_shadow_used_split; | |
407 | struct vring_used_elem_packed *new_shadow_used_packed; | |
408 | struct batch_copy_elem *new_batch_copy_elems; | |
409 | int ret; | |
410 | ||
411 | old_dev = dev; | |
412 | vq = old_vq = dev->virtqueue[index]; | |
413 | ||
414 | ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, | |
415 | MPOL_F_NODE | MPOL_F_ADDR); | |
416 | ||
417 | /* check if we need to reallocate vq */ | |
418 | ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, | |
419 | MPOL_F_NODE | MPOL_F_ADDR); | |
420 | if (ret) { | |
421 | RTE_LOG(ERR, VHOST_CONFIG, | |
422 | "Unable to get vq numa information.\n"); | |
423 | return dev; | |
424 | } | |
425 | if (oldnode != newnode) { | |
426 | RTE_LOG(INFO, VHOST_CONFIG, | |
427 | "reallocate vq from %d to %d node\n", oldnode, newnode); | |
428 | vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); | |
429 | if (!vq) | |
430 | return dev; | |
431 | ||
432 | memcpy(vq, old_vq, sizeof(*vq)); | |
433 | TAILQ_INIT(&vq->zmbuf_list); | |
434 | ||
9f95a23c TL |
435 | if (dev->dequeue_zero_copy) { |
436 | new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size * | |
437 | sizeof(struct zcopy_mbuf), 0, newnode); | |
438 | if (new_zmbuf) { | |
439 | rte_free(vq->zmbufs); | |
440 | vq->zmbufs = new_zmbuf; | |
441 | } | |
11fdf7f2 TL |
442 | } |
443 | ||
444 | if (vq_is_packed(dev)) { | |
445 | new_shadow_used_packed = rte_malloc_socket(NULL, | |
446 | vq->size * | |
447 | sizeof(struct vring_used_elem_packed), | |
448 | RTE_CACHE_LINE_SIZE, | |
449 | newnode); | |
450 | if (new_shadow_used_packed) { | |
451 | rte_free(vq->shadow_used_packed); | |
452 | vq->shadow_used_packed = new_shadow_used_packed; | |
453 | } | |
454 | } else { | |
455 | new_shadow_used_split = rte_malloc_socket(NULL, | |
456 | vq->size * | |
457 | sizeof(struct vring_used_elem), | |
458 | RTE_CACHE_LINE_SIZE, | |
459 | newnode); | |
460 | if (new_shadow_used_split) { | |
461 | rte_free(vq->shadow_used_split); | |
462 | vq->shadow_used_split = new_shadow_used_split; | |
463 | } | |
464 | } | |
465 | ||
466 | new_batch_copy_elems = rte_malloc_socket(NULL, | |
467 | vq->size * sizeof(struct batch_copy_elem), | |
468 | RTE_CACHE_LINE_SIZE, | |
469 | newnode); | |
470 | if (new_batch_copy_elems) { | |
471 | rte_free(vq->batch_copy_elems); | |
472 | vq->batch_copy_elems = new_batch_copy_elems; | |
473 | } | |
474 | ||
475 | rte_free(old_vq); | |
476 | } | |
477 | ||
478 | /* check if we need to reallocate dev */ | |
479 | ret = get_mempolicy(&oldnode, NULL, 0, old_dev, | |
480 | MPOL_F_NODE | MPOL_F_ADDR); | |
481 | if (ret) { | |
482 | RTE_LOG(ERR, VHOST_CONFIG, | |
483 | "Unable to get dev numa information.\n"); | |
484 | goto out; | |
485 | } | |
486 | if (oldnode != newnode) { | |
487 | RTE_LOG(INFO, VHOST_CONFIG, | |
488 | "reallocate dev from %d to %d node\n", | |
489 | oldnode, newnode); | |
490 | dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); | |
491 | if (!dev) { | |
492 | dev = old_dev; | |
493 | goto out; | |
494 | } | |
495 | ||
496 | memcpy(dev, old_dev, sizeof(*dev)); | |
497 | rte_free(old_dev); | |
498 | } | |
499 | ||
500 | out: | |
501 | dev->virtqueue[index] = vq; | |
502 | vhost_devices[dev->vid] = dev; | |
503 | ||
504 | if (old_vq != vq) | |
505 | vhost_user_iotlb_init(dev, index); | |
506 | ||
507 | return dev; | |
508 | } | |
509 | #else | |
510 | static struct virtio_net* | |
511 | numa_realloc(struct virtio_net *dev, int index __rte_unused) | |
512 | { | |
513 | return dev; | |
514 | } | |
515 | #endif | |
516 | ||
517 | /* Converts QEMU virtual address to Vhost virtual address. */ | |
518 | static uint64_t | |
519 | qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) | |
520 | { | |
521 | struct rte_vhost_mem_region *r; | |
522 | uint32_t i; | |
523 | ||
9f95a23c TL |
524 | if (unlikely(!dev || !dev->mem)) |
525 | goto out_error; | |
526 | ||
11fdf7f2 TL |
527 | /* Find the region where the address lives. */ |
528 | for (i = 0; i < dev->mem->nregions; i++) { | |
529 | r = &dev->mem->regions[i]; | |
530 | ||
531 | if (qva >= r->guest_user_addr && | |
532 | qva < r->guest_user_addr + r->size) { | |
533 | ||
534 | if (unlikely(*len > r->guest_user_addr + r->size - qva)) | |
535 | *len = r->guest_user_addr + r->size - qva; | |
536 | ||
537 | return qva - r->guest_user_addr + | |
538 | r->host_user_addr; | |
539 | } | |
540 | } | |
9f95a23c | 541 | out_error: |
11fdf7f2 TL |
542 | *len = 0; |
543 | ||
544 | return 0; | |
545 | } | |
546 | ||
547 | ||
548 | /* | |
549 | * Converts ring address to Vhost virtual address. | |
550 | * If IOMMU is enabled, the ring address is a guest IO virtual address, | |
551 | * else it is a QEMU virtual address. | |
552 | */ | |
553 | static uint64_t | |
554 | ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, | |
555 | uint64_t ra, uint64_t *size) | |
556 | { | |
557 | if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { | |
558 | uint64_t vva; | |
559 | ||
560 | vva = vhost_user_iotlb_cache_find(vq, ra, | |
561 | size, VHOST_ACCESS_RW); | |
562 | if (!vva) | |
563 | vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW); | |
564 | ||
565 | return vva; | |
566 | } | |
567 | ||
568 | return qva_to_vva(dev, ra, size); | |
569 | } | |
570 | ||
571 | static struct virtio_net * | |
572 | translate_ring_addresses(struct virtio_net *dev, int vq_index) | |
573 | { | |
574 | struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; | |
575 | struct vhost_vring_addr *addr = &vq->ring_addrs; | |
9f95a23c | 576 | uint64_t len, expected_len; |
11fdf7f2 TL |
577 | |
578 | if (vq_is_packed(dev)) { | |
579 | len = sizeof(struct vring_packed_desc) * vq->size; | |
580 | vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) | |
581 | ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); | |
582 | vq->log_guest_addr = 0; | |
583 | if (vq->desc_packed == NULL || | |
584 | len != sizeof(struct vring_packed_desc) * | |
585 | vq->size) { | |
586 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
587 | "(%d) failed to map desc_packed ring.\n", | |
588 | dev->vid); | |
589 | return dev; | |
590 | } | |
591 | ||
592 | dev = numa_realloc(dev, vq_index); | |
593 | vq = dev->virtqueue[vq_index]; | |
594 | addr = &vq->ring_addrs; | |
595 | ||
596 | len = sizeof(struct vring_packed_desc_event); | |
597 | vq->driver_event = (struct vring_packed_desc_event *) | |
598 | (uintptr_t)ring_addr_to_vva(dev, | |
599 | vq, addr->avail_user_addr, &len); | |
600 | if (vq->driver_event == NULL || | |
601 | len != sizeof(struct vring_packed_desc_event)) { | |
602 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
603 | "(%d) failed to find driver area address.\n", | |
604 | dev->vid); | |
605 | return dev; | |
606 | } | |
607 | ||
608 | len = sizeof(struct vring_packed_desc_event); | |
609 | vq->device_event = (struct vring_packed_desc_event *) | |
610 | (uintptr_t)ring_addr_to_vva(dev, | |
611 | vq, addr->used_user_addr, &len); | |
612 | if (vq->device_event == NULL || | |
613 | len != sizeof(struct vring_packed_desc_event)) { | |
614 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
615 | "(%d) failed to find device area address.\n", | |
616 | dev->vid); | |
617 | return dev; | |
618 | } | |
619 | ||
620 | return dev; | |
621 | } | |
622 | ||
623 | /* The addresses are converted from QEMU virtual to Vhost virtual. */ | |
624 | if (vq->desc && vq->avail && vq->used) | |
625 | return dev; | |
626 | ||
627 | len = sizeof(struct vring_desc) * vq->size; | |
628 | vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, | |
629 | vq, addr->desc_user_addr, &len); | |
630 | if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { | |
631 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
632 | "(%d) failed to map desc ring.\n", | |
633 | dev->vid); | |
634 | return dev; | |
635 | } | |
636 | ||
637 | dev = numa_realloc(dev, vq_index); | |
638 | vq = dev->virtqueue[vq_index]; | |
639 | addr = &vq->ring_addrs; | |
640 | ||
641 | len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; | |
9f95a23c TL |
642 | if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) |
643 | len += sizeof(uint16_t); | |
644 | expected_len = len; | |
11fdf7f2 TL |
645 | vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, |
646 | vq, addr->avail_user_addr, &len); | |
9f95a23c | 647 | if (vq->avail == 0 || len != expected_len) { |
11fdf7f2 TL |
648 | RTE_LOG(DEBUG, VHOST_CONFIG, |
649 | "(%d) failed to map avail ring.\n", | |
650 | dev->vid); | |
651 | return dev; | |
652 | } | |
653 | ||
654 | len = sizeof(struct vring_used) + | |
655 | sizeof(struct vring_used_elem) * vq->size; | |
9f95a23c TL |
656 | if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) |
657 | len += sizeof(uint16_t); | |
658 | expected_len = len; | |
11fdf7f2 TL |
659 | vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, |
660 | vq, addr->used_user_addr, &len); | |
9f95a23c | 661 | if (vq->used == 0 || len != expected_len) { |
11fdf7f2 TL |
662 | RTE_LOG(DEBUG, VHOST_CONFIG, |
663 | "(%d) failed to map used ring.\n", | |
664 | dev->vid); | |
665 | return dev; | |
666 | } | |
667 | ||
668 | if (vq->last_used_idx != vq->used->idx) { | |
669 | RTE_LOG(WARNING, VHOST_CONFIG, | |
670 | "last_used_idx (%u) and vq->used->idx (%u) mismatches; " | |
671 | "some packets maybe resent for Tx and dropped for Rx\n", | |
672 | vq->last_used_idx, vq->used->idx); | |
673 | vq->last_used_idx = vq->used->idx; | |
674 | vq->last_avail_idx = vq->used->idx; | |
675 | } | |
676 | ||
677 | vq->log_guest_addr = addr->log_guest_addr; | |
678 | ||
679 | VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", | |
680 | dev->vid, vq->desc); | |
681 | VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", | |
682 | dev->vid, vq->avail); | |
683 | VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", | |
684 | dev->vid, vq->used); | |
685 | VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", | |
686 | dev->vid, vq->log_guest_addr); | |
687 | ||
688 | return dev; | |
689 | } | |
690 | ||
691 | /* | |
692 | * The virtio device sends us the desc, used and avail ring addresses. | |
693 | * This function then converts these to our address space. | |
694 | */ | |
695 | static int | |
9f95a23c TL |
696 | vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg, |
697 | int main_fd __rte_unused) | |
11fdf7f2 | 698 | { |
9f95a23c | 699 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
700 | struct vhost_virtqueue *vq; |
701 | struct vhost_vring_addr *addr = &msg->payload.addr; | |
11fdf7f2 TL |
702 | |
703 | if (dev->mem == NULL) | |
9f95a23c | 704 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
705 | |
706 | /* addr->index refers to the queue index. The txq 1, rxq is 0. */ | |
707 | vq = dev->virtqueue[msg->payload.addr.index]; | |
708 | ||
709 | /* | |
710 | * Rings addresses should not be interpreted as long as the ring is not | |
711 | * started and enabled | |
712 | */ | |
713 | memcpy(&vq->ring_addrs, addr, sizeof(*addr)); | |
714 | ||
715 | vring_invalidate(dev, vq); | |
716 | ||
717 | if (vq->enabled && (dev->features & | |
718 | (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { | |
719 | dev = translate_ring_addresses(dev, msg->payload.addr.index); | |
720 | if (!dev) | |
9f95a23c | 721 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
722 | |
723 | *pdev = dev; | |
724 | } | |
725 | ||
9f95a23c | 726 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
727 | } |
728 | ||
729 | /* | |
730 | * The virtio device sends us the available ring last used index. | |
731 | */ | |
732 | static int | |
9f95a23c TL |
733 | vhost_user_set_vring_base(struct virtio_net **pdev, |
734 | struct VhostUserMsg *msg, | |
735 | int main_fd __rte_unused) | |
11fdf7f2 | 736 | { |
9f95a23c TL |
737 | struct virtio_net *dev = *pdev; |
738 | struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; | |
739 | uint64_t val = msg->payload.state.num; | |
11fdf7f2 | 740 | |
9f95a23c TL |
741 | if (vq_is_packed(dev)) { |
742 | /* | |
743 | * Bit[0:14]: avail index | |
744 | * Bit[15]: avail wrap counter | |
745 | */ | |
746 | vq->last_avail_idx = val & 0x7fff; | |
747 | vq->avail_wrap_counter = !!(val & (0x1 << 15)); | |
748 | /* | |
749 | * Set used index to same value as available one, as | |
750 | * their values should be the same since ring processing | |
751 | * was stopped at get time. | |
752 | */ | |
753 | vq->last_used_idx = vq->last_avail_idx; | |
754 | vq->used_wrap_counter = vq->avail_wrap_counter; | |
755 | } else { | |
756 | vq->last_used_idx = msg->payload.state.num; | |
757 | vq->last_avail_idx = msg->payload.state.num; | |
758 | } | |
759 | ||
760 | return RTE_VHOST_MSG_RESULT_OK; | |
11fdf7f2 TL |
761 | } |
762 | ||
763 | static int | |
764 | add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, | |
765 | uint64_t host_phys_addr, uint64_t size) | |
766 | { | |
767 | struct guest_page *page, *last_page; | |
9f95a23c | 768 | struct guest_page *old_pages; |
11fdf7f2 TL |
769 | |
770 | if (dev->nr_guest_pages == dev->max_guest_pages) { | |
771 | dev->max_guest_pages *= 2; | |
9f95a23c | 772 | old_pages = dev->guest_pages; |
11fdf7f2 TL |
773 | dev->guest_pages = realloc(dev->guest_pages, |
774 | dev->max_guest_pages * sizeof(*page)); | |
775 | if (!dev->guest_pages) { | |
776 | RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n"); | |
9f95a23c | 777 | free(old_pages); |
11fdf7f2 TL |
778 | return -1; |
779 | } | |
780 | } | |
781 | ||
782 | if (dev->nr_guest_pages > 0) { | |
783 | last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; | |
784 | /* merge if the two pages are continuous */ | |
785 | if (host_phys_addr == last_page->host_phys_addr + | |
786 | last_page->size) { | |
787 | last_page->size += size; | |
788 | return 0; | |
789 | } | |
790 | } | |
791 | ||
792 | page = &dev->guest_pages[dev->nr_guest_pages++]; | |
793 | page->guest_phys_addr = guest_phys_addr; | |
794 | page->host_phys_addr = host_phys_addr; | |
795 | page->size = size; | |
796 | ||
797 | return 0; | |
798 | } | |
799 | ||
800 | static int | |
801 | add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, | |
802 | uint64_t page_size) | |
803 | { | |
804 | uint64_t reg_size = reg->size; | |
805 | uint64_t host_user_addr = reg->host_user_addr; | |
806 | uint64_t guest_phys_addr = reg->guest_phys_addr; | |
807 | uint64_t host_phys_addr; | |
808 | uint64_t size; | |
809 | ||
810 | host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); | |
811 | size = page_size - (guest_phys_addr & (page_size - 1)); | |
812 | size = RTE_MIN(size, reg_size); | |
813 | ||
814 | if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0) | |
815 | return -1; | |
816 | ||
817 | host_user_addr += size; | |
818 | guest_phys_addr += size; | |
819 | reg_size -= size; | |
820 | ||
821 | while (reg_size > 0) { | |
822 | size = RTE_MIN(reg_size, page_size); | |
823 | host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) | |
824 | host_user_addr); | |
825 | if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, | |
826 | size) < 0) | |
827 | return -1; | |
828 | ||
829 | host_user_addr += size; | |
830 | guest_phys_addr += size; | |
831 | reg_size -= size; | |
832 | } | |
833 | ||
834 | return 0; | |
835 | } | |
836 | ||
#ifdef RTE_LIBRTE_VHOST_DEBUG
/*
 * Log every entry of dev->guest_pages (guest physical address, host
 * physical address and size) at INFO level. Only built with
 * RTE_LIBRTE_VHOST_DEBUG; otherwise the call expands to nothing.
 */
static void
dump_guest_pages(struct virtio_net *dev)
{
	const struct guest_page *entry;
	uint32_t idx = 0;

	while (idx < dev->nr_guest_pages) {
		entry = &dev->guest_pages[idx];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size : %" PRIx64 "\n",
			idx,
			entry->guest_phys_addr,
			entry->host_phys_addr,
			entry->size);
		idx++;
	}
}
#else
#define dump_guest_pages(dev)
#endif
862 | ||
863 | static bool | |
864 | vhost_memory_changed(struct VhostUserMemory *new, | |
865 | struct rte_vhost_memory *old) | |
866 | { | |
867 | uint32_t i; | |
868 | ||
869 | if (new->nregions != old->nregions) | |
870 | return true; | |
871 | ||
872 | for (i = 0; i < new->nregions; ++i) { | |
873 | VhostUserMemoryRegion *new_r = &new->regions[i]; | |
874 | struct rte_vhost_mem_region *old_r = &old->regions[i]; | |
875 | ||
876 | if (new_r->guest_phys_addr != old_r->guest_phys_addr) | |
877 | return true; | |
878 | if (new_r->memory_size != old_r->size) | |
879 | return true; | |
880 | if (new_r->userspace_addr != old_r->guest_user_addr) | |
881 | return true; | |
882 | } | |
883 | ||
884 | return false; | |
885 | } | |
886 | ||
887 | static int | |
9f95a23c TL |
888 | vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg, |
889 | int main_fd) | |
11fdf7f2 TL |
890 | { |
891 | struct virtio_net *dev = *pdev; | |
9f95a23c | 892 | struct VhostUserMemory *memory = &msg->payload.memory; |
11fdf7f2 TL |
893 | struct rte_vhost_mem_region *reg; |
894 | void *mmap_addr; | |
895 | uint64_t mmap_size; | |
896 | uint64_t mmap_offset; | |
897 | uint64_t alignment; | |
898 | uint32_t i; | |
899 | int populate; | |
900 | int fd; | |
901 | ||
9f95a23c | 902 | if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) { |
11fdf7f2 | 903 | RTE_LOG(ERR, VHOST_CONFIG, |
9f95a23c TL |
904 | "too many memory regions (%u)\n", memory->nregions); |
905 | return RTE_VHOST_MSG_RESULT_ERR; | |
11fdf7f2 TL |
906 | } |
907 | ||
9f95a23c | 908 | if (dev->mem && !vhost_memory_changed(memory, dev->mem)) { |
11fdf7f2 TL |
909 | RTE_LOG(INFO, VHOST_CONFIG, |
910 | "(%d) memory regions not changed\n", dev->vid); | |
911 | ||
9f95a23c TL |
912 | for (i = 0; i < memory->nregions; i++) |
913 | close(msg->fds[i]); | |
11fdf7f2 | 914 | |
9f95a23c | 915 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
916 | } |
917 | ||
918 | if (dev->mem) { | |
919 | free_mem_region(dev); | |
920 | rte_free(dev->mem); | |
921 | dev->mem = NULL; | |
922 | } | |
923 | ||
924 | /* Flush IOTLB cache as previous HVAs are now invalid */ | |
925 | if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) | |
926 | for (i = 0; i < dev->nr_vring; i++) | |
927 | vhost_user_iotlb_flush_all(dev->virtqueue[i]); | |
928 | ||
929 | dev->nr_guest_pages = 0; | |
930 | if (!dev->guest_pages) { | |
931 | dev->max_guest_pages = 8; | |
932 | dev->guest_pages = malloc(dev->max_guest_pages * | |
933 | sizeof(struct guest_page)); | |
934 | if (dev->guest_pages == NULL) { | |
935 | RTE_LOG(ERR, VHOST_CONFIG, | |
936 | "(%d) failed to allocate memory " | |
937 | "for dev->guest_pages\n", | |
938 | dev->vid); | |
9f95a23c | 939 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
940 | } |
941 | } | |
942 | ||
943 | dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + | |
9f95a23c | 944 | sizeof(struct rte_vhost_mem_region) * memory->nregions, 0); |
11fdf7f2 TL |
945 | if (dev->mem == NULL) { |
946 | RTE_LOG(ERR, VHOST_CONFIG, | |
947 | "(%d) failed to allocate memory for dev->mem\n", | |
948 | dev->vid); | |
9f95a23c | 949 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 | 950 | } |
9f95a23c | 951 | dev->mem->nregions = memory->nregions; |
11fdf7f2 | 952 | |
9f95a23c TL |
953 | for (i = 0; i < memory->nregions; i++) { |
954 | fd = msg->fds[i]; | |
11fdf7f2 TL |
955 | reg = &dev->mem->regions[i]; |
956 | ||
9f95a23c TL |
957 | reg->guest_phys_addr = memory->regions[i].guest_phys_addr; |
958 | reg->guest_user_addr = memory->regions[i].userspace_addr; | |
959 | reg->size = memory->regions[i].memory_size; | |
11fdf7f2 TL |
960 | reg->fd = fd; |
961 | ||
9f95a23c | 962 | mmap_offset = memory->regions[i].mmap_offset; |
11fdf7f2 TL |
963 | |
964 | /* Check for memory_size + mmap_offset overflow */ | |
965 | if (mmap_offset >= -reg->size) { | |
966 | RTE_LOG(ERR, VHOST_CONFIG, | |
967 | "mmap_offset (%#"PRIx64") and memory_size " | |
968 | "(%#"PRIx64") overflow\n", | |
969 | mmap_offset, reg->size); | |
970 | goto err_mmap; | |
971 | } | |
972 | ||
973 | mmap_size = reg->size + mmap_offset; | |
974 | ||
975 | /* mmap() without flag of MAP_ANONYMOUS, should be called | |
976 | * with length argument aligned with hugepagesz at older | |
977 | * longterm version Linux, like 2.6.32 and 3.2.72, or | |
978 | * mmap() will fail with EINVAL. | |
979 | * | |
980 | * to avoid failure, make sure in caller to keep length | |
981 | * aligned. | |
982 | */ | |
983 | alignment = get_blk_size(fd); | |
984 | if (alignment == (uint64_t)-1) { | |
985 | RTE_LOG(ERR, VHOST_CONFIG, | |
986 | "couldn't get hugepage size through fstat\n"); | |
987 | goto err_mmap; | |
988 | } | |
989 | mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); | |
990 | ||
991 | populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0; | |
992 | mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, | |
993 | MAP_SHARED | populate, fd, 0); | |
994 | ||
995 | if (mmap_addr == MAP_FAILED) { | |
996 | RTE_LOG(ERR, VHOST_CONFIG, | |
997 | "mmap region %u failed.\n", i); | |
998 | goto err_mmap; | |
999 | } | |
1000 | ||
1001 | reg->mmap_addr = mmap_addr; | |
1002 | reg->mmap_size = mmap_size; | |
1003 | reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + | |
1004 | mmap_offset; | |
1005 | ||
1006 | if (dev->dequeue_zero_copy) | |
1007 | if (add_guest_pages(dev, reg, alignment) < 0) { | |
1008 | RTE_LOG(ERR, VHOST_CONFIG, | |
1009 | "adding guest pages to region %u failed.\n", | |
1010 | i); | |
1011 | goto err_mmap; | |
1012 | } | |
1013 | ||
1014 | RTE_LOG(INFO, VHOST_CONFIG, | |
1015 | "guest memory region %u, size: 0x%" PRIx64 "\n" | |
1016 | "\t guest physical addr: 0x%" PRIx64 "\n" | |
1017 | "\t guest virtual addr: 0x%" PRIx64 "\n" | |
1018 | "\t host virtual addr: 0x%" PRIx64 "\n" | |
1019 | "\t mmap addr : 0x%" PRIx64 "\n" | |
1020 | "\t mmap size : 0x%" PRIx64 "\n" | |
1021 | "\t mmap align: 0x%" PRIx64 "\n" | |
1022 | "\t mmap off : 0x%" PRIx64 "\n", | |
1023 | i, reg->size, | |
1024 | reg->guest_phys_addr, | |
1025 | reg->guest_user_addr, | |
1026 | reg->host_user_addr, | |
1027 | (uint64_t)(uintptr_t)mmap_addr, | |
1028 | mmap_size, | |
1029 | alignment, | |
1030 | mmap_offset); | |
9f95a23c TL |
1031 | |
1032 | if (dev->postcopy_listening) { | |
1033 | /* | |
1034 | * We haven't a better way right now than sharing | |
1035 | * DPDK's virtual address with Qemu, so that Qemu can | |
1036 | * retrieve the region offset when handling userfaults. | |
1037 | */ | |
1038 | memory->regions[i].userspace_addr = | |
1039 | reg->host_user_addr; | |
1040 | } | |
1041 | } | |
1042 | if (dev->postcopy_listening) { | |
1043 | /* Send the addresses back to qemu */ | |
1044 | msg->fd_num = 0; | |
1045 | send_vhost_reply(main_fd, msg); | |
1046 | ||
1047 | /* Wait for qemu to acknolwedge it's got the addresses | |
1048 | * we've got to wait before we're allowed to generate faults. | |
1049 | */ | |
1050 | VhostUserMsg ack_msg; | |
1051 | if (read_vhost_message(main_fd, &ack_msg) <= 0) { | |
1052 | RTE_LOG(ERR, VHOST_CONFIG, | |
1053 | "Failed to read qemu ack on postcopy set-mem-table\n"); | |
1054 | goto err_mmap; | |
1055 | } | |
1056 | if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) { | |
1057 | RTE_LOG(ERR, VHOST_CONFIG, | |
1058 | "Bad qemu ack on postcopy set-mem-table (%d)\n", | |
1059 | ack_msg.request.master); | |
1060 | goto err_mmap; | |
1061 | } | |
1062 | ||
1063 | /* Now userfault register and we can use the memory */ | |
1064 | for (i = 0; i < memory->nregions; i++) { | |
1065 | #ifdef RTE_LIBRTE_VHOST_POSTCOPY | |
1066 | reg = &dev->mem->regions[i]; | |
1067 | struct uffdio_register reg_struct; | |
1068 | ||
1069 | /* | |
1070 | * Let's register all the mmap'ed area to ensure | |
1071 | * alignment on page boundary. | |
1072 | */ | |
1073 | reg_struct.range.start = | |
1074 | (uint64_t)(uintptr_t)reg->mmap_addr; | |
1075 | reg_struct.range.len = reg->mmap_size; | |
1076 | reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; | |
1077 | ||
1078 | if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, | |
1079 | ®_struct)) { | |
1080 | RTE_LOG(ERR, VHOST_CONFIG, | |
1081 | "Failed to register ufd for region %d: (ufd = %d) %s\n", | |
1082 | i, dev->postcopy_ufd, | |
1083 | strerror(errno)); | |
1084 | goto err_mmap; | |
1085 | } | |
1086 | RTE_LOG(INFO, VHOST_CONFIG, | |
1087 | "\t userfaultfd registered for range : %llx - %llx\n", | |
1088 | reg_struct.range.start, | |
1089 | reg_struct.range.start + | |
1090 | reg_struct.range.len - 1); | |
1091 | #else | |
1092 | goto err_mmap; | |
1093 | #endif | |
1094 | } | |
11fdf7f2 TL |
1095 | } |
1096 | ||
1097 | for (i = 0; i < dev->nr_vring; i++) { | |
1098 | struct vhost_virtqueue *vq = dev->virtqueue[i]; | |
1099 | ||
1100 | if (vq->desc || vq->avail || vq->used) { | |
1101 | /* | |
1102 | * If the memory table got updated, the ring addresses | |
1103 | * need to be translated again as virtual addresses have | |
1104 | * changed. | |
1105 | */ | |
1106 | vring_invalidate(dev, vq); | |
1107 | ||
1108 | dev = translate_ring_addresses(dev, i); | |
9f95a23c TL |
1109 | if (!dev) { |
1110 | dev = *pdev; | |
1111 | goto err_mmap; | |
1112 | } | |
11fdf7f2 TL |
1113 | |
1114 | *pdev = dev; | |
1115 | } | |
1116 | } | |
1117 | ||
1118 | dump_guest_pages(dev); | |
1119 | ||
9f95a23c | 1120 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1121 | |
1122 | err_mmap: | |
1123 | free_mem_region(dev); | |
1124 | rte_free(dev->mem); | |
1125 | dev->mem = NULL; | |
9f95a23c | 1126 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1127 | } |
1128 | ||
1129 | static bool | |
1130 | vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) | |
1131 | { | |
1132 | bool rings_ok; | |
1133 | ||
1134 | if (!vq) | |
1135 | return false; | |
1136 | ||
1137 | if (vq_is_packed(dev)) | |
1138 | rings_ok = !!vq->desc_packed; | |
1139 | else | |
1140 | rings_ok = vq->desc && vq->avail && vq->used; | |
1141 | ||
1142 | return rings_ok && | |
1143 | vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && | |
1144 | vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; | |
1145 | } | |
1146 | ||
1147 | static int | |
1148 | virtio_is_ready(struct virtio_net *dev) | |
1149 | { | |
1150 | struct vhost_virtqueue *vq; | |
1151 | uint32_t i; | |
1152 | ||
1153 | if (dev->nr_vring == 0) | |
1154 | return 0; | |
1155 | ||
1156 | for (i = 0; i < dev->nr_vring; i++) { | |
1157 | vq = dev->virtqueue[i]; | |
1158 | ||
1159 | if (!vq_is_ready(dev, vq)) | |
1160 | return 0; | |
1161 | } | |
1162 | ||
1163 | RTE_LOG(INFO, VHOST_CONFIG, | |
1164 | "virtio is now ready for processing.\n"); | |
1165 | return 1; | |
1166 | } | |
1167 | ||
9f95a23c TL |
1168 | static int |
1169 | vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg, | |
1170 | int main_fd __rte_unused) | |
11fdf7f2 | 1171 | { |
9f95a23c | 1172 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1173 | struct vhost_vring_file file; |
1174 | struct vhost_virtqueue *vq; | |
1175 | ||
9f95a23c TL |
1176 | file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; |
1177 | if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) | |
11fdf7f2 TL |
1178 | file.fd = VIRTIO_INVALID_EVENTFD; |
1179 | else | |
9f95a23c | 1180 | file.fd = msg->fds[0]; |
11fdf7f2 TL |
1181 | RTE_LOG(INFO, VHOST_CONFIG, |
1182 | "vring call idx:%d file:%d\n", file.index, file.fd); | |
1183 | ||
1184 | vq = dev->virtqueue[file.index]; | |
1185 | if (vq->callfd >= 0) | |
1186 | close(vq->callfd); | |
1187 | ||
1188 | vq->callfd = file.fd; | |
9f95a23c TL |
1189 | |
1190 | return RTE_VHOST_MSG_RESULT_OK; | |
11fdf7f2 TL |
1191 | } |
1192 | ||
9f95a23c TL |
1193 | static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused, |
1194 | struct VhostUserMsg *msg, | |
1195 | int main_fd __rte_unused) | |
11fdf7f2 | 1196 | { |
9f95a23c TL |
1197 | if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) |
1198 | close(msg->fds[0]); | |
1199 | RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); | |
1200 | ||
1201 | return RTE_VHOST_MSG_RESULT_OK; | |
1202 | } | |
1203 | ||
1204 | static int | |
1205 | vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg, | |
1206 | int main_fd __rte_unused) | |
1207 | { | |
1208 | struct virtio_net *dev = *pdev; | |
11fdf7f2 TL |
1209 | struct vhost_vring_file file; |
1210 | struct vhost_virtqueue *vq; | |
11fdf7f2 | 1211 | |
9f95a23c TL |
1212 | file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; |
1213 | if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) | |
11fdf7f2 TL |
1214 | file.fd = VIRTIO_INVALID_EVENTFD; |
1215 | else | |
9f95a23c | 1216 | file.fd = msg->fds[0]; |
11fdf7f2 TL |
1217 | RTE_LOG(INFO, VHOST_CONFIG, |
1218 | "vring kick idx:%d file:%d\n", file.index, file.fd); | |
1219 | ||
1220 | /* Interpret ring addresses only when ring is started. */ | |
1221 | dev = translate_ring_addresses(dev, file.index); | |
1222 | if (!dev) | |
9f95a23c | 1223 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1224 | |
1225 | *pdev = dev; | |
1226 | ||
1227 | vq = dev->virtqueue[file.index]; | |
1228 | ||
1229 | /* | |
1230 | * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, | |
1231 | * the ring starts already enabled. Otherwise, it is enabled via | |
1232 | * the SET_VRING_ENABLE message. | |
1233 | */ | |
9f95a23c | 1234 | if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { |
11fdf7f2 | 1235 | vq->enabled = 1; |
9f95a23c TL |
1236 | if (dev->notify_ops->vring_state_changed) |
1237 | dev->notify_ops->vring_state_changed( | |
1238 | dev->vid, file.index, 1); | |
1239 | } | |
11fdf7f2 TL |
1240 | |
1241 | if (vq->kickfd >= 0) | |
1242 | close(vq->kickfd); | |
1243 | vq->kickfd = file.fd; | |
9f95a23c TL |
1244 | |
1245 | return RTE_VHOST_MSG_RESULT_OK; | |
11fdf7f2 TL |
1246 | } |
1247 | ||
1248 | static void | |
1249 | free_zmbufs(struct vhost_virtqueue *vq) | |
1250 | { | |
9f95a23c | 1251 | drain_zmbuf_list(vq); |
11fdf7f2 TL |
1252 | |
1253 | rte_free(vq->zmbufs); | |
1254 | } | |
1255 | ||
1256 | /* | |
1257 | * when virtio is stopped, qemu will send us the GET_VRING_BASE message. | |
1258 | */ | |
1259 | static int | |
9f95a23c TL |
1260 | vhost_user_get_vring_base(struct virtio_net **pdev, |
1261 | struct VhostUserMsg *msg, | |
1262 | int main_fd __rte_unused) | |
11fdf7f2 | 1263 | { |
9f95a23c | 1264 | struct virtio_net *dev = *pdev; |
11fdf7f2 | 1265 | struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; |
9f95a23c | 1266 | uint64_t val; |
11fdf7f2 TL |
1267 | |
1268 | /* We have to stop the queue (virtio) if it is running. */ | |
1269 | vhost_destroy_device_notify(dev); | |
1270 | ||
1271 | dev->flags &= ~VIRTIO_DEV_READY; | |
1272 | dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; | |
1273 | ||
9f95a23c TL |
1274 | /* Here we are safe to get the indexes */ |
1275 | if (vq_is_packed(dev)) { | |
1276 | /* | |
1277 | * Bit[0:14]: avail index | |
1278 | * Bit[15]: avail wrap counter | |
1279 | */ | |
1280 | val = vq->last_avail_idx & 0x7fff; | |
1281 | val |= vq->avail_wrap_counter << 15; | |
1282 | msg->payload.state.num = val; | |
1283 | } else { | |
1284 | msg->payload.state.num = vq->last_avail_idx; | |
1285 | } | |
11fdf7f2 TL |
1286 | |
1287 | RTE_LOG(INFO, VHOST_CONFIG, | |
1288 | "vring base idx:%d file:%d\n", msg->payload.state.index, | |
1289 | msg->payload.state.num); | |
1290 | /* | |
1291 | * Based on current qemu vhost-user implementation, this message is | |
1292 | * sent and only sent in vhost_vring_stop. | |
1293 | * TODO: cleanup the vring, it isn't usable since here. | |
1294 | */ | |
1295 | if (vq->kickfd >= 0) | |
1296 | close(vq->kickfd); | |
1297 | ||
1298 | vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; | |
1299 | ||
1300 | if (vq->callfd >= 0) | |
1301 | close(vq->callfd); | |
1302 | ||
1303 | vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; | |
1304 | ||
9f95a23c TL |
1305 | vq->signalled_used_valid = false; |
1306 | ||
11fdf7f2 TL |
1307 | if (dev->dequeue_zero_copy) |
1308 | free_zmbufs(vq); | |
1309 | if (vq_is_packed(dev)) { | |
1310 | rte_free(vq->shadow_used_packed); | |
1311 | vq->shadow_used_packed = NULL; | |
1312 | } else { | |
1313 | rte_free(vq->shadow_used_split); | |
1314 | vq->shadow_used_split = NULL; | |
1315 | } | |
1316 | ||
1317 | rte_free(vq->batch_copy_elems); | |
1318 | vq->batch_copy_elems = NULL; | |
1319 | ||
9f95a23c TL |
1320 | msg->size = sizeof(msg->payload.state); |
1321 | msg->fd_num = 0; | |
1322 | ||
1323 | return RTE_VHOST_MSG_RESULT_REPLY; | |
11fdf7f2 TL |
1324 | } |
1325 | ||
1326 | /* | |
1327 | * when virtio queues are ready to work, qemu will send us to | |
1328 | * enable the virtio queue pair. | |
1329 | */ | |
1330 | static int | |
9f95a23c TL |
1331 | vhost_user_set_vring_enable(struct virtio_net **pdev, |
1332 | struct VhostUserMsg *msg, | |
1333 | int main_fd __rte_unused) | |
11fdf7f2 | 1334 | { |
9f95a23c | 1335 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1336 | int enable = (int)msg->payload.state.num; |
1337 | int index = (int)msg->payload.state.index; | |
1338 | struct rte_vdpa_device *vdpa_dev; | |
1339 | int did = -1; | |
1340 | ||
1341 | RTE_LOG(INFO, VHOST_CONFIG, | |
1342 | "set queue enable: %d to qp idx: %d\n", | |
1343 | enable, index); | |
1344 | ||
1345 | did = dev->vdpa_dev_id; | |
1346 | vdpa_dev = rte_vdpa_get_device(did); | |
1347 | if (vdpa_dev && vdpa_dev->ops->set_vring_state) | |
1348 | vdpa_dev->ops->set_vring_state(dev->vid, index, enable); | |
1349 | ||
1350 | if (dev->notify_ops->vring_state_changed) | |
1351 | dev->notify_ops->vring_state_changed(dev->vid, | |
1352 | index, enable); | |
1353 | ||
9f95a23c TL |
1354 | /* On disable, rings have to be stopped being processed. */ |
1355 | if (!enable && dev->dequeue_zero_copy) | |
1356 | drain_zmbuf_list(dev->virtqueue[index]); | |
1357 | ||
11fdf7f2 TL |
1358 | dev->virtqueue[index]->enabled = enable; |
1359 | ||
9f95a23c | 1360 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1361 | } |
1362 | ||
9f95a23c TL |
1363 | static int |
1364 | vhost_user_get_protocol_features(struct virtio_net **pdev, | |
1365 | struct VhostUserMsg *msg, | |
1366 | int main_fd __rte_unused) | |
11fdf7f2 | 1367 | { |
9f95a23c | 1368 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1369 | uint64_t features, protocol_features; |
1370 | ||
1371 | rte_vhost_driver_get_features(dev->ifname, &features); | |
1372 | rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); | |
1373 | ||
1374 | /* | |
1375 | * REPLY_ACK protocol feature is only mandatory for now | |
1376 | * for IOMMU feature. If IOMMU is explicitly disabled by the | |
1377 | * application, disable also REPLY_ACK feature for older buggy | |
1378 | * Qemu versions (from v2.7.0 to v2.9.0). | |
1379 | */ | |
1380 | if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) | |
1381 | protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK); | |
1382 | ||
1383 | msg->payload.u64 = protocol_features; | |
1384 | msg->size = sizeof(msg->payload.u64); | |
9f95a23c TL |
1385 | msg->fd_num = 0; |
1386 | ||
1387 | return RTE_VHOST_MSG_RESULT_REPLY; | |
11fdf7f2 TL |
1388 | } |
1389 | ||
9f95a23c TL |
1390 | static int |
1391 | vhost_user_set_protocol_features(struct virtio_net **pdev, | |
1392 | struct VhostUserMsg *msg, | |
1393 | int main_fd __rte_unused) | |
11fdf7f2 | 1394 | { |
9f95a23c TL |
1395 | struct virtio_net *dev = *pdev; |
1396 | uint64_t protocol_features = msg->payload.u64; | |
1397 | uint64_t slave_protocol_features = 0; | |
1398 | ||
1399 | rte_vhost_driver_get_protocol_features(dev->ifname, | |
1400 | &slave_protocol_features); | |
1401 | if (protocol_features & ~slave_protocol_features) { | |
1402 | RTE_LOG(ERR, VHOST_CONFIG, | |
1403 | "(%d) received invalid protocol features.\n", | |
1404 | dev->vid); | |
1405 | return RTE_VHOST_MSG_RESULT_ERR; | |
1406 | } | |
11fdf7f2 TL |
1407 | |
1408 | dev->protocol_features = protocol_features; | |
9f95a23c TL |
1409 | |
1410 | return RTE_VHOST_MSG_RESULT_OK; | |
11fdf7f2 TL |
1411 | } |
1412 | ||
1413 | static int | |
9f95a23c TL |
1414 | vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg, |
1415 | int main_fd __rte_unused) | |
11fdf7f2 | 1416 | { |
9f95a23c | 1417 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1418 | int fd = msg->fds[0]; |
1419 | uint64_t size, off; | |
1420 | void *addr; | |
1421 | ||
1422 | if (fd < 0) { | |
1423 | RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); | |
9f95a23c | 1424 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1425 | } |
1426 | ||
1427 | if (msg->size != sizeof(VhostUserLog)) { | |
1428 | RTE_LOG(ERR, VHOST_CONFIG, | |
1429 | "invalid log base msg size: %"PRId32" != %d\n", | |
1430 | msg->size, (int)sizeof(VhostUserLog)); | |
9f95a23c | 1431 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1432 | } |
1433 | ||
1434 | size = msg->payload.log.mmap_size; | |
1435 | off = msg->payload.log.mmap_offset; | |
1436 | ||
1437 | /* Don't allow mmap_offset to point outside the mmap region */ | |
1438 | if (off > size) { | |
1439 | RTE_LOG(ERR, VHOST_CONFIG, | |
1440 | "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", | |
1441 | off, size); | |
9f95a23c | 1442 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1443 | } |
1444 | ||
1445 | RTE_LOG(INFO, VHOST_CONFIG, | |
1446 | "log mmap size: %"PRId64", offset: %"PRId64"\n", | |
1447 | size, off); | |
1448 | ||
1449 | /* | |
1450 | * mmap from 0 to workaround a hugepage mmap bug: mmap will | |
1451 | * fail when offset is not page size aligned. | |
1452 | */ | |
1453 | addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | |
1454 | close(fd); | |
1455 | if (addr == MAP_FAILED) { | |
1456 | RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); | |
9f95a23c | 1457 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1458 | } |
1459 | ||
1460 | /* | |
1461 | * Free previously mapped log memory on occasionally | |
1462 | * multiple VHOST_USER_SET_LOG_BASE. | |
1463 | */ | |
1464 | if (dev->log_addr) { | |
1465 | munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); | |
1466 | } | |
1467 | dev->log_addr = (uint64_t)(uintptr_t)addr; | |
1468 | dev->log_base = dev->log_addr + off; | |
1469 | dev->log_size = size; | |
1470 | ||
9f95a23c TL |
1471 | /* |
1472 | * The spec is not clear about it (yet), but QEMU doesn't expect | |
1473 | * any payload in the reply. | |
1474 | */ | |
1475 | msg->size = 0; | |
1476 | msg->fd_num = 0; | |
1477 | ||
1478 | return RTE_VHOST_MSG_RESULT_REPLY; | |
1479 | } | |
1480 | ||
1481 | static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused, | |
1482 | struct VhostUserMsg *msg, | |
1483 | int main_fd __rte_unused) | |
1484 | { | |
1485 | close(msg->fds[0]); | |
1486 | RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); | |
1487 | ||
1488 | return RTE_VHOST_MSG_RESULT_OK; | |
11fdf7f2 TL |
1489 | } |
1490 | ||
1491 | /* | |
1492 | * An rarp packet is constructed and broadcasted to notify switches about | |
1493 | * the new location of the migrated VM, so that packets from outside will | |
1494 | * not be lost after migration. | |
1495 | * | |
1496 | * However, we don't actually "send" a rarp packet here, instead, we set | |
1497 | * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. | |
1498 | */ | |
1499 | static int | |
9f95a23c TL |
1500 | vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg, |
1501 | int main_fd __rte_unused) | |
11fdf7f2 | 1502 | { |
9f95a23c | 1503 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1504 | uint8_t *mac = (uint8_t *)&msg->payload.u64; |
1505 | struct rte_vdpa_device *vdpa_dev; | |
1506 | int did = -1; | |
1507 | ||
1508 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
1509 | ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", | |
1510 | mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); | |
1511 | memcpy(dev->mac.addr_bytes, mac, 6); | |
1512 | ||
1513 | /* | |
1514 | * Set the flag to inject a RARP broadcast packet at | |
1515 | * rte_vhost_dequeue_burst(). | |
1516 | * | |
1517 | * rte_smp_wmb() is for making sure the mac is copied | |
1518 | * before the flag is set. | |
1519 | */ | |
1520 | rte_smp_wmb(); | |
1521 | rte_atomic16_set(&dev->broadcast_rarp, 1); | |
1522 | did = dev->vdpa_dev_id; | |
1523 | vdpa_dev = rte_vdpa_get_device(did); | |
1524 | if (vdpa_dev && vdpa_dev->ops->migration_done) | |
1525 | vdpa_dev->ops->migration_done(dev->vid); | |
1526 | ||
9f95a23c | 1527 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1528 | } |
1529 | ||
1530 | static int | |
9f95a23c TL |
1531 | vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg, |
1532 | int main_fd __rte_unused) | |
11fdf7f2 | 1533 | { |
9f95a23c | 1534 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1535 | if (msg->payload.u64 < VIRTIO_MIN_MTU || |
1536 | msg->payload.u64 > VIRTIO_MAX_MTU) { | |
1537 | RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", | |
1538 | msg->payload.u64); | |
1539 | ||
9f95a23c | 1540 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1541 | } |
1542 | ||
1543 | dev->mtu = msg->payload.u64; | |
1544 | ||
9f95a23c | 1545 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1546 | } |
1547 | ||
1548 | static int | |
9f95a23c TL |
1549 | vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg, |
1550 | int main_fd __rte_unused) | |
11fdf7f2 | 1551 | { |
9f95a23c | 1552 | struct virtio_net *dev = *pdev; |
11fdf7f2 TL |
1553 | int fd = msg->fds[0]; |
1554 | ||
1555 | if (fd < 0) { | |
1556 | RTE_LOG(ERR, VHOST_CONFIG, | |
1557 | "Invalid file descriptor for slave channel (%d)\n", | |
1558 | fd); | |
9f95a23c | 1559 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1560 | } |
1561 | ||
1562 | dev->slave_req_fd = fd; | |
1563 | ||
9f95a23c | 1564 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1565 | } |
1566 | ||
1567 | static int | |
1568 | is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) | |
1569 | { | |
1570 | struct vhost_vring_addr *ra; | |
1571 | uint64_t start, end; | |
1572 | ||
1573 | start = imsg->iova; | |
1574 | end = start + imsg->size; | |
1575 | ||
1576 | ra = &vq->ring_addrs; | |
1577 | if (ra->desc_user_addr >= start && ra->desc_user_addr < end) | |
1578 | return 1; | |
1579 | if (ra->avail_user_addr >= start && ra->avail_user_addr < end) | |
1580 | return 1; | |
1581 | if (ra->used_user_addr >= start && ra->used_user_addr < end) | |
1582 | return 1; | |
1583 | ||
1584 | return 0; | |
1585 | } | |
1586 | ||
1587 | static int | |
1588 | is_vring_iotlb_invalidate(struct vhost_virtqueue *vq, | |
1589 | struct vhost_iotlb_msg *imsg) | |
1590 | { | |
1591 | uint64_t istart, iend, vstart, vend; | |
1592 | ||
1593 | istart = imsg->iova; | |
1594 | iend = istart + imsg->size - 1; | |
1595 | ||
1596 | vstart = (uintptr_t)vq->desc; | |
1597 | vend = vstart + sizeof(struct vring_desc) * vq->size - 1; | |
1598 | if (vstart <= iend && istart <= vend) | |
1599 | return 1; | |
1600 | ||
1601 | vstart = (uintptr_t)vq->avail; | |
1602 | vend = vstart + sizeof(struct vring_avail); | |
1603 | vend += sizeof(uint16_t) * vq->size - 1; | |
1604 | if (vstart <= iend && istart <= vend) | |
1605 | return 1; | |
1606 | ||
1607 | vstart = (uintptr_t)vq->used; | |
1608 | vend = vstart + sizeof(struct vring_used); | |
1609 | vend += sizeof(struct vring_used_elem) * vq->size - 1; | |
1610 | if (vstart <= iend && istart <= vend) | |
1611 | return 1; | |
1612 | ||
1613 | return 0; | |
1614 | } | |
1615 | ||
1616 | static int | |
9f95a23c TL |
1617 | vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg, |
1618 | int main_fd __rte_unused) | |
11fdf7f2 TL |
1619 | { |
1620 | struct virtio_net *dev = *pdev; | |
1621 | struct vhost_iotlb_msg *imsg = &msg->payload.iotlb; | |
1622 | uint16_t i; | |
1623 | uint64_t vva, len; | |
1624 | ||
1625 | switch (imsg->type) { | |
1626 | case VHOST_IOTLB_UPDATE: | |
1627 | len = imsg->size; | |
1628 | vva = qva_to_vva(dev, imsg->uaddr, &len); | |
1629 | if (!vva) | |
9f95a23c | 1630 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1631 | |
1632 | for (i = 0; i < dev->nr_vring; i++) { | |
1633 | struct vhost_virtqueue *vq = dev->virtqueue[i]; | |
1634 | ||
1635 | vhost_user_iotlb_cache_insert(vq, imsg->iova, vva, | |
1636 | len, imsg->perm); | |
1637 | ||
1638 | if (is_vring_iotlb_update(vq, imsg)) | |
1639 | *pdev = dev = translate_ring_addresses(dev, i); | |
1640 | } | |
1641 | break; | |
1642 | case VHOST_IOTLB_INVALIDATE: | |
1643 | for (i = 0; i < dev->nr_vring; i++) { | |
1644 | struct vhost_virtqueue *vq = dev->virtqueue[i]; | |
1645 | ||
1646 | vhost_user_iotlb_cache_remove(vq, imsg->iova, | |
1647 | imsg->size); | |
1648 | ||
1649 | if (is_vring_iotlb_invalidate(vq, imsg)) | |
1650 | vring_invalidate(dev, vq); | |
1651 | } | |
1652 | break; | |
1653 | default: | |
1654 | RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n", | |
1655 | imsg->type); | |
9f95a23c | 1656 | return RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 TL |
1657 | } |
1658 | ||
9f95a23c | 1659 | return RTE_VHOST_MSG_RESULT_OK; |
11fdf7f2 TL |
1660 | } |
1661 | ||
9f95a23c TL |
1662 | static int |
1663 | vhost_user_set_postcopy_advise(struct virtio_net **pdev, | |
1664 | struct VhostUserMsg *msg, | |
1665 | int main_fd __rte_unused) | |
1666 | { | |
1667 | struct virtio_net *dev = *pdev; | |
1668 | #ifdef RTE_LIBRTE_VHOST_POSTCOPY | |
1669 | struct uffdio_api api_struct; | |
1670 | ||
1671 | dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); | |
1672 | ||
1673 | if (dev->postcopy_ufd == -1) { | |
1674 | RTE_LOG(ERR, VHOST_CONFIG, "Userfaultfd not available: %s\n", | |
1675 | strerror(errno)); | |
1676 | return RTE_VHOST_MSG_RESULT_ERR; | |
1677 | } | |
1678 | api_struct.api = UFFD_API; | |
1679 | api_struct.features = 0; | |
1680 | if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { | |
1681 | RTE_LOG(ERR, VHOST_CONFIG, "UFFDIO_API ioctl failure: %s\n", | |
1682 | strerror(errno)); | |
1683 | close(dev->postcopy_ufd); | |
1684 | dev->postcopy_ufd = -1; | |
1685 | return RTE_VHOST_MSG_RESULT_ERR; | |
1686 | } | |
1687 | msg->fds[0] = dev->postcopy_ufd; | |
1688 | msg->fd_num = 1; | |
1689 | ||
1690 | return RTE_VHOST_MSG_RESULT_REPLY; | |
1691 | #else | |
1692 | dev->postcopy_ufd = -1; | |
1693 | msg->fd_num = 0; | |
1694 | ||
1695 | return RTE_VHOST_MSG_RESULT_ERR; | |
1696 | #endif | |
1697 | } | |
1698 | ||
1699 | static int | |
1700 | vhost_user_set_postcopy_listen(struct virtio_net **pdev, | |
1701 | struct VhostUserMsg *msg __rte_unused, | |
1702 | int main_fd __rte_unused) | |
1703 | { | |
1704 | struct virtio_net *dev = *pdev; | |
1705 | ||
1706 | if (dev->mem && dev->mem->nregions) { | |
1707 | RTE_LOG(ERR, VHOST_CONFIG, | |
1708 | "Regions already registered at postcopy-listen\n"); | |
1709 | return RTE_VHOST_MSG_RESULT_ERR; | |
1710 | } | |
1711 | dev->postcopy_listening = 1; | |
1712 | ||
1713 | return RTE_VHOST_MSG_RESULT_OK; | |
1714 | } | |
1715 | ||
1716 | static int | |
1717 | vhost_user_postcopy_end(struct virtio_net **pdev, struct VhostUserMsg *msg, | |
1718 | int main_fd __rte_unused) | |
1719 | { | |
1720 | struct virtio_net *dev = *pdev; | |
1721 | ||
1722 | dev->postcopy_listening = 0; | |
1723 | if (dev->postcopy_ufd >= 0) { | |
1724 | close(dev->postcopy_ufd); | |
1725 | dev->postcopy_ufd = -1; | |
1726 | } | |
1727 | ||
1728 | msg->payload.u64 = 0; | |
1729 | msg->size = sizeof(msg->payload.u64); | |
1730 | msg->fd_num = 0; | |
1731 | ||
1732 | return RTE_VHOST_MSG_RESULT_REPLY; | |
1733 | } | |
1734 | ||
1735 | typedef int (*vhost_message_handler_t)(struct virtio_net **pdev, | |
1736 | struct VhostUserMsg *msg, | |
1737 | int main_fd); | |
1738 | static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = { | |
1739 | [VHOST_USER_NONE] = NULL, | |
1740 | [VHOST_USER_GET_FEATURES] = vhost_user_get_features, | |
1741 | [VHOST_USER_SET_FEATURES] = vhost_user_set_features, | |
1742 | [VHOST_USER_SET_OWNER] = vhost_user_set_owner, | |
1743 | [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner, | |
1744 | [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table, | |
1745 | [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base, | |
1746 | [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd, | |
1747 | [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num, | |
1748 | [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr, | |
1749 | [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base, | |
1750 | [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base, | |
1751 | [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick, | |
1752 | [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call, | |
1753 | [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err, | |
1754 | [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features, | |
1755 | [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features, | |
1756 | [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num, | |
1757 | [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable, | |
1758 | [VHOST_USER_SEND_RARP] = vhost_user_send_rarp, | |
1759 | [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu, | |
1760 | [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd, | |
1761 | [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg, | |
1762 | [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise, | |
1763 | [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen, | |
1764 | [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end, | |
1765 | }; | |
1766 | ||
1767 | ||
11fdf7f2 TL |
1768 | /* return bytes# of read on success or negative val on failure. */ |
1769 | static int | |
1770 | read_vhost_message(int sockfd, struct VhostUserMsg *msg) | |
1771 | { | |
1772 | int ret; | |
1773 | ||
1774 | ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, | |
9f95a23c | 1775 | msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num); |
11fdf7f2 TL |
1776 | if (ret <= 0) |
1777 | return ret; | |
1778 | ||
9f95a23c | 1779 | if (msg->size) { |
11fdf7f2 TL |
1780 | if (msg->size > sizeof(msg->payload)) { |
1781 | RTE_LOG(ERR, VHOST_CONFIG, | |
1782 | "invalid msg size: %d\n", msg->size); | |
1783 | return -1; | |
1784 | } | |
1785 | ret = read(sockfd, &msg->payload, msg->size); | |
1786 | if (ret <= 0) | |
1787 | return ret; | |
1788 | if (ret != (int)msg->size) { | |
1789 | RTE_LOG(ERR, VHOST_CONFIG, | |
1790 | "read control message failed\n"); | |
1791 | return -1; | |
1792 | } | |
1793 | } | |
1794 | ||
1795 | return ret; | |
1796 | } | |
1797 | ||
1798 | static int | |
9f95a23c | 1799 | send_vhost_message(int sockfd, struct VhostUserMsg *msg) |
11fdf7f2 TL |
1800 | { |
1801 | if (!msg) | |
1802 | return 0; | |
1803 | ||
1804 | return send_fd_message(sockfd, (char *)msg, | |
9f95a23c | 1805 | VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num); |
11fdf7f2 TL |
1806 | } |
1807 | ||
1808 | static int | |
1809 | send_vhost_reply(int sockfd, struct VhostUserMsg *msg) | |
1810 | { | |
1811 | if (!msg) | |
1812 | return 0; | |
1813 | ||
1814 | msg->flags &= ~VHOST_USER_VERSION_MASK; | |
1815 | msg->flags &= ~VHOST_USER_NEED_REPLY; | |
1816 | msg->flags |= VHOST_USER_VERSION; | |
1817 | msg->flags |= VHOST_USER_REPLY_MASK; | |
1818 | ||
9f95a23c | 1819 | return send_vhost_message(sockfd, msg); |
11fdf7f2 TL |
1820 | } |
1821 | ||
1822 | static int | |
9f95a23c | 1823 | send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg) |
11fdf7f2 TL |
1824 | { |
1825 | int ret; | |
1826 | ||
1827 | if (msg->flags & VHOST_USER_NEED_REPLY) | |
1828 | rte_spinlock_lock(&dev->slave_req_lock); | |
1829 | ||
9f95a23c | 1830 | ret = send_vhost_message(dev->slave_req_fd, msg); |
11fdf7f2 TL |
1831 | if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY)) |
1832 | rte_spinlock_unlock(&dev->slave_req_lock); | |
1833 | ||
1834 | return ret; | |
1835 | } | |
1836 | ||
1837 | /* | |
1838 | * Allocate a queue pair if it hasn't been allocated yet | |
1839 | */ | |
1840 | static int | |
9f95a23c TL |
1841 | vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, |
1842 | struct VhostUserMsg *msg) | |
11fdf7f2 TL |
1843 | { |
1844 | uint16_t vring_idx; | |
1845 | ||
1846 | switch (msg->request.master) { | |
1847 | case VHOST_USER_SET_VRING_KICK: | |
1848 | case VHOST_USER_SET_VRING_CALL: | |
1849 | case VHOST_USER_SET_VRING_ERR: | |
1850 | vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; | |
1851 | break; | |
1852 | case VHOST_USER_SET_VRING_NUM: | |
1853 | case VHOST_USER_SET_VRING_BASE: | |
1854 | case VHOST_USER_SET_VRING_ENABLE: | |
1855 | vring_idx = msg->payload.state.index; | |
1856 | break; | |
1857 | case VHOST_USER_SET_VRING_ADDR: | |
1858 | vring_idx = msg->payload.addr.index; | |
1859 | break; | |
1860 | default: | |
1861 | return 0; | |
1862 | } | |
1863 | ||
1864 | if (vring_idx >= VHOST_MAX_VRING) { | |
1865 | RTE_LOG(ERR, VHOST_CONFIG, | |
1866 | "invalid vring index: %u\n", vring_idx); | |
1867 | return -1; | |
1868 | } | |
1869 | ||
1870 | if (dev->virtqueue[vring_idx]) | |
1871 | return 0; | |
1872 | ||
1873 | return alloc_vring_queue(dev, vring_idx); | |
1874 | } | |
1875 | ||
1876 | static void | |
1877 | vhost_user_lock_all_queue_pairs(struct virtio_net *dev) | |
1878 | { | |
1879 | unsigned int i = 0; | |
1880 | unsigned int vq_num = 0; | |
1881 | ||
1882 | while (vq_num < dev->nr_vring) { | |
1883 | struct vhost_virtqueue *vq = dev->virtqueue[i]; | |
1884 | ||
1885 | if (vq) { | |
1886 | rte_spinlock_lock(&vq->access_lock); | |
1887 | vq_num++; | |
1888 | } | |
1889 | i++; | |
1890 | } | |
1891 | } | |
1892 | ||
1893 | static void | |
1894 | vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) | |
1895 | { | |
1896 | unsigned int i = 0; | |
1897 | unsigned int vq_num = 0; | |
1898 | ||
1899 | while (vq_num < dev->nr_vring) { | |
1900 | struct vhost_virtqueue *vq = dev->virtqueue[i]; | |
1901 | ||
1902 | if (vq) { | |
1903 | rte_spinlock_unlock(&vq->access_lock); | |
1904 | vq_num++; | |
1905 | } | |
1906 | i++; | |
1907 | } | |
1908 | } | |
1909 | ||
1910 | int | |
1911 | vhost_user_msg_handler(int vid, int fd) | |
1912 | { | |
1913 | struct virtio_net *dev; | |
1914 | struct VhostUserMsg msg; | |
1915 | struct rte_vdpa_device *vdpa_dev; | |
1916 | int did = -1; | |
1917 | int ret; | |
1918 | int unlock_required = 0; | |
9f95a23c TL |
1919 | bool handled; |
1920 | int request; | |
11fdf7f2 TL |
1921 | |
1922 | dev = get_device(vid); | |
1923 | if (dev == NULL) | |
1924 | return -1; | |
1925 | ||
1926 | if (!dev->notify_ops) { | |
1927 | dev->notify_ops = vhost_driver_callback_get(dev->ifname); | |
1928 | if (!dev->notify_ops) { | |
1929 | RTE_LOG(ERR, VHOST_CONFIG, | |
1930 | "failed to get callback ops for driver %s\n", | |
1931 | dev->ifname); | |
1932 | return -1; | |
1933 | } | |
1934 | } | |
1935 | ||
1936 | ret = read_vhost_message(fd, &msg); | |
9f95a23c | 1937 | if (ret <= 0) { |
11fdf7f2 TL |
1938 | if (ret < 0) |
1939 | RTE_LOG(ERR, VHOST_CONFIG, | |
1940 | "vhost read message failed\n"); | |
9f95a23c | 1941 | else |
11fdf7f2 TL |
1942 | RTE_LOG(INFO, VHOST_CONFIG, |
1943 | "vhost peer closed\n"); | |
11fdf7f2 TL |
1944 | |
1945 | return -1; | |
1946 | } | |
1947 | ||
1948 | ret = 0; | |
9f95a23c TL |
1949 | request = msg.request.master; |
1950 | if (request > VHOST_USER_NONE && request < VHOST_USER_MAX && | |
1951 | vhost_message_str[request]) { | |
1952 | if (request != VHOST_USER_IOTLB_MSG) | |
1953 | RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", | |
1954 | vhost_message_str[request]); | |
1955 | else | |
1956 | RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n", | |
1957 | vhost_message_str[request]); | |
1958 | } else { | |
1959 | RTE_LOG(DEBUG, VHOST_CONFIG, "External request %d\n", request); | |
1960 | } | |
11fdf7f2 TL |
1961 | |
1962 | ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); | |
1963 | if (ret < 0) { | |
1964 | RTE_LOG(ERR, VHOST_CONFIG, | |
1965 | "failed to alloc queue\n"); | |
1966 | return -1; | |
1967 | } | |
1968 | ||
1969 | /* | |
1970 | * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE | |
1971 | * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops | |
1972 | * and device is destroyed. destroy_device waits for queues to be | |
1973 | * inactive, so it is safe. Otherwise taking the access_lock | |
1974 | * would cause a dead lock. | |
1975 | */ | |
9f95a23c | 1976 | switch (request) { |
11fdf7f2 TL |
1977 | case VHOST_USER_SET_FEATURES: |
1978 | case VHOST_USER_SET_PROTOCOL_FEATURES: | |
1979 | case VHOST_USER_SET_OWNER: | |
1980 | case VHOST_USER_SET_MEM_TABLE: | |
1981 | case VHOST_USER_SET_LOG_BASE: | |
1982 | case VHOST_USER_SET_LOG_FD: | |
1983 | case VHOST_USER_SET_VRING_NUM: | |
1984 | case VHOST_USER_SET_VRING_ADDR: | |
1985 | case VHOST_USER_SET_VRING_BASE: | |
1986 | case VHOST_USER_SET_VRING_KICK: | |
1987 | case VHOST_USER_SET_VRING_CALL: | |
1988 | case VHOST_USER_SET_VRING_ERR: | |
1989 | case VHOST_USER_SET_VRING_ENABLE: | |
1990 | case VHOST_USER_SEND_RARP: | |
1991 | case VHOST_USER_NET_SET_MTU: | |
1992 | case VHOST_USER_SET_SLAVE_REQ_FD: | |
1993 | vhost_user_lock_all_queue_pairs(dev); | |
1994 | unlock_required = 1; | |
1995 | break; | |
1996 | default: | |
1997 | break; | |
1998 | ||
1999 | } | |
2000 | ||
9f95a23c | 2001 | handled = false; |
11fdf7f2 | 2002 | if (dev->extern_ops.pre_msg_handle) { |
11fdf7f2 | 2003 | ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, |
9f95a23c TL |
2004 | (void *)&msg); |
2005 | switch (ret) { | |
2006 | case RTE_VHOST_MSG_RESULT_REPLY: | |
11fdf7f2 | 2007 | send_vhost_reply(fd, &msg); |
9f95a23c TL |
2008 | /* Fall-through */ |
2009 | case RTE_VHOST_MSG_RESULT_ERR: | |
2010 | case RTE_VHOST_MSG_RESULT_OK: | |
2011 | handled = true; | |
11fdf7f2 | 2012 | goto skip_to_post_handle; |
9f95a23c TL |
2013 | case RTE_VHOST_MSG_RESULT_NOT_HANDLED: |
2014 | default: | |
2015 | break; | |
2016 | } | |
11fdf7f2 TL |
2017 | } |
2018 | ||
9f95a23c TL |
2019 | if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) { |
2020 | if (!vhost_message_handlers[request]) | |
2021 | goto skip_to_post_handle; | |
2022 | ret = vhost_message_handlers[request](&dev, &msg, fd); | |
11fdf7f2 | 2023 | |
9f95a23c TL |
2024 | switch (ret) { |
2025 | case RTE_VHOST_MSG_RESULT_ERR: | |
2026 | RTE_LOG(ERR, VHOST_CONFIG, | |
2027 | "Processing %s failed.\n", | |
2028 | vhost_message_str[request]); | |
2029 | handled = true; | |
2030 | break; | |
2031 | case RTE_VHOST_MSG_RESULT_OK: | |
2032 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
2033 | "Processing %s succeeded.\n", | |
2034 | vhost_message_str[request]); | |
2035 | handled = true; | |
2036 | break; | |
2037 | case RTE_VHOST_MSG_RESULT_REPLY: | |
2038 | RTE_LOG(DEBUG, VHOST_CONFIG, | |
2039 | "Processing %s succeeded and needs reply.\n", | |
2040 | vhost_message_str[request]); | |
2041 | send_vhost_reply(fd, &msg); | |
2042 | handled = true; | |
2043 | break; | |
2044 | default: | |
2045 | break; | |
2046 | } | |
11fdf7f2 TL |
2047 | } |
2048 | ||
2049 | skip_to_post_handle: | |
9f95a23c TL |
2050 | if (ret != RTE_VHOST_MSG_RESULT_ERR && |
2051 | dev->extern_ops.post_msg_handle) { | |
2052 | ret = (*dev->extern_ops.post_msg_handle)(dev->vid, | |
2053 | (void *)&msg); | |
2054 | switch (ret) { | |
2055 | case RTE_VHOST_MSG_RESULT_REPLY: | |
11fdf7f2 | 2056 | send_vhost_reply(fd, &msg); |
9f95a23c TL |
2057 | /* Fall-through */ |
2058 | case RTE_VHOST_MSG_RESULT_ERR: | |
2059 | case RTE_VHOST_MSG_RESULT_OK: | |
2060 | handled = true; | |
2061 | case RTE_VHOST_MSG_RESULT_NOT_HANDLED: | |
2062 | default: | |
2063 | break; | |
2064 | } | |
11fdf7f2 TL |
2065 | } |
2066 | ||
11fdf7f2 TL |
2067 | if (unlock_required) |
2068 | vhost_user_unlock_all_queue_pairs(dev); | |
2069 | ||
9f95a23c TL |
2070 | /* If message was not handled at this stage, treat it as an error */ |
2071 | if (!handled) { | |
2072 | RTE_LOG(ERR, VHOST_CONFIG, | |
2073 | "vhost message (req: %d) was not handled.\n", request); | |
2074 | ret = RTE_VHOST_MSG_RESULT_ERR; | |
2075 | } | |
2076 | ||
2077 | /* | |
2078 | * If the request required a reply that was already sent, | |
2079 | * this optional reply-ack won't be sent as the | |
2080 | * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply(). | |
2081 | */ | |
11fdf7f2 | 2082 | if (msg.flags & VHOST_USER_NEED_REPLY) { |
9f95a23c | 2083 | msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR; |
11fdf7f2 | 2084 | msg.size = sizeof(msg.payload.u64); |
9f95a23c | 2085 | msg.fd_num = 0; |
11fdf7f2 | 2086 | send_vhost_reply(fd, &msg); |
9f95a23c TL |
2087 | } else if (ret == RTE_VHOST_MSG_RESULT_ERR) { |
2088 | RTE_LOG(ERR, VHOST_CONFIG, | |
2089 | "vhost message handling failed.\n"); | |
2090 | return -1; | |
11fdf7f2 TL |
2091 | } |
2092 | ||
2093 | if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { | |
2094 | dev->flags |= VIRTIO_DEV_READY; | |
2095 | ||
2096 | if (!(dev->flags & VIRTIO_DEV_RUNNING)) { | |
2097 | if (dev->dequeue_zero_copy) { | |
2098 | RTE_LOG(INFO, VHOST_CONFIG, | |
2099 | "dequeue zero copy is enabled\n"); | |
2100 | } | |
2101 | ||
2102 | if (dev->notify_ops->new_device(dev->vid) == 0) | |
2103 | dev->flags |= VIRTIO_DEV_RUNNING; | |
2104 | } | |
2105 | } | |
2106 | ||
2107 | did = dev->vdpa_dev_id; | |
2108 | vdpa_dev = rte_vdpa_get_device(did); | |
2109 | if (vdpa_dev && virtio_is_ready(dev) && | |
2110 | !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) && | |
9f95a23c | 2111 | msg.request.master == VHOST_USER_SET_VRING_CALL) { |
11fdf7f2 TL |
2112 | if (vdpa_dev->ops->dev_conf) |
2113 | vdpa_dev->ops->dev_conf(dev->vid); | |
2114 | dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; | |
11fdf7f2 TL |
2115 | } |
2116 | ||
2117 | return 0; | |
2118 | } | |
2119 | ||
2120 | static int process_slave_message_reply(struct virtio_net *dev, | |
9f95a23c | 2121 | const struct VhostUserMsg *msg) |
11fdf7f2 | 2122 | { |
9f95a23c | 2123 | struct VhostUserMsg msg_reply; |
11fdf7f2 TL |
2124 | int ret; |
2125 | ||
2126 | if ((msg->flags & VHOST_USER_NEED_REPLY) == 0) | |
2127 | return 0; | |
2128 | ||
2129 | if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) { | |
2130 | ret = -1; | |
2131 | goto out; | |
2132 | } | |
2133 | ||
2134 | if (msg_reply.request.slave != msg->request.slave) { | |
2135 | RTE_LOG(ERR, VHOST_CONFIG, | |
2136 | "Received unexpected msg type (%u), expected %u\n", | |
2137 | msg_reply.request.slave, msg->request.slave); | |
2138 | ret = -1; | |
2139 | goto out; | |
2140 | } | |
2141 | ||
2142 | ret = msg_reply.payload.u64 ? -1 : 0; | |
2143 | ||
2144 | out: | |
2145 | rte_spinlock_unlock(&dev->slave_req_lock); | |
2146 | return ret; | |
2147 | } | |
2148 | ||
2149 | int | |
2150 | vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) | |
2151 | { | |
2152 | int ret; | |
2153 | struct VhostUserMsg msg = { | |
2154 | .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, | |
2155 | .flags = VHOST_USER_VERSION, | |
2156 | .size = sizeof(msg.payload.iotlb), | |
2157 | .payload.iotlb = { | |
2158 | .iova = iova, | |
2159 | .perm = perm, | |
2160 | .type = VHOST_IOTLB_MISS, | |
2161 | }, | |
2162 | }; | |
2163 | ||
9f95a23c | 2164 | ret = send_vhost_message(dev->slave_req_fd, &msg); |
11fdf7f2 TL |
2165 | if (ret < 0) { |
2166 | RTE_LOG(ERR, VHOST_CONFIG, | |
2167 | "Failed to send IOTLB miss message (%d)\n", | |
2168 | ret); | |
2169 | return ret; | |
2170 | } | |
2171 | ||
2172 | return 0; | |
2173 | } | |
2174 | ||
2175 | static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, | |
2176 | int index, int fd, | |
2177 | uint64_t offset, | |
2178 | uint64_t size) | |
2179 | { | |
11fdf7f2 TL |
2180 | int ret; |
2181 | struct VhostUserMsg msg = { | |
2182 | .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, | |
2183 | .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, | |
2184 | .size = sizeof(msg.payload.area), | |
2185 | .payload.area = { | |
2186 | .u64 = index & VHOST_USER_VRING_IDX_MASK, | |
2187 | .size = size, | |
2188 | .offset = offset, | |
2189 | }, | |
2190 | }; | |
2191 | ||
2192 | if (fd < 0) | |
2193 | msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; | |
2194 | else { | |
9f95a23c TL |
2195 | msg.fds[0] = fd; |
2196 | msg.fd_num = 1; | |
11fdf7f2 TL |
2197 | } |
2198 | ||
9f95a23c | 2199 | ret = send_vhost_slave_message(dev, &msg); |
11fdf7f2 TL |
2200 | if (ret < 0) { |
2201 | RTE_LOG(ERR, VHOST_CONFIG, | |
2202 | "Failed to set host notifier (%d)\n", ret); | |
2203 | return ret; | |
2204 | } | |
2205 | ||
2206 | return process_slave_message_reply(dev, &msg); | |
2207 | } | |
2208 | ||
9f95a23c | 2209 | int rte_vhost_host_notifier_ctrl(int vid, bool enable) |
11fdf7f2 TL |
2210 | { |
2211 | struct virtio_net *dev; | |
2212 | struct rte_vdpa_device *vdpa_dev; | |
2213 | int vfio_device_fd, did, ret = 0; | |
2214 | uint64_t offset, size; | |
2215 | unsigned int i; | |
2216 | ||
2217 | dev = get_device(vid); | |
2218 | if (!dev) | |
2219 | return -ENODEV; | |
2220 | ||
2221 | did = dev->vdpa_dev_id; | |
2222 | if (did < 0) | |
2223 | return -EINVAL; | |
2224 | ||
2225 | if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || | |
2226 | !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || | |
2227 | !(dev->protocol_features & | |
2228 | (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || | |
2229 | !(dev->protocol_features & | |
2230 | (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || | |
2231 | !(dev->protocol_features & | |
2232 | (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) | |
2233 | return -ENOTSUP; | |
2234 | ||
2235 | vdpa_dev = rte_vdpa_get_device(did); | |
2236 | if (!vdpa_dev) | |
2237 | return -ENODEV; | |
2238 | ||
2239 | RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); | |
2240 | RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); | |
2241 | ||
2242 | vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); | |
2243 | if (vfio_device_fd < 0) | |
2244 | return -ENOTSUP; | |
2245 | ||
2246 | if (enable) { | |
2247 | for (i = 0; i < dev->nr_vring; i++) { | |
2248 | if (vdpa_dev->ops->get_notify_area(vid, i, &offset, | |
2249 | &size) < 0) { | |
2250 | ret = -ENOTSUP; | |
2251 | goto disable; | |
2252 | } | |
2253 | ||
2254 | if (vhost_user_slave_set_vring_host_notifier(dev, i, | |
2255 | vfio_device_fd, offset, size) < 0) { | |
2256 | ret = -EFAULT; | |
2257 | goto disable; | |
2258 | } | |
2259 | } | |
2260 | } else { | |
2261 | disable: | |
2262 | for (i = 0; i < dev->nr_vring; i++) { | |
2263 | vhost_user_slave_set_vring_host_notifier(dev, i, -1, | |
2264 | 0, 0); | |
2265 | } | |
2266 | } | |
2267 | ||
2268 | return ret; | |
2269 | } |