]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause |
2 | * Copyright(c) 2016 Intel Corporation | |
11fdf7f2 TL |
3 | */ |
4 | ||
5 | #include <sys/types.h> | |
6 | #include <sys/stat.h> | |
7 | #include <fcntl.h> | |
8 | #include <unistd.h> | |
9 | ||
10 | #include <rte_memory.h> | |
11 | #include <rte_eal_memconfig.h> | |
12 | ||
13 | #include "vhost.h" | |
14 | #include "virtio_user_dev.h" | |
15 | #include "vhost_kernel_tap.h" | |
16 | ||
/* Memory table passed to the vhost kernel module via
 * VHOST_SET_MEM_TABLE: a region count followed by a variable number of
 * region descriptors.  Layout matches struct vhost_memory in the
 * kernel's vhost UAPI.
 */
struct vhost_memory_kernel {
	uint32_t nregions;	/* number of valid entries in regions[] */
	uint32_t padding;
	/* C99 flexible array member (the GNU zero-length array form
	 * `regions[0]` is a compiler extension; `[]` is standard).
	 */
	struct vhost_memory_region regions[];
};
22 | ||
/* vhost kernel ioctls.
 * Redefined locally (numbers match the kernel's vhost UAPI in
 * linux/vhost.h) so this file does not pull in kernel headers.
 * _IO/_IOR/_IOW/_IOWR encode direction and argument size into the
 * request number; VHOST_VIRTIO (0xAF) is the vhost ioctl magic.
 */
#define VHOST_VIRTIO 0xAF
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
40 | ||
/* Default region limit of the vhost kernel module; may be raised via
 * the module's max_mem_regions parameter.
 */
static uint64_t max_regions = 64;

/* Refresh max_regions from
 * /sys/module/vhost/parameters/max_mem_regions.
 *
 * Best-effort: if the file cannot be opened or read, or does not
 * contain a positive number, the current value is kept.  (The original
 * code assigned strtoull()'s result unconditionally, so unparsable
 * content silently set max_regions to 0 and broke memory-table setup.)
 */
static void
get_vhost_kernel_max_regions(void)
{
	int fd;
	char buf[20] = {'\0'};
	uint64_t val;

	fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY);
	if (fd < 0)
		return;

	/* sizeof(buf) - 1 keeps the buffer NUL-terminated for strtoull. */
	if (read(fd, buf, sizeof(buf) - 1) > 0) {
		val = strtoull(buf, NULL, 10);
		if (val > 0)
			max_regions = val;
	}

	close(fd);
}
58 | ||
/* Translation table from vhost-user request codes (array index) to the
 * equivalent vhost kernel ioctl numbers.  Requests without an entry
 * here are not supported by this backend.
 */
static uint64_t vhost_req_user_to_kernel[] = {
	[VHOST_USER_SET_OWNER] = VHOST_SET_OWNER,
	[VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER,
	[VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES,
	[VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES,
	[VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL,
	[VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM,
	[VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE,
	[VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE,
	[VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR,
	[VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK,
	[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
};
72 | ||
9f95a23c TL |
73 | static int |
74 | add_memseg_list(const struct rte_memseg_list *msl, void *arg) | |
75 | { | |
76 | struct vhost_memory_kernel *vm = arg; | |
77 | struct vhost_memory_region *mr; | |
78 | void *start_addr; | |
79 | uint64_t len; | |
80 | ||
81 | if (msl->external) | |
82 | return 0; | |
83 | ||
84 | if (vm->nregions >= max_regions) | |
85 | return -1; | |
86 | ||
87 | start_addr = msl->base_va; | |
88 | len = msl->page_sz * msl->memseg_arr.len; | |
89 | ||
90 | mr = &vm->regions[vm->nregions++]; | |
91 | ||
92 | mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr; | |
93 | mr->userspace_addr = (uint64_t)(uintptr_t)start_addr; | |
94 | mr->memory_size = len; | |
95 | mr->mmap_offset = 0; /* flags_padding */ | |
96 | ||
97 | PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64, | |
98 | vm->nregions - 1, start_addr, len); | |
99 | ||
100 | return 0; | |
101 | } | |
102 | ||
103 | /* By default, vhost kernel module allows 64 regions, but DPDK may | |
104 | * have much more memory regions. Below function will treat each | |
105 | * contiguous memory space reserved by DPDK as one region. | |
11fdf7f2 TL |
106 | */ |
107 | static struct vhost_memory_kernel * | |
108 | prepare_vhost_memory_kernel(void) | |
109 | { | |
11fdf7f2 TL |
110 | struct vhost_memory_kernel *vm; |
111 | ||
112 | vm = malloc(sizeof(struct vhost_memory_kernel) + | |
9f95a23c TL |
113 | max_regions * |
114 | sizeof(struct vhost_memory_region)); | |
11fdf7f2 TL |
115 | if (!vm) |
116 | return NULL; | |
117 | ||
9f95a23c TL |
118 | vm->nregions = 0; |
119 | vm->padding = 0; | |
11fdf7f2 | 120 | |
9f95a23c TL |
121 | /* |
122 | * The memory lock has already been taken by memory subsystem | |
123 | * or virtio_user_start_device(). | |
124 | */ | |
125 | if (rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm) < 0) { | |
126 | free(vm); | |
127 | return NULL; | |
11fdf7f2 TL |
128 | } |
129 | ||
11fdf7f2 TL |
130 | return vm; |
131 | } | |
132 | ||
/* with below features, vhost kernel does not need to do the checksum and TSO,
 * these info will be passed to virtio_user through virtio net header.
 */
#define VHOST_KERNEL_GUEST_OFFLOADS_MASK	\
	((1ULL << VIRTIO_NET_F_GUEST_CSUM) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |	\
	 (1ULL << VIRTIO_NET_F_GUEST_UFO))

/* with below features, when flows from virtio_user to vhost kernel
 * (1) if flows goes up through the kernel networking stack, it does not need
 * to verify checksum, which can save CPU cycles;
 * (2) if flows goes through a Linux bridge and outside from an interface
 * (kernel driver), checksum and TSO will be done by GSO in kernel or even
 * offloaded into real physical device.
 */
#define VHOST_KERNEL_HOST_OFFLOADS_MASK		\
	((1ULL << VIRTIO_NET_F_HOST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_HOST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_CSUM))
154 | ||
9f95a23c TL |
155 | static unsigned int |
156 | tap_support_features(void) | |
11fdf7f2 TL |
157 | { |
158 | int tapfd; | |
159 | unsigned int tap_features; | |
160 | ||
161 | tapfd = open(PATH_NET_TUN, O_RDWR); | |
162 | if (tapfd < 0) { | |
163 | PMD_DRV_LOG(ERR, "fail to open %s: %s", | |
164 | PATH_NET_TUN, strerror(errno)); | |
165 | return -1; | |
166 | } | |
167 | ||
168 | if (ioctl(tapfd, TUNGETFEATURES, &tap_features) == -1) { | |
169 | PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); | |
170 | close(tapfd); | |
171 | return -1; | |
172 | } | |
173 | ||
174 | close(tapfd); | |
9f95a23c | 175 | return tap_features; |
11fdf7f2 TL |
176 | } |
177 | ||
/* Translate a vhost-user request into the matching vhost kernel ioctl
 * and issue it.
 *
 * Per-vring requests are routed to the single vhost fd that owns the
 * queue pair (and @arg's queue index is rewritten to the fd-local
 * vring index 0/1); all other requests are broadcast to every open
 * vhost fd.  For SET_MEM_TABLE, @arg is replaced by a freshly built
 * region table; for SET_FEATURES, feature bits the kernel backend
 * does not understand are masked out of *arg in place.
 *
 * Returns the last ioctl result: 0 on success, negative on failure
 * (also -1 if no vhost fd was available for a broadcast request).
 */
static int
vhost_kernel_ioctl(struct virtio_user_dev *dev,
		   enum vhost_user_request req,
		   void *arg)
{
	int ret = -1;
	unsigned int i;
	uint64_t req_kernel;
	struct vhost_memory_kernel *vm = NULL;
	int vhostfd;
	unsigned int queue_sel;
	unsigned int features;

	PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]);

	req_kernel = vhost_req_user_to_kernel[req];

	/* Build the memory-region table from DPDK memsegs; it replaces
	 * the caller-supplied argument and is freed before returning.
	 */
	if (req_kernel == VHOST_SET_MEM_TABLE) {
		vm = prepare_vhost_memory_kernel();
		if (!vm)
			return -1;
		arg = (void *)vm;
	}

	if (req_kernel == VHOST_SET_FEATURES) {
		/* We don't need memory protection here */
		*(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);

		/* VHOST kernel does not know about below flags */
		*(uint64_t *)arg &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK;
		*(uint64_t *)arg &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK;

		*(uint64_t *)arg &= ~(1ULL << VIRTIO_NET_F_MQ);
	}

	/* Pick the target fd: per-vring requests carry a global queue
	 * index in *arg; queue_sel / 2 selects the queue-pair fd and
	 * queue_sel % 2 is the RX/TX vring index within that fd.
	 */
	switch (req_kernel) {
	case VHOST_SET_VRING_NUM:
	case VHOST_SET_VRING_ADDR:
	case VHOST_SET_VRING_BASE:
	case VHOST_GET_VRING_BASE:
	case VHOST_SET_VRING_KICK:
	case VHOST_SET_VRING_CALL:
		queue_sel = *(unsigned int *)arg;
		vhostfd = dev->vhostfds[queue_sel / 2];
		*(unsigned int *)arg = queue_sel % 2;
		PMD_DRV_LOG(DEBUG, "vhostfd=%d, index=%u",
			    vhostfd, *(unsigned int *)arg);
		break;
	default:
		vhostfd = -1;
	}
	if (vhostfd == -1) {
		/* Broadcast to every open per-queue-pair vhost fd,
		 * stopping at the first failure.
		 */
		for (i = 0; i < dev->max_queue_pairs; ++i) {
			if (dev->vhostfds[i] < 0)
				continue;

			ret = ioctl(dev->vhostfds[i], req_kernel, arg);
			if (ret < 0)
				break;
		}
	} else {
		ret = ioctl(vhostfd, req_kernel, arg);
	}

	if (!ret && req_kernel == VHOST_GET_FEATURES) {
		features = tap_support_features();
		/* with tap as the backend, all these features are supported
		 * but not claimed by vhost-net, so we add them back when
		 * reporting to upper layer.
		 */
		if (features & IFF_VNET_HDR) {
			*((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
			*((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
		}

		/* vhost_kernel will not declare this feature, but it does
		 * support multi-queue.
		 */
		if (features & IFF_MULTI_QUEUE)
			*(uint64_t *)arg |= (1ull << VIRTIO_NET_F_MQ);
	}

	if (vm)
		free(vm);

	if (ret < 0)
		PMD_DRV_LOG(ERR, "%s failed: %s",
			    vhost_msg_strings[req], strerror(errno));

	return ret;
}
269 | ||
270 | /** | |
271 | * Set up environment to talk with a vhost kernel backend. | |
272 | * | |
273 | * @return | |
274 | * - (-1) if fail to set up; | |
275 | * - (>=0) if successful. | |
276 | */ | |
277 | static int | |
278 | vhost_kernel_setup(struct virtio_user_dev *dev) | |
279 | { | |
280 | int vhostfd; | |
281 | uint32_t i; | |
282 | ||
283 | get_vhost_kernel_max_regions(); | |
284 | ||
285 | for (i = 0; i < dev->max_queue_pairs; ++i) { | |
286 | vhostfd = open(dev->path, O_RDWR); | |
287 | if (vhostfd < 0) { | |
288 | PMD_DRV_LOG(ERR, "fail to open %s, %s", | |
289 | dev->path, strerror(errno)); | |
290 | return -1; | |
291 | } | |
292 | ||
293 | dev->vhostfds[i] = vhostfd; | |
294 | } | |
295 | ||
296 | return 0; | |
297 | } | |
298 | ||
299 | static int | |
300 | vhost_kernel_set_backend(int vhostfd, int tapfd) | |
301 | { | |
302 | struct vhost_vring_file f; | |
303 | ||
304 | f.fd = tapfd; | |
305 | f.index = 0; | |
306 | if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { | |
307 | PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", | |
308 | strerror(errno)); | |
309 | return -1; | |
310 | } | |
311 | ||
312 | f.index = 1; | |
313 | if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { | |
314 | PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", | |
315 | strerror(errno)); | |
316 | return -1; | |
317 | } | |
318 | ||
319 | return 0; | |
320 | } | |
321 | ||
322 | static int | |
323 | vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev, | |
324 | uint16_t pair_idx, | |
325 | int enable) | |
326 | { | |
327 | int hdr_size; | |
328 | int vhostfd; | |
329 | int tapfd; | |
330 | int req_mq = (dev->max_queue_pairs > 1); | |
331 | ||
332 | vhostfd = dev->vhostfds[pair_idx]; | |
333 | ||
334 | if (!enable) { | |
335 | if (dev->tapfds[pair_idx] >= 0) { | |
336 | close(dev->tapfds[pair_idx]); | |
337 | dev->tapfds[pair_idx] = -1; | |
338 | } | |
339 | return vhost_kernel_set_backend(vhostfd, -1); | |
340 | } else if (dev->tapfds[pair_idx] >= 0) { | |
341 | return 0; | |
342 | } | |
343 | ||
344 | if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) || | |
345 | (dev->features & (1ULL << VIRTIO_F_VERSION_1))) | |
346 | hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); | |
347 | else | |
348 | hdr_size = sizeof(struct virtio_net_hdr); | |
349 | ||
9f95a23c TL |
350 | tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq, |
351 | (char *)dev->mac_addr, dev->features); | |
11fdf7f2 TL |
352 | if (tapfd < 0) { |
353 | PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel"); | |
354 | return -1; | |
355 | } | |
356 | ||
357 | if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) { | |
358 | PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel"); | |
359 | close(tapfd); | |
360 | return -1; | |
361 | } | |
362 | ||
363 | dev->tapfds[pair_idx] = tapfd; | |
364 | return 0; | |
365 | } | |
366 | ||
/* Backend ops table plugged into virtio_user for the vhost-kernel
 * transport.
 */
struct virtio_user_backend_ops virtio_ops_kernel = {
	.setup = vhost_kernel_setup,
	.send_request = vhost_kernel_ioctl,
	.enable_qp = vhost_kernel_enable_queue_pair
};