/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"

#ifdef __FreeBSD__
#define SPDK_VFIO_ENABLED 0
#else
#include <linux/version.h>
/*
 * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API,
 * and we can't link to internal symbols when built against shared library DPDK,
 * so disable VFIO entirely in that case.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \
	(RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB))

#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#include <rte_vfio.h>
#else
/* Internal DPDK function forward declaration */
int pci_vfio_is_enabled(void);
#endif

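/* Bookkeeping for a single SPDK-created VFIO DMA mapping: pre-filled
 * VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctl arguments plus linkage
 * in the g_vfio.maps list.
 */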
struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	unsigned device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};

static struct vfio_cfg g_vfio = {
	.fd = -1,
	.enabled = false,
	.device_ref = 0,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define SPDK_VFIO_ENABLED 0
#endif
#endif

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

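/* Reference-counted entry in the list of PCI devices whose BARs may back
 * SPDK-registered memory (see vtophys_get_paddr_pci()).
 */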
struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
	uint64_t ref;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
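/* Map a region into the VFIO container at the given iova. If no
 * SPDK-managed device (IOMMU group) has been added to the container yet,
 * only record the mapping; it is replayed when the first device is
 * hotplugged (see spdk_vtophys_pci_device_added()).
 */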
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such a
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}

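/* Undo a mapping created by vtophys_iommu_map_dma(). The actual
 * VFIO_IOMMU_UNMAP_DMA ioctl is skipped when no SPDK-managed device
 * remains in the container, because the mappings were already torn down
 * when the last device was removed.
 */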
static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif

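/* Look up the physical address (or iova) of a DPDK-managed virtual
 * address using the DPDK memseg tables.
 */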
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
			if (paddr == RTE_BAD_IOVA) {
#else
			if (paddr == RTE_BAD_PHYS_ADDR) {
#endif
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}

/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
#define BAD_ADDR RTE_BAD_IOVA
#define VTOPHYS rte_mem_virt2iova
#else
#define BAD_ADDR RTE_BAD_PHYS_ADDR
#define VTOPHYS rte_mem_virt2phy
#endif

	/*
	 * Note: the virt2phy/virt2iova functions have changed over time, such
	 * that older versions may return 0 on failure while recent versions
	 * return RTE_BAD_PHYS_ADDR/RTE_BAD_IOVA instead. To support both older
	 * and newer versions, check for both return values.
	 */
	paddr = VTOPHYS((void *)vaddr);
	if (paddr == 0 || paddr == BAD_ADDR) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = VTOPHYS((void *)vaddr);
	}
	if (paddr == 0 || paddr == BAD_ADDR) {
		/* Unable to get the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

#undef BAD_ADDR
#undef VTOPHYS

	return paddr;
}

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 1)
	struct rte_mem_resource *res;
#else
	struct rte_pci_resource *res;
#endif
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}

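/* Memory map notification callback, invoked for every 2MB page that is
 * registered with or unregistered from SPDK. Resolves the page's physical
 * address (programming the IOMMU via VFIO where applicable) and records
 * or clears the translation in the vtophys map.
 */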
static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	while (len > 0) {
		/* Get the physical address from the DPDK memsegs */
		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

		switch (action) {
		case SPDK_MEM_MAP_NOTIFY_REGISTER:
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
				if (g_vfio.enabled) {
					/* We'll use the virtual address as the iova. DPDK
					 * currently uses physical addresses as the iovas (or counts
					 * up from 0 if it can't get physical addresses), so
					 * the range of user space virtual addresses and physical
					 * addresses will never overlap.
					 */
					paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				} else
#endif
				{
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						/* Get the physical address from PCI devices */
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
						if (paddr == SPDK_VTOPHYS_ERROR) {
							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
							return -EFAULT;
						}
						pci_phys = 1;
					}
				}
			}
			/* Since a PCI BAR paddr can break the 2MiB physical alignment,
			 * skip the alignment check for PCI-backed addresses. */
			if (!pci_phys && (paddr & MASK_2MB)) {
				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
				return -EINVAL;
			}

			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
			break;
		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/*
				 * This is not an address that DPDK is managing. If vfio is
				 * enabled, we need to unmap the range from the IOMMU.
				 */
				if (g_vfio.enabled) {
					uint64_t buffer_len;
					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				}
			}
#endif
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

		if (rc != 0) {
			return rc;
		}
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	return rc;
}

#if SPDK_VFIO_ENABLED

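/* Return true if DPDK is using VFIO (vfio-pci) to access devices. */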
static bool
spdk_vfio_enabled(void)
{
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	return rte_vfio_is_enabled("vfio_pci");
#else
	return pci_vfio_is_enabled();
#endif
}

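/* Discover the VFIO container fd that DPDK opened by scanning
 * /proc/self/fd for a link to /dev/vfio/vfio, and mark VFIO-based
 * vtophys as enabled if it is found.
 */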
static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;

	return;
}
#endif

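/* Track a hotplugged SPDK-managed PCI device and, under VFIO, replay all
 * deferred DMA mappings once the first device (IOMMU group) joins the
 * container.
 */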
void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	bool found = false;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			vtophys_dev->ref++;
			found = true;
			break;
		}
	}

	if (!found) {
		vtophys_dev = calloc(1, sizeof(*vtophys_dev));
		if (vtophys_dev) {
			vtophys_dev->pci_device = pci_device;
			vtophys_dev->ref = 1;
			TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
		} else {
			DEBUG_PRINT("Memory allocation error\n");
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point on, it is certain that the memory can be mapped.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

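/* Drop the reference to a removed SPDK-managed PCI device and, under
 * VFIO, manually unmap all regions once the last device leaves the
 * container so they can be cleanly re-mapped later.
 */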
void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			assert(vtophys_dev->ref > 0);
			if (--vtophys_dev->ref == 0) {
				TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
				free(vtophys_dev);
			}
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}

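/* Allocate the global vtophys translation map and, when VFIO is compiled
 * in, discover the DPDK VFIO container.
 */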
int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = NULL
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -1;
	}
	return 0;
}

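/* Translate a virtual address to a physical address (or iova). Returns
 * SPDK_VTOPHYS_ERROR if no translation is registered for the containing
 * 2MB page.
 */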
uint64_t
spdk_vtophys(void *buf)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;

	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, NULL);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so a failed lookup could
	 * previously just be bitwise-or'd with the buf offset and still yield
	 * SPDK_VTOPHYS_ERROR. Now that we use + rather than | (because PCI
	 * paddrs can be unaligned), the return value must be checked before
	 * the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + ((uint64_t)buf & MASK_2MB);
	}
}

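/* Stub bus registered with DPDK. It never scans or finds any devices; it
 * exists so SPDK can report an IOVA mode preference to EAL (see
 * spdk_bus_get_iommu_class() below).
 */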
static int
spdk_bus_scan(void)
{
	return 0;
}

static int
spdk_bus_probe(void)
{
	return 0;
}

static struct rte_device *
spdk_bus_find_device(const struct rte_device *start,
		     rte_dev_cmp_t cmp, const void *data)
{
	return NULL;
}

#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
static enum rte_iova_mode
spdk_bus_get_iommu_class(void)
{
	/* Since we register our PCI drivers after EAL init, we have no chance
	 * of switching into the RTE_IOVA_VA (virtual addresses as iova) iommu
	 * class. DPDK uses RTE_IOVA_PA by default because for some platforms
	 * it's the only supported mode, but SPDK does not support those
	 * platforms and doesn't mind defaulting to RTE_IOVA_VA. The rte_pci bus
	 * will force RTE_IOVA_PA if RTE_IOVA_VA simply cannot be used
	 * (i.e. at least one device on the system is bound to uio_pci_generic),
	 * so we simply return RTE_IOVA_VA here.
	 */
	return RTE_IOVA_VA;
}
#endif

struct rte_bus spdk_bus = {
	.scan = spdk_bus_scan,
	.probe = spdk_bus_probe,
	.find_device = spdk_bus_find_device,
#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3)
	.get_iommu_class = spdk_bus_get_iommu_class,
#endif
};

RTE_REGISTER_BUS(spdk, spdk_bus);