/*-
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/env_dpdk.h"
#define SPDK_VFIO_ENABLED 0

#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define SPDK_VFIO_ENABLED 1
#include <linux/vfio.h>
struct spdk_vfio_dma_map {
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;
	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
};

struct vfio_cfg {
	int fd;
	bool enabled;
	bool noiommu_enabled;
	int device_ref;
	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
	pthread_mutex_t mutex;
};
static struct vfio_cfg g_vfio = {
	.fd = -1,
	.noiommu_enabled = false,
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};
#define SPDK_VFIO_ENABLED 0
#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif
#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
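
/* Worked example (illustrative, assuming the usual SHIFT_2MB = 21 and
 * SHIFT_1GB = 30 from spdk_internal/memory.h): a virtual address is split as
 *   vfn_2mb   = vaddr >> SHIFT_2MB       (2MB virtual frame number)
 *   idx_256tb = MAP_256TB_IDX(vfn_2mb)   (bits [30..47] of vaddr)
 *   idx_1gb   = MAP_1GB_IDX(vfn_2mb)     (bits [21..29] of vaddr)
 * e.g. vaddr = 0x7f8012600000 gives vfn_2mb = 0x3fc0093,
 * idx_256tb = 0x1fe00 and idx_1gb = 0x93.
 */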
/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)
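
/* Example: a 6MB registration starting at vaddr X sets three consecutive 2MB
 * entries in the registration map; all three carry REG_MAP_REGISTERED and the
 * entry for X additionally carries REG_MAP_NOTIFY_START, so spdk_mem_unregister()
 * can verify that an unregistration starts exactly where a registration started.
 */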
/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};
/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default (error)
 * value for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};
/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};
/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};
/* Registrations map. The 64-bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *        62 : REG_MAP_REGISTERED
 *        63 : REG_MAP_NOTIFY_START
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);
	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}

			contig_start = UINT64_MAX;
			continue;
		}
		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}

				contig_start = UINT64_MAX;
			}
		}
	}
	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;
	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}

			contig_end = UINT64_MAX;
			continue;
		}
		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
				}

				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}
	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}
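
/*
 * Allocate a translation map with the given default translation value and
 * optional callbacks. If a notify callback is supplied, the already-registered
 * memory is replayed into the new map before it is added to g_spdk_mem_maps.
 *
 * Illustrative usage (hypothetical consumer code, not taken from this file):
 *
 *	static const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify };
 *	struct spdk_mem_map *m = spdk_mem_map_alloc(0, &my_ops, my_ctx);
 *	...
 *	spdk_mem_map_free(&m);
 */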
struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
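
/*
 * Release a map previously created with spdk_mem_map_alloc(). If the map has a
 * notify callback, every registered region is first walked with an UNREGISTER
 * notification and the map is removed from the global list.
 */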
void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (pmap == NULL) {
		return;
	}

	map = *pmap;
	if (map == NULL) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}
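
/*
 * Register a 2MB-aligned virtual region with the global registration map and
 * notify every allocated mem map. The whole range must currently be
 * unregistered; otherwise the call fails before any notification is sent.
 */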
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
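
/*
 * Unregister a range previously passed to spdk_mem_register(). The range must
 * start at a region boundary (REG_MAP_NOTIFY_START) and must cover whole
 * regions; each region is reported to the registered maps with an UNREGISTER
 * notification.
 */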
int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well,
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
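
/*
 * Look up (or lazily allocate) the second-level table covering the given 2MB
 * virtual frame number. Allocation is done under map->mutex, with a re-check
 * after acquiring the lock, so concurrent callers race safely.
 */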
static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb != NULL) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}
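
/*
 * Store the same translation value for every 2MB page in [vaddr, vaddr + size).
 * Both vaddr and size must be 2MB aligned.
 */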
int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}
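
/*
 * Reset every 2MB page in [vaddr, vaddr + size) back to the map's default
 * translation. Both vaddr and size must be 2MB aligned.
 */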
int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}
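
/*
 * Translate a virtual address through the map. When *size is supplied and the
 * map provides an are_contiguous() callback, the translation is extended across
 * neighbouring 2MB pages and *size is clamped to the contiguous run that was
 * found; otherwise *size is clamped to the remainder of the current 2MB page.
 */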
uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
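
/*
 * DPDK 18.05+ exposes a memory-event API; the callbacks below keep the SPDK
 * registration map in sync with DPDK's dynamic hugepage allocations and frees.
 */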
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
		if (!spdk_env_dpdk_external_init()) {
			return;
		}
#endif

		/* Prior to DPDK 19.02, we have to worry about DPDK
		 * freeing memory in different units than it was allocated.
		 * That doesn't work with things like RDMA MRs. So for
		 * those versions of DPDK, mark each segment so that DPDK
		 * won't later free it. That ensures we don't have to deal
		 * with that scenario.
		 *
		 * DPDK 19.02 added the --match-allocations RTE flag to
		 * avoid this condition.
		 *
		 * Note: if the user initialized DPDK separately, we can't
		 * be sure that --match-allocations was specified, so need
		 * to still mark the segments so they aren't freed.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif
int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}
bool
spdk_iommu_is_enabled(void)
{
#if SPDK_VFIO_ENABLED
	return g_vfio.enabled && !g_vfio.noiommu_enabled;
#else
	return false;
#endif
}
struct spdk_vtophys_pci_device {
	struct rte_pci_device *pci_device;
	TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
};

static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
	TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);

static struct spdk_mem_map *g_vtophys_map;

#if SPDK_VFIO_ENABLED
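/*
 * Program a single VFIO DMA mapping and remember it in g_vfio.maps so it can
 * be replayed when the first SPDK-managed device joins the container, or
 * removed again later.
 */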
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	dma_map->unmap.argsz = sizeof(dma_map->unmap);
	dma_map->unmap.flags = 0;
	dma_map->unmap.iova = iova;
	dma_map->unmap.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK - e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * scenario, but there's no need to do it. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * attached to the container.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	return 0;
}
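
/*
 * Remove a VFIO DMA mapping previously created by vtophys_iommu_map_dma().
 * Only exact, full-size unmaps of a known IOVA are supported.
 */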
static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	/* don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references */
		goto out_remove;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
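
/* Try to get the paddr from the DPDK memsegs (the common case for hugepage
 * memory that DPDK itself allocated).
 */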
static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
	uintptr_t paddr;
	struct rte_memseg *seg;

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
	if (seg != NULL) {
		paddr = seg->phys_addr;
		if (paddr == RTE_BAD_IOVA) {
			return SPDK_VTOPHYS_ERROR;
		}
		paddr += (vaddr - (uintptr_t)seg->addr);
		return paddr;
	}
#else
	struct rte_mem_config *mcfg;
	uint32_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		seg = &mcfg->memseg[seg_idx];
		if (seg->addr == NULL) {
			break;
		}

		if (vaddr >= (uintptr_t)seg->addr &&
		    vaddr < ((uintptr_t)seg->addr + seg->len)) {
			paddr = seg->phys_addr;
			if (paddr == RTE_BAD_IOVA) {
				return SPDK_VTOPHYS_ERROR;
			}
			paddr += (vaddr - (uintptr_t)seg->addr);
			return paddr;
		}
	}
#endif

	return SPDK_VTOPHYS_ERROR;
}
/* Try to get the paddr from /proc/self/pagemap */
static uint64_t
vtophys_get_paddr_pagemap(uint64_t vaddr)
{
	uintptr_t paddr;

	paddr = rte_mem_virt2iova((void *)vaddr);
	if (paddr == RTE_BAD_IOVA) {
		/*
		 * The vaddr may be valid but doesn't have a backing page
		 * assigned yet. Touch the page to ensure a backing page
		 * gets assigned, then try to translate again.
		 */
		rte_atomic64_read((rte_atomic64_t *)vaddr);
		paddr = rte_mem_virt2iova((void *)vaddr);
	}
	if (paddr == RTE_BAD_IOVA) {
		/* Unable to get to the physical address. */
		return SPDK_VTOPHYS_ERROR;
	}

	return paddr;
}
/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
	struct rte_pci_device *dev;
	struct rte_mem_resource *res;
	unsigned r;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;

		for (r = 0; r < PCI_MAX_RESOURCE; r++) {
			res = &dev->mem_resource[r];
			if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
			    vaddr < (uint64_t)res->addr + res->len) {
				paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
				DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
					    (void *)paddr);
				pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
				return paddr;
			}
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

	return SPDK_VTOPHYS_ERROR;
}
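
/*
 * Notification callback for the vtophys map: on REGISTER it resolves a physical
 * (or IOVA) address for each 2MB page and stores the translation; on UNREGISTER
 * it clears the translation and, when the IOMMU is in use, tears down the VFIO
 * DMA mapping.
 */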
static int
spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		    enum spdk_mem_map_notify_action action,
		    void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	while (len > 0) {
		/* Get the physical address from the DPDK memsegs */
		paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

		switch (action) {
		case SPDK_MEM_MAP_NOTIFY_REGISTER:
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/* This is not an address that DPDK is managing. */
#if SPDK_VFIO_ENABLED
				if (spdk_iommu_is_enabled()) {
					/* We'll use the virtual address as the iova. DPDK
					 * currently uses physical addresses as the iovas (or counts
					 * up from 0 if it can't get physical addresses), so
					 * the range of user space virtual addresses and physical
					 * addresses will never overlap.
					 */
					paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				} else
#endif
				{
					/* Get the physical address from /proc/self/pagemap. */
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						/* Get the physical address from PCI devices */
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
						if (paddr == SPDK_VTOPHYS_ERROR) {
							DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
							return -EFAULT;
						}
						pci_phys = 1;
					}
				}
			}

			/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
			if (!pci_phys && (paddr & MASK_2MB)) {
				DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
				return -EINVAL;
			}

			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
			break;
		case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
			if (paddr == SPDK_VTOPHYS_ERROR) {
				/*
				 * This is not an address that DPDK is managing. If vfio is enabled,
				 * we need to unmap the range from the IOMMU
				 */
				if (spdk_iommu_is_enabled()) {
					uint64_t buffer_len = VALUE_2MB;
					paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
						return -EINVAL;
					}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
					if (rc) {
						return -EFAULT;
					}
				}
			}
#endif
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

		if (rc != 0) {
			return rc;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	return rc;
}
#if SPDK_VFIO_ENABLED

static bool
spdk_vfio_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}

/* Check if IOMMU is enabled on the system */
static bool
has_iommu_groups(void)
{
	struct dirent *d;
	int count = 0;
	DIR *dir = opendir("/sys/kernel/iommu_groups");

	if (dir == NULL) {
		return false;
	}

	while (count < 3 && (d = readdir(dir)) != NULL) {
		count++;
	}

	closedir(dir);
	/* there will always be ./ and ../ entries */
	return count > 2;
}

static bool
spdk_vfio_noiommu_enabled(void)
{
	return rte_vfio_noiommu_is_enabled();
}
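
/*
 * Detect whether DPDK set up a VFIO container and, if so, borrow its container
 * fd by scanning /proc/self/fd for a descriptor that links to /dev/vfio/vfio.
 * That fd is needed for the VFIO_IOMMU_MAP_DMA/UNMAP_DMA ioctls issued above.
 */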
static void
spdk_vtophys_iommu_init(void)
{
	char proc_fd_path[PATH_MAX + 1];
	char link_path[PATH_MAX + 1];
	const char vfio_path[] = "/dev/vfio/vfio";
	DIR *dir;
	struct dirent *d;

	if (!spdk_vfio_enabled()) {
		return;
	}

	if (spdk_vfio_noiommu_enabled()) {
		g_vfio.noiommu_enabled = true;
	} else if (!has_iommu_groups()) {
		return;
	}

	dir = opendir("/proc/self/fd");
	if (!dir) {
		DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
		return;
	}

	while ((d = readdir(dir)) != NULL) {
		if (d->d_type != DT_LNK) {
			continue;
		}

		snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
		if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
			continue;
		}

		if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
			sscanf(d->d_name, "%d", &g_vfio.fd);
			break;
		}
	}

	closedir(dir);

	if (g_vfio.fd < 0) {
		DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
		return;
	}

	g_vfio.enabled = true;
}
#endif
void
spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);

	vtophys_dev = calloc(1, sizeof(*vtophys_dev));
	if (vtophys_dev) {
		vtophys_dev->pci_device = pci_device;
		TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
	} else {
		DEBUG_PRINT("Memory allocation error\n");
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	g_vfio.device_ref++;
	if (g_vfio.device_ref > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the first SPDK device using DPDK vfio. This means that the first
	 * IOMMU group might have just been added to the DPDK vfio container.
	 * From this point it is certain that the memory can be mapped now.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
		if (ret) {
			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}
void
spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
{
	struct spdk_vtophys_pci_device *vtophys_dev;

	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		if (vtophys_dev->pci_device == pci_device) {
			TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
			free(vtophys_dev);
			break;
		}
	}
	pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);

#if SPDK_VFIO_ENABLED
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	if (!g_vfio.enabled) {
		return;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	assert(g_vfio.device_ref > 0);
	g_vfio.device_ref--;
	if (g_vfio.device_ref > 0) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return;
	}

	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
	 * any additional devices using its vfio container, all the mappings
	 * will be automatically removed by the Linux vfio driver. We unmap
	 * the memory manually to be able to easily re-map it later regardless
	 * of other, external factors.
	 */
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
		if (ret) {
			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
			break;
		}
	}
	pthread_mutex_unlock(&g_vfio.mutex);
#endif
}
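
/*
 * Create the global vtophys map. spdk_vtophys_notify() is installed as the
 * notify callback, so every spdk_mem_register() call from now on also resolves
 * physical addresses for the registered range.
 */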
int
spdk_vtophys_init(void)
{
	const struct spdk_mem_map_ops vtophys_map_ops = {
		.notify_cb = spdk_vtophys_notify,
		.are_contiguous = NULL
	};

#if SPDK_VFIO_ENABLED
	spdk_vtophys_iommu_init();
#endif

	g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
	if (g_vtophys_map == NULL) {
		DEBUG_PRINT("vtophys map allocation failed\n");
		return -1;
	}
	return 0;
}
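
/*
 * Translate a buffer's virtual address to a physical/IO address using the
 * vtophys map, adding the offset within the 2MB page back in. A hypothetical
 * caller sketch (names are illustrative only):
 *
 *	uint64_t len = io_unit_size;
 *	uint64_t phys = spdk_vtophys(buf, &len);
 *	if (phys == SPDK_VTOPHYS_ERROR || len < io_unit_size) {
 *		// buffer is not registered or not physically contiguous enough
 *	}
 */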
uint64_t
spdk_vtophys(void *buf, uint64_t *size)
{
	uint64_t vaddr, paddr_2mb;

	vaddr = (uint64_t)buf;
	paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);

	/*
	 * SPDK_VTOPHYS_ERROR has all bits set, so a bitwise-or with the buffer
	 * offset would have preserved it. Now that we add the offset instead of
	 * or-ing it (because PCI vtophys results can be unaligned), we must check
	 * for the error value before the addition.
	 */
	SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
	if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
		return SPDK_VTOPHYS_ERROR;
	} else {
		return paddr_2mb + (vaddr & MASK_2MB);
	}
}