1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "spdk/stdinc.h"
35
36 #include "env_internal.h"
37
38 #include <rte_config.h>
39 #include <rte_eal_memconfig.h>
40
41 #include "spdk_internal/assert.h"
42 #include "spdk_internal/memory.h"
43
44 #include "spdk/assert.h"
45 #include "spdk/likely.h"
46 #include "spdk/queue.h"
47 #include "spdk/util.h"
48 #include "spdk/env_dpdk.h"
49
50 #ifdef __FreeBSD__
51 #define SPDK_VFIO_ENABLED 0
52 #else
53 #include <linux/version.h>
54 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
55 #define SPDK_VFIO_ENABLED 1
56 #include <linux/vfio.h>
57 #include <rte_vfio.h>
58
59 struct spdk_vfio_dma_map {
60 struct vfio_iommu_type1_dma_map map;
61 struct vfio_iommu_type1_dma_unmap unmap;
62 TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
63 };
64
65 struct vfio_cfg {
66 int fd;
67 bool enabled;
68 bool noiommu_enabled;
69 unsigned device_ref;
70 TAILQ_HEAD(, spdk_vfio_dma_map) maps;
71 pthread_mutex_t mutex;
72 };
73
74 static struct vfio_cfg g_vfio = {
75 .fd = -1,
76 .enabled = false,
77 .noiommu_enabled = false,
78 .device_ref = 0,
79 .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
80 .mutex = PTHREAD_MUTEX_INITIALIZER
81 };
82
83 #else
84 #define SPDK_VFIO_ENABLED 0
85 #endif
86 #endif
87
88 #if DEBUG
89 #define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
90 #else
91 #define DEBUG_PRINT(...)
92 #endif
93
94 #define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB))
95 #define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB))
96
97 #define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
98 #define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
99
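/*
 * Illustrative sketch of how the index macros above decompose a virtual
 * address; the concrete shift values (SHIFT_2MB = 21, SHIFT_1GB = 30) are
 * the usual definitions from spdk_internal/memory.h and are assumed here.
 *
 *   uint64_t vaddr   = 0x00007f3a4c35e000ULL;
 *   uint64_t vfn_2mb = vaddr >> SHIFT_2MB;      // 2MB virtual frame number
 *   uint64_t top     = MAP_256TB_IDX(vfn_2mb);  // bits [30..47] -> map_256tb slot
 *   uint64_t second  = MAP_1GB_IDX(vfn_2mb);    // bits [21..29] -> map_1gb slot
 *
 * The 2MB-aligned address can be rebuilt as
 * (top << SHIFT_1GB) | (second << SHIFT_2MB), which is exactly what
 * spdk_mem_map_notify_walk() does when it reconstructs vaddr from indexes.
 */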
100 /* Page is registered */
101 #define REG_MAP_REGISTERED (1ULL << 62)
102
103 /* A notification region barrier. The 2MB translation entry that's marked
104 * with this flag must be unregistered separately. This allows contiguous
105 * regions to be unregistered in the same chunks they were registered.
106 */
107 #define REG_MAP_NOTIFY_START (1ULL << 63)
108
109 /* Translation of a single 2MB page. */
110 struct map_2mb {
111 uint64_t translation_2mb;
112 };
113
114 /* Second-level map table indexed by bits [21..29] of the virtual address.
115 * Each entry holds the translation for one 2MB page, or the map's default
116 * translation for pages that haven't been set yet.
117 */
118 struct map_1gb {
119 struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
120 };
121
122 /* Top-level map table indexed by bits [30..47] of the virtual address.
123 * Each entry points to a second-level map table or NULL.
124 */
125 struct map_256tb {
126 struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
127 };
128
129 /* Page-granularity memory address translation */
130 struct spdk_mem_map {
131 struct map_256tb map_256tb;
132 pthread_mutex_t mutex;
133 uint64_t default_translation;
134 struct spdk_mem_map_ops ops;
135 void *cb_ctx;
136 TAILQ_ENTRY(spdk_mem_map) tailq;
137 };
138
139 /* Registrations map. The 64-bit translations are bit fields with the
140 * following layout (starting with the low bits):
141 * 0 - 61 : reserved
142 * 62 - 63 : flags
143 */
144 static struct spdk_mem_map *g_mem_reg_map;
145 static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
146 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
147
148 /*
149 * Walk the currently registered memory via the main memory registration map
150 * and call the new map's notify callback for each virtually contiguous region.
151 */
152 static int
153 spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
154 {
155 size_t idx_256tb;
156 uint64_t idx_1gb;
157 uint64_t contig_start = UINT64_MAX;
158 uint64_t contig_end = UINT64_MAX;
159 struct map_1gb *map_1gb;
160 int rc;
161
162 if (!g_mem_reg_map) {
163 return -EINVAL;
164 }
165
166 /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
167 pthread_mutex_lock(&g_mem_reg_map->mutex);
168
169 for (idx_256tb = 0;
170 idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
171 idx_256tb++) {
172 map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
173
174 if (!map_1gb) {
175 if (contig_start != UINT64_MAX) {
176 /* End of a virtually contiguous range */
177 rc = map->ops.notify_cb(map->cb_ctx, map, action,
178 (void *)contig_start,
179 contig_end - contig_start + VALUE_2MB);
180 /* Don't bother handling unregister failures. It can't be any worse */
181 if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
182 goto err_unregister;
183 }
184 }
185 contig_start = UINT64_MAX;
186 continue;
187 }
188
189 for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
190 if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
191 (contig_start == UINT64_MAX ||
192 (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
193 /* Rebuild the virtual address from the indexes */
194 uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
195
196 if (contig_start == UINT64_MAX) {
197 contig_start = vaddr;
198 }
199
200 contig_end = vaddr;
201 } else {
202 if (contig_start != UINT64_MAX) {
203 /* End of a virtually contiguous range */
204 rc = map->ops.notify_cb(map->cb_ctx, map, action,
205 (void *)contig_start,
206 contig_end - contig_start + VALUE_2MB);
207 /* Don't bother handling unregister failures. It can't be any worse */
208 if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
209 goto err_unregister;
210 }
211
212 /* This page might be a part of a neighbour region, so process
213 * it again. The idx_1gb will be incremented immediately.
214 */
215 idx_1gb--;
216 }
217 contig_start = UINT64_MAX;
218 }
219 }
220 }
221
222 pthread_mutex_unlock(&g_mem_reg_map->mutex);
223 return 0;
224
225 err_unregister:
226 /* Unwind to the first empty translation so we don't unregister
227 * a region that just failed to register.
228 */
229 idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
230 idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
231 contig_start = UINT64_MAX;
232 contig_end = UINT64_MAX;
233
234 /* Unregister any memory we managed to register before the failure */
235 for (; idx_256tb < SIZE_MAX; idx_256tb--) {
236 map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
237
238 if (!map_1gb) {
239 if (contig_end != UINT64_MAX) {
240 /* End of a virtually contiguous range */
241 map->ops.notify_cb(map->cb_ctx, map,
242 SPDK_MEM_MAP_NOTIFY_UNREGISTER,
243 (void *)contig_start,
244 contig_end - contig_start + VALUE_2MB);
245 }
246 contig_end = UINT64_MAX;
247 continue;
248 }
249
250 for (; idx_1gb < UINT64_MAX; idx_1gb--) {
251 if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
252 (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
253 /* Rebuild the virtual address from the indexes */
254 uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
255
256 if (contig_end == UINT64_MAX) {
257 contig_end = vaddr;
258 }
259 contig_start = vaddr;
260 } else {
261 if (contig_end != UINT64_MAX) {
262 /* End of a virtually contiguous range */
263 map->ops.notify_cb(map->cb_ctx, map,
264 SPDK_MEM_MAP_NOTIFY_UNREGISTER,
265 (void *)contig_start,
266 contig_end - contig_start + VALUE_2MB);
267 idx_1gb++;
268 }
269 contig_end = UINT64_MAX;
270 }
271 }
272 idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
273 }
274
275 pthread_mutex_unlock(&g_mem_reg_map->mutex);
276 return rc;
277 }
278
279 struct spdk_mem_map *
280 spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
281 {
282 struct spdk_mem_map *map;
283 int rc;
284
285 map = calloc(1, sizeof(*map));
286 if (map == NULL) {
287 return NULL;
288 }
289
290 if (pthread_mutex_init(&map->mutex, NULL)) {
291 free(map);
292 return NULL;
293 }
294
295 map->default_translation = default_translation;
296 map->cb_ctx = cb_ctx;
297 if (ops) {
298 map->ops = *ops;
299 }
300
301 if (ops && ops->notify_cb) {
302 pthread_mutex_lock(&g_spdk_mem_map_mutex);
303 rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
304 if (rc != 0) {
305 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
306 DEBUG_PRINT("Initial mem_map notify failed\n");
307 pthread_mutex_destroy(&map->mutex);
308 free(map);
309 return NULL;
310 }
311 TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
312 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
313 }
314
315 return map;
316 }
317
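/*
 * Illustrative usage sketch for spdk_mem_map_alloc(); the callback and the
 * default translation value below are hypothetical and only show the shape
 * of a typical consumer.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           // Map or unmap [vaddr, vaddr + len) with the consumer's driver.
 *           // Returning non-zero for a REGISTER action makes the initial
 *           // notify walk (and spdk_mem_map_alloc itself) fail.
 *           return 0;
 *   }
 *
 *   static const struct spdk_mem_map_ops my_ops = {
 *           .notify_cb = my_notify,
 *           .are_contiguous = NULL,
 *   };
 *
 *   struct spdk_mem_map *my_map = spdk_mem_map_alloc(0, &my_ops, NULL);
 *
 *   // ... use spdk_mem_map_set_translation()/spdk_mem_map_translate() ...
 *   spdk_mem_map_free(&my_map);
 */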
318 void
319 spdk_mem_map_free(struct spdk_mem_map **pmap)
320 {
321 struct spdk_mem_map *map;
322 size_t i;
323
324 if (!pmap) {
325 return;
326 }
327
328 map = *pmap;
329
330 if (!map) {
331 return;
332 }
333
334 if (map->ops.notify_cb) {
335 pthread_mutex_lock(&g_spdk_mem_map_mutex);
336 spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
337 TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
338 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
339 }
340
341 for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
342 free(map->map_256tb.map[i]);
343 }
344
345 pthread_mutex_destroy(&map->mutex);
346
347 free(map);
348 *pmap = NULL;
349 }
350
351 int
352 spdk_mem_register(void *vaddr, size_t len)
353 {
354 struct spdk_mem_map *map;
355 int rc;
356 void *seg_vaddr;
357 size_t seg_len;
358 uint64_t reg;
359
360 if ((uintptr_t)vaddr & ~MASK_256TB) {
361 DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
362 return -EINVAL;
363 }
364
365 if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
366 DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
367 __func__, vaddr, len);
368 return -EINVAL;
369 }
370
371 if (len == 0) {
372 return 0;
373 }
374
375 pthread_mutex_lock(&g_spdk_mem_map_mutex);
376
377 seg_vaddr = vaddr;
378 seg_len = len;
379 while (seg_len > 0) {
380 reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
381 if (reg & REG_MAP_REGISTERED) {
382 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
383 return -EBUSY;
384 }
385 seg_vaddr += VALUE_2MB;
386 seg_len -= VALUE_2MB;
387 }
388
389 seg_vaddr = vaddr;
390 seg_len = 0;
391 while (len > 0) {
392 spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
393 seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
394 seg_len += VALUE_2MB;
395 vaddr += VALUE_2MB;
396 len -= VALUE_2MB;
397 }
398
399 TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
400 rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
401 if (rc != 0) {
402 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
403 return rc;
404 }
405 }
406
407 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
408 return 0;
409 }
410
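/*
 * Illustrative sketch of registering externally allocated memory; the mmap()
 * flags are only an example and assume 2MB hugepages are available, since
 * both the address and the length passed to spdk_mem_register() must be
 * 2MB-aligned.
 *
 *   size_t len = 4 * VALUE_2MB;
 *   void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *
 *   if (buf != MAP_FAILED) {
 *           if (spdk_mem_register(buf, len) == 0) {
 *                   // every registered map's notify_cb saw one REGISTER
 *                   // notification covering the whole [buf, buf + len) range
 *                   spdk_mem_unregister(buf, len);
 *           }
 *           munmap(buf, len);
 *   }
 */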
411 int
412 spdk_mem_unregister(void *vaddr, size_t len)
413 {
414 struct spdk_mem_map *map;
415 int rc;
416 void *seg_vaddr;
417 size_t seg_len;
418 uint64_t reg, newreg;
419
420 if ((uintptr_t)vaddr & ~MASK_256TB) {
421 DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
422 return -EINVAL;
423 }
424
425 if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
426 DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
427 __func__, vaddr, len);
428 return -EINVAL;
429 }
430
431 pthread_mutex_lock(&g_spdk_mem_map_mutex);
432
433 /* The first page must be a start of a region. Also check if it's
434 * registered to make sure we don't return -ERANGE for non-registered
435 * regions.
436 */
437 reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
438 if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
439 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
440 return -ERANGE;
441 }
442
443 seg_vaddr = vaddr;
444 seg_len = len;
445 while (seg_len > 0) {
446 reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
447 if ((reg & REG_MAP_REGISTERED) == 0) {
448 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
449 return -EINVAL;
450 }
451 seg_vaddr += VALUE_2MB;
452 seg_len -= VALUE_2MB;
453 }
454
455 newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
456 /* If the next page is registered, it must be a start of a region as well,
457 * otherwise we'd be unregistering only a part of a region.
458 */
459 if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
460 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
461 return -ERANGE;
462 }
463 seg_vaddr = vaddr;
464 seg_len = 0;
465
466 while (len > 0) {
467 reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
468 spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
469
470 if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
471 TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
472 rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
473 if (rc != 0) {
474 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
475 return rc;
476 }
477 }
478
479 seg_vaddr = vaddr;
480 seg_len = VALUE_2MB;
481 } else {
482 seg_len += VALUE_2MB;
483 }
484
485 vaddr += VALUE_2MB;
486 len -= VALUE_2MB;
487 }
488
489 if (seg_len > 0) {
490 TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
491 rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
492 if (rc != 0) {
493 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
494 return rc;
495 }
496 }
497 }
498
499 pthread_mutex_unlock(&g_spdk_mem_map_mutex);
500 return 0;
501 }
502
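/*
 * Illustrative sketch of the region rules enforced above, assuming a
 * hypothetical 2MB-aligned buffer 'buf' that is not yet registered.
 * A registration is unregistered as a whole; splitting it fails.
 *
 *   spdk_mem_register(buf, 4 * VALUE_2MB);                      // one region
 *
 *   spdk_mem_unregister(buf, 2 * VALUE_2MB);                    // -ERANGE: would split the region
 *   spdk_mem_unregister(buf + 2 * VALUE_2MB, 2 * VALUE_2MB);    // -ERANGE: not a region start
 *   spdk_mem_unregister(buf, 4 * VALUE_2MB);                    // 0: whole region removed
 */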
503 static struct map_1gb *
504 spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
505 {
506 struct map_1gb *map_1gb;
507 uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
508 size_t i;
509
510 if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
511 return NULL;
512 }
513
514 map_1gb = map->map_256tb.map[idx_256tb];
515
516 if (!map_1gb) {
517 pthread_mutex_lock(&map->mutex);
518
519 /* Recheck to make sure nobody else got the mutex first. */
520 map_1gb = map->map_256tb.map[idx_256tb];
521 if (!map_1gb) {
522 map_1gb = malloc(sizeof(struct map_1gb));
523 if (map_1gb) {
524 /* initialize all entries to default translation */
525 for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
526 map_1gb->map[i].translation_2mb = map->default_translation;
527 }
528 map->map_256tb.map[idx_256tb] = map_1gb;
529 }
530 }
531
532 pthread_mutex_unlock(&map->mutex);
533
534 if (!map_1gb) {
535 DEBUG_PRINT("allocation failed\n");
536 return NULL;
537 }
538 }
539
540 return map_1gb;
541 }
542
543 int
544 spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
545 uint64_t translation)
546 {
547 uint64_t vfn_2mb;
548 struct map_1gb *map_1gb;
549 uint64_t idx_1gb;
550 struct map_2mb *map_2mb;
551
552 if ((uintptr_t)vaddr & ~MASK_256TB) {
553 DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
554 return -EINVAL;
555 }
556
557 /* For now, only 2 MB-aligned registrations are supported */
558 if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
559 DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
560 __func__, vaddr, size);
561 return -EINVAL;
562 }
563
564 vfn_2mb = vaddr >> SHIFT_2MB;
565
566 while (size) {
567 map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
568 if (!map_1gb) {
569 DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
570 return -ENOMEM;
571 }
572
573 idx_1gb = MAP_1GB_IDX(vfn_2mb);
574 map_2mb = &map_1gb->map[idx_1gb];
575 map_2mb->translation_2mb = translation;
576
577 size -= VALUE_2MB;
578 vfn_2mb++;
579 }
580
581 return 0;
582 }
583
584 int
585 spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
586 {
587 uint64_t vfn_2mb;
588 struct map_1gb *map_1gb;
589 uint64_t idx_1gb;
590 struct map_2mb *map_2mb;
591
592 if ((uintptr_t)vaddr & ~MASK_256TB) {
593 DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
594 return -EINVAL;
595 }
596
597 /* For now, only 2 MB-aligned registrations are supported */
598 if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
599 DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
600 __func__, vaddr, size);
601 return -EINVAL;
602 }
603
604 vfn_2mb = vaddr >> SHIFT_2MB;
605
606 while (size) {
607 map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
608 if (!map_1gb) {
609 DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
610 return -ENOMEM;
611 }
612
613 idx_1gb = MAP_1GB_IDX(vfn_2mb);
614 map_2mb = &map_1gb->map[idx_1gb];
615 map_2mb->translation_2mb = map->default_translation;
616
617 size -= VALUE_2MB;
618 vfn_2mb++;
619 }
620
621 return 0;
622 }
623
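/*
 * Illustrative sketch of how a notify callback typically drives the two
 * helpers above; my_driver_register() is a hypothetical function returning
 * a 64-bit handle for the region.
 *
 *   static int
 *   my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *             enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
 *   {
 *           switch (action) {
 *           case SPDK_MEM_MAP_NOTIFY_REGISTER:
 *                   return spdk_mem_map_set_translation(map, (uint64_t)vaddr, len,
 *                                                       my_driver_register(vaddr, len));
 *           case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
 *                   return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, len);
 *           default:
 *                   return -EINVAL;
 *           }
 *   }
 */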
624 inline uint64_t
625 spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
626 {
627 const struct map_1gb *map_1gb;
628 const struct map_2mb *map_2mb;
629 uint64_t idx_256tb;
630 uint64_t idx_1gb;
631 uint64_t vfn_2mb;
632 uint64_t cur_size;
633 uint64_t prev_translation;
634 uint64_t orig_translation;
635
636 if (spdk_unlikely(vaddr & ~MASK_256TB)) {
637 DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
638 return map->default_translation;
639 }
640
641 vfn_2mb = vaddr >> SHIFT_2MB;
642 idx_256tb = MAP_256TB_IDX(vfn_2mb);
643 idx_1gb = MAP_1GB_IDX(vfn_2mb);
644
645 map_1gb = map->map_256tb.map[idx_256tb];
646 if (spdk_unlikely(!map_1gb)) {
647 return map->default_translation;
648 }
649
650 cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
651 map_2mb = &map_1gb->map[idx_1gb];
652 if (size == NULL || map->ops.are_contiguous == NULL ||
653 map_2mb->translation_2mb == map->default_translation) {
654 if (size != NULL) {
655 *size = spdk_min(*size, cur_size);
656 }
657 return map_2mb->translation_2mb;
658 }
659
660 orig_translation = map_2mb->translation_2mb;
661 prev_translation = orig_translation;
662 while (cur_size < *size) {
663 vfn_2mb++;
664 idx_256tb = MAP_256TB_IDX(vfn_2mb);
665 idx_1gb = MAP_1GB_IDX(vfn_2mb);
666
667 map_1gb = map->map_256tb.map[idx_256tb];
668 if (spdk_unlikely(!map_1gb)) {
669 break;
670 }
671
672 map_2mb = &map_1gb->map[idx_1gb];
673 if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
674 break;
675 }
676
677 cur_size += VALUE_2MB;
678 prev_translation = map_2mb->translation_2mb;
679 }
680
681 *size = spdk_min(*size, cur_size);
682 return orig_translation;
683 }
684
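/*
 * Illustrative lookup sketch; 'my_map', 'buf' and 'io_length' are
 * hypothetical. If the map was created with an are_contiguous callback, the
 * returned *size can span several 2MB pages whose translations that callback
 * accepts as contiguous; otherwise it is clamped to the remainder of the
 * current 2MB page. It never grows beyond the caller-supplied value.
 *
 *   uint64_t len = io_length;
 *   uint64_t translation = spdk_mem_map_translate(my_map, (uint64_t)buf, &len);
 *
 *   // 'translation' covers at most 'len' bytes starting at 'buf'; a value
 *   // equal to the map's default_translation means the range is unknown.
 */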
685 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
686 static void
687 memory_hotplug_cb(enum rte_mem_event event_type,
688 const void *addr, size_t len, void *arg)
689 {
690 if (event_type == RTE_MEM_EVENT_ALLOC) {
691 spdk_mem_register((void *)addr, len);
692
693 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
694 if (!spdk_env_dpdk_external_init()) {
695 return;
696 }
697 #endif
698
699 /* Prior to DPDK 19.02, we have to worry about DPDK
700 * freeing memory in different units than it was allocated.
701 * That doesn't work with things like RDMA MRs. So for
702 * those versions of DPDK, mark each segment so that DPDK
703 * won't later free it. That ensures we don't have to deal
704 * with that scenario.
705 *
706 * DPDK 19.02 added the --match-allocations RTE flag to
707 * avoid this condition.
708 *
709 * Note: if the user initialized DPDK separately, we can't
710 * be sure that --match-allocations was specified, so we still
711 * need to mark the segments so they aren't freed.
712 */
713 while (len > 0) {
714 struct rte_memseg *seg;
715
716 seg = rte_mem_virt2memseg(addr, NULL);
717 assert(seg != NULL);
718 seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
719 addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
720 len -= seg->hugepage_sz;
721 }
722 } else if (event_type == RTE_MEM_EVENT_FREE) {
723 spdk_mem_unregister((void *)addr, len);
724 }
725 }
726
727 static int
728 memory_iter_cb(const struct rte_memseg_list *msl,
729 const struct rte_memseg *ms, size_t len, void *arg)
730 {
731 return spdk_mem_register(ms->addr, len);
732 }
733 #endif
734
735 int
736 spdk_mem_map_init(void)
737 {
738 g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
739 if (g_mem_reg_map == NULL) {
740 DEBUG_PRINT("memory registration map allocation failed\n");
741 return -1;
742 }
743
744 /*
745 * Walk all DPDK memory segments and register them
746 * with the master memory map
747 */
748 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
749 rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
750 rte_memseg_contig_walk(memory_iter_cb, NULL);
751 #else
752 struct rte_mem_config *mcfg;
753 size_t seg_idx;
754
755 mcfg = rte_eal_get_configuration()->mem_config;
756 for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
757 struct rte_memseg *seg = &mcfg->memseg[seg_idx];
758
759 if (seg->addr == NULL) {
760 break;
761 }
762
763 spdk_mem_register(seg->addr, seg->len);
764 }
765 #endif
766 return 0;
767 }
768
769 bool
770 spdk_iommu_is_enabled(void)
771 {
772 #if SPDK_VFIO_ENABLED
773 return g_vfio.enabled && !g_vfio.noiommu_enabled;
774 #else
775 return false;
776 #endif
777 }
778
779 struct spdk_vtophys_pci_device {
780 struct rte_pci_device *pci_device;
781 TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
782 };
783
784 static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
785 static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
786 TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
787
788 static struct spdk_mem_map *g_vtophys_map;
789
790 #if SPDK_VFIO_ENABLED
791 static int
792 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
793 {
794 struct spdk_vfio_dma_map *dma_map;
795 int ret;
796
797 dma_map = calloc(1, sizeof(*dma_map));
798 if (dma_map == NULL) {
799 return -ENOMEM;
800 }
801
802 dma_map->map.argsz = sizeof(dma_map->map);
803 dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
804 dma_map->map.vaddr = vaddr;
805 dma_map->map.iova = iova;
806 dma_map->map.size = size;
807
808 dma_map->unmap.argsz = sizeof(dma_map->unmap);
809 dma_map->unmap.flags = 0;
810 dma_map->unmap.iova = iova;
811 dma_map->unmap.size = size;
812
813 pthread_mutex_lock(&g_vfio.mutex);
814 if (g_vfio.device_ref == 0) {
815 /* VFIO requires at least one device (IOMMU group) to be added to
816 * a VFIO container before it is possible to perform any IOMMU
817 * operations on that container. This memory will be mapped once
818 * the first device (IOMMU group) is hotplugged.
819 *
820 * Since the vfio container is managed internally by DPDK, it is
821 * also possible that some device is already in that container, but
822 * it's not managed by SPDK - e.g. a NIC attached internally
823 * inside DPDK. We could map the memory straight away in such a
824 * scenario, but there's no need to do it. DPDK devices clearly
825 * don't need our mappings and hence we defer the mapping
826 * unconditionally until the first SPDK-managed device is
827 * hotplugged.
828 */
829 goto out_insert;
830 }
831
832 ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
833 if (ret) {
834 DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
835 pthread_mutex_unlock(&g_vfio.mutex);
836 free(dma_map);
837 return ret;
838 }
839
840 out_insert:
841 TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
842 pthread_mutex_unlock(&g_vfio.mutex);
843 return 0;
844 }
845
846 static int
847 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
848 {
849 struct spdk_vfio_dma_map *dma_map;
850 int ret;
851
852 pthread_mutex_lock(&g_vfio.mutex);
853 TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
854 if (dma_map->map.iova == iova) {
855 break;
856 }
857 }
858
859 if (dma_map == NULL) {
860 DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
861 pthread_mutex_unlock(&g_vfio.mutex);
862 return -ENXIO;
863 }
864
865 /* We don't support partial or multiple-page unmap for now */
866 assert(dma_map->map.size == size);
867
868 if (g_vfio.device_ref == 0) {
869 /* Memory is not mapped anymore, just remove its references */
870 goto out_remove;
871 }
872
873
874 ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
875 if (ret) {
876 DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
877 pthread_mutex_unlock(&g_vfio.mutex);
878 return ret;
879 }
880
881 out_remove:
882 TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
883 pthread_mutex_unlock(&g_vfio.mutex);
884 free(dma_map);
885 return 0;
886 }
887 #endif
888
889 static uint64_t
890 vtophys_get_paddr_memseg(uint64_t vaddr)
891 {
892 uintptr_t paddr;
893 struct rte_memseg *seg;
894
895 #if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
896 seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
897 if (seg != NULL) {
898 paddr = seg->phys_addr;
899 if (paddr == RTE_BAD_IOVA) {
900 return SPDK_VTOPHYS_ERROR;
901 }
902 paddr += (vaddr - (uintptr_t)seg->addr);
903 return paddr;
904 }
905 #else
906 struct rte_mem_config *mcfg;
907 uint32_t seg_idx;
908
909 mcfg = rte_eal_get_configuration()->mem_config;
910 for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
911 seg = &mcfg->memseg[seg_idx];
912 if (seg->addr == NULL) {
913 break;
914 }
915
916 if (vaddr >= (uintptr_t)seg->addr &&
917 vaddr < ((uintptr_t)seg->addr + seg->len)) {
918 paddr = seg->phys_addr;
919 if (paddr == RTE_BAD_IOVA) {
920 return SPDK_VTOPHYS_ERROR;
921 }
922 paddr += (vaddr - (uintptr_t)seg->addr);
923 return paddr;
924 }
925 }
926 #endif
927
928 return SPDK_VTOPHYS_ERROR;
929 }
930
931 /* Try to get the paddr from /proc/self/pagemap */
932 static uint64_t
933 vtophys_get_paddr_pagemap(uint64_t vaddr)
934 {
935 uintptr_t paddr;
936
937 paddr = rte_mem_virt2iova((void *)vaddr);
938 if (paddr == RTE_BAD_IOVA) {
939 /*
940 * The vaddr may be valid but doesn't have a backing page
941 * assigned yet. Touch the page to ensure a backing page
942 * gets assigned, then try to translate again.
943 */
944 rte_atomic64_read((rte_atomic64_t *)vaddr);
945 paddr = rte_mem_virt2iova((void *)vaddr);
946 }
947 if (paddr == RTE_BAD_IOVA) {
948 /* Unable to get to the physical address. */
949 return SPDK_VTOPHYS_ERROR;
950 }
951
952 return paddr;
953 }
954
955 /* Try to get the paddr from pci devices */
956 static uint64_t
957 vtophys_get_paddr_pci(uint64_t vaddr)
958 {
959 struct spdk_vtophys_pci_device *vtophys_dev;
960 uintptr_t paddr;
961 struct rte_pci_device *dev;
962 struct rte_mem_resource *res;
963 unsigned r;
964
965 pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
966 TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
967 dev = vtophys_dev->pci_device;
968
969 for (r = 0; r < PCI_MAX_RESOURCE; r++) {
970 res = &dev->mem_resource[r];
971 if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
972 vaddr < (uint64_t)res->addr + res->len) {
973 paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
974 DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
975 (void *)paddr);
976 pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
977 return paddr;
978 }
979 }
980 }
981 pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
982
983 return SPDK_VTOPHYS_ERROR;
984 }
985
986 static int
987 spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
988 enum spdk_mem_map_notify_action action,
989 void *vaddr, size_t len)
990 {
991 int rc = 0, pci_phys = 0;
992 uint64_t paddr;
993
994 if ((uintptr_t)vaddr & ~MASK_256TB) {
995 DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
996 return -EINVAL;
997 }
998
999 if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1000 DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
1001 __func__, vaddr, len);
1002 return -EINVAL;
1003 }
1004
1005 while (len > 0) {
1006 /* Get the physical address from the DPDK memsegs */
1007 paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1008
1009 switch (action) {
1010 case SPDK_MEM_MAP_NOTIFY_REGISTER:
1011 if (paddr == SPDK_VTOPHYS_ERROR) {
1012 /* This is not an address that DPDK is managing. */
1013 #if SPDK_VFIO_ENABLED
1014 if (spdk_iommu_is_enabled()) {
1015 /* We'll use the virtual address as the iova. DPDK
1016 * currently uses physical addresses as the iovas (or counts
1017 * up from 0 if it can't get physical addresses), so
1018 * the range of user space virtual addresses and physical
1019 * addresses will never overlap.
1020 */
1021 paddr = (uint64_t)vaddr;
1022 rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1023 if (rc) {
1024 return -EFAULT;
1025 }
1026 } else
1027 #endif
1028 {
1029 /* Get the physical address from /proc/self/pagemap. */
1030 paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1031 if (paddr == SPDK_VTOPHYS_ERROR) {
1032 /* Get the physical address from PCI devices */
1033 paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
1034 if (paddr == SPDK_VTOPHYS_ERROR) {
1035 DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1036 return -EFAULT;
1037 }
1038 pci_phys = 1;
1039 }
1040 }
1041 }
1042 /* Since a PCI paddr can break the 2MiB physical alignment, skip this check for PCI-backed addresses. */
1043 if (!pci_phys && (paddr & MASK_2MB)) {
1044 DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1045 return -EINVAL;
1046 }
1047
1048 rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1049 break;
1050 case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1051 #if SPDK_VFIO_ENABLED
1052 if (paddr == SPDK_VTOPHYS_ERROR) {
1053 /*
1054 * This is not an address that DPDK is managing. If vfio is enabled,
1055 * we need to unmap the range from the IOMMU
1056 */
1057 if (spdk_iommu_is_enabled()) {
1058 uint64_t buffer_len = VALUE_2MB;
1059 paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
1060 if (buffer_len != VALUE_2MB) {
1061 return -EINVAL;
1062 }
1063 rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1064 if (rc) {
1065 return -EFAULT;
1066 }
1067 }
1068 }
1069 #endif
1070 rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1071 break;
1072 default:
1073 SPDK_UNREACHABLE();
1074 }
1075
1076 if (rc != 0) {
1077 return rc;
1078 }
1079 vaddr += VALUE_2MB;
1080 len -= VALUE_2MB;
1081 }
1082
1083 return rc;
1084 }
1085
1086 #if SPDK_VFIO_ENABLED
1087
1088 static bool
1089 spdk_vfio_enabled(void)
1090 {
1091 return rte_vfio_is_enabled("vfio_pci");
1092 }
1093
1094 /* Check if IOMMU is enabled on the system */
1095 static bool
1096 has_iommu_groups(void)
1097 {
1098 struct dirent *d;
1099 int count = 0;
1100 DIR *dir = opendir("/sys/kernel/iommu_groups");
1101
1102 if (dir == NULL) {
1103 return false;
1104 }
1105
1106 while (count < 3 && (d = readdir(dir)) != NULL) {
1107 count++;
1108 }
1109
1110 closedir(dir);
1111 /* there will always be ./ and ../ entries */
1112 return count > 2;
1113 }
1114
1115 static bool
1116 spdk_vfio_noiommu_enabled(void)
1117 {
1118 return rte_vfio_noiommu_is_enabled();
1119 }
1120
1121 static void
1122 spdk_vtophys_iommu_init(void)
1123 {
1124 char proc_fd_path[PATH_MAX + 1];
1125 char link_path[PATH_MAX + 1];
1126 const char vfio_path[] = "/dev/vfio/vfio";
1127 DIR *dir;
1128 struct dirent *d;
1129
1130 if (!spdk_vfio_enabled()) {
1131 return;
1132 }
1133
1134 if (spdk_vfio_noiommu_enabled()) {
1135 g_vfio.noiommu_enabled = true;
1136 } else if (!has_iommu_groups()) {
1137 return;
1138 }
1139
1140 dir = opendir("/proc/self/fd");
1141 if (!dir) {
1142 DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1143 return;
1144 }
1145
1146 while ((d = readdir(dir)) != NULL) {
1147 if (d->d_type != DT_LNK) {
1148 continue;
1149 }
1150
1151 snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1152 if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1153 continue;
1154 }
1155
1156 if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1157 sscanf(d->d_name, "%d", &g_vfio.fd);
1158 break;
1159 }
1160 }
1161
1162 closedir(dir);
1163
1164 if (g_vfio.fd < 0) {
1165 DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1166 return;
1167 }
1168
1169 g_vfio.enabled = true;
1170
1171 return;
1172 }
1173 #endif
1174
1175 void
1176 spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device)
1177 {
1178 struct spdk_vtophys_pci_device *vtophys_dev;
1179
1180 pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1181
1182 vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1183 if (vtophys_dev) {
1184 vtophys_dev->pci_device = pci_device;
1185 TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1186 } else {
1187 DEBUG_PRINT("Memory allocation error\n");
1188 }
1189 pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1190
1191 #if SPDK_VFIO_ENABLED
1192 struct spdk_vfio_dma_map *dma_map;
1193 int ret;
1194
1195 if (!g_vfio.enabled) {
1196 return;
1197 }
1198
1199 pthread_mutex_lock(&g_vfio.mutex);
1200 g_vfio.device_ref++;
1201 if (g_vfio.device_ref > 1) {
1202 pthread_mutex_unlock(&g_vfio.mutex);
1203 return;
1204 }
1205
1206 /* This is the first SPDK device using DPDK vfio. This means that the first
1207 * IOMMU group might have just been added to the DPDK vfio container.
1208 * From this point it is certain that the memory can be mapped now.
1209 */
1210 TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1211 ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1212 if (ret) {
1213 DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1214 break;
1215 }
1216 }
1217 pthread_mutex_unlock(&g_vfio.mutex);
1218 #endif
1219 }
1220
1221 void
1222 spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1223 {
1224 struct spdk_vtophys_pci_device *vtophys_dev;
1225
1226 pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1227 TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1228 if (vtophys_dev->pci_device == pci_device) {
1229 TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1230 free(vtophys_dev);
1231 break;
1232 }
1233 }
1234 pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1235
1236 #if SPDK_VFIO_ENABLED
1237 struct spdk_vfio_dma_map *dma_map;
1238 int ret;
1239
1240 if (!g_vfio.enabled) {
1241 return;
1242 }
1243
1244 pthread_mutex_lock(&g_vfio.mutex);
1245 assert(g_vfio.device_ref > 0);
1246 g_vfio.device_ref--;
1247 if (g_vfio.device_ref > 0) {
1248 pthread_mutex_unlock(&g_vfio.mutex);
1249 return;
1250 }
1251
1252 /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1253 * any additional devices using its vfio container, all the mappings
1254 * will be automatically removed by the Linux vfio driver. We unmap
1255 * the memory manually to be able to easily re-map it later regardless
1256 * of other, external factors.
1257 */
1258 TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1259 ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
1260 if (ret) {
1261 DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1262 break;
1263 }
1264 }
1265 pthread_mutex_unlock(&g_vfio.mutex);
1266 #endif
1267 }
1268
1269 int
1270 spdk_vtophys_init(void)
1271 {
1272 const struct spdk_mem_map_ops vtophys_map_ops = {
1273 .notify_cb = spdk_vtophys_notify,
1274 .are_contiguous = NULL
1275 };
1276
1277 #if SPDK_VFIO_ENABLED
1278 spdk_vtophys_iommu_init();
1279 #endif
1280
1281 g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1282 if (g_vtophys_map == NULL) {
1283 DEBUG_PRINT("vtophys map allocation failed\n");
1284 return -1;
1285 }
1286 return 0;
1287 }
1288
1289 uint64_t
1290 spdk_vtophys(void *buf, uint64_t *size)
1291 {
1292 uint64_t vaddr, paddr_2mb;
1293
1294 vaddr = (uint64_t)buf;
1295 paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1296
1297 /*
1298 * SPDK_VTOPHYS_ERROR has all bits set. When the 2MB offset used to be bitwise-ORed
1299 * into the translation, an error lookup still yielded SPDK_VTOPHYS_ERROR. Now that the
1300 * offset is added instead (PCI vtophys results may be unaligned within the 2MB page),
1301 * the return value must be checked before the addition.
1302 */
1303 SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1304 if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1305 return SPDK_VTOPHYS_ERROR;
1306 } else {
1307 return paddr_2mb + (vaddr & MASK_2MB);
1308 }
1309 }
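/*
 * Illustrative usage sketch: walking a registered buffer in physically
 * contiguous chunks, since a virtually contiguous buffer may cross a 2MB
 * physical boundary. 'buf' and 'buf_len' are hypothetical.
 *
 *   uint64_t remaining = buf_len;
 *   uint8_t *p = buf;
 *
 *   while (remaining > 0) {
 *           uint64_t chunk = remaining;
 *           uint64_t phys = spdk_vtophys(p, &chunk);
 *
 *           if (phys == SPDK_VTOPHYS_ERROR) {
 *                   break;  // the memory was never registered with SPDK
 *           }
 *           // program one DMA segment of 'chunk' bytes at physical address 'phys'
 *           p += chunk;
 *           remaining -= chunk;
 *   }
 */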