/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/queue.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_errno.h>
#include <rte_log.h>

#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "malloc_heap.h"

/*
 * Try to reserve *size bytes of anonymous virtual memory. If successful,
 * return a pointer to the mapped area and keep *size unmodified. Otherwise,
 * if shrinking is allowed, retry with a smaller zone: decrease *size by
 * page_sz until it reaches 0, in which case return NULL. Note: the address
 * returned by this function is aligned to the requested page size.
 */
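
/*
 * Illustrative walk-through of the shrink behaviour (hypothetical sizes,
 * 2 MB pages): a request for *size = 6 MB that cannot be mapped is retried
 * at 4 MB, then 2 MB; if even 2 MB fails, *size reaches 0 and the caller
 * gets NULL back with rte_errno set.
 */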

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static void *next_baseaddr;
static uint64_t system_page_sz;

#ifdef RTE_ARCH_64
/*
 * The Linux kernel uses a very high address as the starting address for
 * serving mmap calls. If there are addressing limitations and IOVA mode is
 * VA, this starting address is likely too high for those devices. However,
 * it is possible to use a lower address in the process virtual address
 * space, as with 64 bits there is a lot of available space.
 *
 * Currently known limitations are 39 or 40 bits. Setting the starting
 * address at 4GB implies there are 508GB or 1020GB for mapping the
 * available hugepages. This is likely enough for most systems, although a
 * device with addressing limitations should call rte_mem_check_dma_mask to
 * ensure all memory is within the supported range.
 */
static uint64_t baseaddr = 0x100000000;
#endif
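
/*
 * Worked numbers for the 39/40-bit limits mentioned above: a 39-bit IOVA
 * space covers 2^39 bytes = 512 GB, so starting at 4 GB leaves
 * 512 - 4 = 508 GB; a 40-bit space covers 2^40 bytes = 1024 GB, leaving
 * 1024 - 4 = 1020 GB.
 */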

#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
		size_t page_sz, int flags, int mmap_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;
	uint8_t try = 0;

	if (system_page_sz == 0)
		system_page_sz = sysconf(_SC_PAGESIZE);

	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) internal_config.base_virtaddr;

#ifdef RTE_ARCH_64
	if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) baseaddr;
#endif
	if (requested_addr == NULL && next_baseaddr != NULL) {
		requested_addr = next_baseaddr;
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* we don't need alignment of the resulting pointer in the following
	 * cases:
	 *
	 * 1. page size is equal to system page size
	 * 2. we have a requested address, and it is page-aligned, and we will
	 *    be discarding the address if we get a different one.
	 *
	 * for all other cases, alignment is potentially necessary.
	 */
	no_align = (requested_addr != NULL &&
		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
		!addr_is_hint) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;
		if (map_sz > SIZE_MAX) {
			RTE_LOG(ERR, EAL, "Map size too big\n");
			rte_errno = E2BIG;
			return NULL;
		}

		mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ,
				mmap_flags, -1, 0);
		if (mapped_addr == MAP_FAILED && allow_shrink)
			*size -= page_sz;

		if (mapped_addr != MAP_FAILED && addr_is_hint &&
				mapped_addr != requested_addr) {
			try++;
			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
				/* hint was not used. Try with another offset */
				munmap(mapped_addr, map_sz);
				mapped_addr = MAP_FAILED;
				requested_addr = next_baseaddr;
			}
		}
	} while ((allow_shrink || addr_is_hint) &&
		mapped_addr == MAP_FAILED && *size > 0);

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			strerror(errno));
		rte_errno = errno;
		return NULL;
	} else if (mapped_addr == MAP_FAILED) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			strerror(errno));
		/* pass errno up the call chain */
		rte_errno = errno;
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		munmap(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
			requested_addr, aligned_addr);
		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
	} else if (next_baseaddr != NULL) {
		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
	}

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	if (unmap) {
		munmap(mapped_addr, map_sz);
	} else if (!no_align) {
		void *map_end, *aligned_end;
		size_t before_len, after_len;

		/* when we reserve space with alignment, we add the alignment to
		 * the mapping size. On 32-bit, if 1GB alignment was requested,
		 * this would waste 1GB of address space, which is a luxury we
		 * cannot afford. So, if alignment was performed, check whether
		 * any unneeded address space can be unmapped.
		 */

		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
		aligned_end = RTE_PTR_ADD(aligned_addr, *size);

		/* unmap space before aligned mmap address */
		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
		if (before_len > 0)
			munmap(mapped_addr, before_len);

		/* unmap space after aligned end mmap address */
		after_len = RTE_PTR_DIFF(map_end, aligned_end);
		if (after_len > 0)
			munmap(aligned_end, after_len);
	}

	return aligned_addr;
}
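
/*
 * Usage sketch (illustrative; this is an EAL-internal helper, and the local
 * variable names are hypothetical): reserve roughly 1 GB of address space
 * aligned to 1 GB pages, shrinking the request if the full size cannot be
 * mapped:
 *
 *	size_t mem_sz = RTE_PGSIZE_1G;
 *	void *addr = eal_get_virtual_area(NULL, &mem_sz, RTE_PGSIZE_1G,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	if (addr == NULL)
 *		return -1; // rte_errno holds the reason
 */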

static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, msl->len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}
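
/*
 * Worked example for the index math above (hypothetical values): with
 * base_va = 0x100000000 and page_sz = 2 MB (0x200000), an address of
 * 0x100600000 gives an offset of 0x600000, and thus ms_idx = 3.
 */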

static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;
		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start, msl->len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

__rte_experimental struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}

struct virtiova {
	rte_iova_t iova;
	void *virt;
};
static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

__rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;

	memset(&vi, 0, sizeof(vi));

	vi.iova = iova;
	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_config.legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}

__rte_experimental struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}
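
/*
 * Usage sketch (illustrative; "buf" and "iova" are hypothetical): translate
 * a virtual address to its memseg, and an IOVA back to a virtual address.
 *
 *	const struct rte_memseg *ms = rte_mem_virt2memseg(buf, NULL);
 *	void *va = rte_mem_iova2virt(iova);
 */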

static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	if (msl->external)
		return 0;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}

static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx, fd;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32" fd:%i\n",
			msl_idx, ms_idx,
			ms->iova,
			ms->len,
			ms->addr,
			ms->socket_id,
			ms->hugepage_sz,
			ms->nchannel,
			ms->nrank,
			fd);

	return 0;
}

/*
 * Defined here because it is declared in rte_memory.h, but the actual
 * implementation is in eal_common_memalloc.c, like all other memalloc
 * internals.
 */
int __rte_experimental
rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
		void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_register(name, clb, arg);
}

int __rte_experimental
rte_mem_event_callback_unregister(const char *name, void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_unregister(name, arg);
}
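
/*
 * Usage sketch (illustrative; the callback name is hypothetical). The
 * callback type is rte_mem_event_callback_t from rte_memory.h:
 *
 *	static void
 *	mem_event_cb(enum rte_mem_event type, const void *addr, size_t len,
 *			void *arg __rte_unused)
 *	{
 *		if (type == RTE_MEM_EVENT_ALLOC)
 *			; // e.g. set up DMA mappings for [addr, addr + len)
 *		else
 *			; // RTE_MEM_EVENT_FREE: tear them down
 *	}
 *
 *	rte_mem_event_callback_register("my-driver", mem_event_cb, NULL);
 */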

int __rte_experimental
rte_mem_alloc_validator_register(const char *name,
		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
			limit);
}

int __rte_experimental
rte_mem_alloc_validator_unregister(const char *name, int socket_id)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}
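
/* e.g. rte_dump_physmem_layout(stdout) prints one line per in-use memseg */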

static int
check_iova(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	uint64_t *mask = arg;
	rte_iova_t iova;

	/* highest address within segment */
	iova = (ms->iova + ms->len) - 1;
	if (!(iova & *mask))
		return 0;

	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
			ms->iova, ms->len);

	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
	return 1;
}

#define MAX_DMA_MASK_BITS 63

/* check memseg iovas are within the required range based on dma mask */
static int __rte_experimental
check_dma_mask(uint8_t maskbits, bool thread_unsafe)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint64_t mask;
	int ret;

	/* Sanity check: we only allow mask widths that can be handled by a
	 * 64-bit variable; any higher value is almost certainly wrong. */
	if (maskbits > MAX_DMA_MASK_BITS) {
		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
				maskbits, MAX_DMA_MASK_BITS);
		return -1;
	}

	/* create dma mask */
	mask = ~((1ULL << maskbits) - 1);

	if (thread_unsafe)
		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
	else
		ret = rte_memseg_walk(check_iova, &mask);

	if (ret)
		/*
		 * The DMA mask precludes hugepage usage: this device cannot
		 * be used, and there is no need to keep the DMA mask.
		 */
		return 1;

	/*
	 * Keep the most restrictive maskbits for checking potential dynamic
	 * memory allocations in the future.
	 */
	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			     RTE_MIN(mcfg->dma_maskbits, maskbits);

	return 0;
}

int __rte_experimental
rte_mem_check_dma_mask(uint8_t maskbits)
{
	return check_dma_mask(maskbits, false);
}

int __rte_experimental
rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
{
	return check_dma_mask(maskbits, true);
}
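
/*
 * Usage sketch (illustrative): a driver limited to 40-bit IOVAs can verify
 * that all hugepage memory is reachable before proceeding:
 *
 *	if (rte_mem_check_dma_mask(40) != 0) {
 *		// some memory lies above the 40-bit boundary
 *		return -1;
 *	}
 */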

/*
 * Set the DMA mask to use once memory initialization is done.
 *
 * This function should ONLY be used by code executed before the memory
 * initialization. PMDs with device addressing limitations should use
 * rte_mem_check_dma_mask instead.
 */
void __rte_experimental
rte_mem_set_dma_mask(uint8_t maskbits)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;

	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			     RTE_MIN(mcfg->dma_maskbits, maskbits);
}
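
/*
 * Illustrative use (assumption: a bus or driver that knows its addressing
 * limit before hugepages are set up): calling rte_mem_set_dma_mask(39)
 * records the restriction so that later allocations are checked against it.
 */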

/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_config.force_nchannel;
	config->mem_config->nrank = internal_config.force_nrank;

	return 0;
}

/* Lock a page in physical memory and prevent it from being swapped. */
int
rte_mem_lock_page(const void *virt)
{
	unsigned long virtual = (unsigned long)virt;
	int page_size = getpagesize();
	unsigned long aligned = (virtual & ~(page_size - 1));
	return mlock((void *)aligned, page_size);
}
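
/*
 * Worked example of the masking above (hypothetical address): with a
 * 4096-byte page, ~(page_size - 1) is ~0xfff, so virt = 0x7f0012345678 is
 * rounded down to 0x7f0012345000 before being passed to mlock().
 */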

int __rte_experimental
rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}

int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}
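
/*
 * Usage sketch (illustrative; callback and variable names are hypothetical):
 * sum the length of every VA-contiguous run of segments.
 *
 *	static int
 *	count_contig(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused, size_t len,
 *			void *arg)
 *	{
 *		*(size_t *)arg += len;
 *		return 0; // keep walking; a non-zero return stops the walk
 *	}
 *
 *	size_t total = 0;
 *	rte_memseg_contig_walk(count_contig, &total);
 */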

int __rte_experimental
rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}

int __rte_experimental
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

int __rte_experimental
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret)
			return ret;
	}
	return 0;
}

int __rte_experimental
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

int __rte_experimental
rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int __rte_experimental
rte_memseg_get_fd(const struct rte_memseg *ms)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret;

	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_get_fd_thread_unsafe(ms);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

int __rte_experimental
rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
		size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL || offset == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int __rte_experimental
rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret;

	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

int __rte_experimental
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int socket_id, n;
	int ret = 0;

	if (va_addr == NULL || page_sz == 0 || len == 0 ||
			!rte_is_power_of_2(page_sz) ||
			RTE_ALIGN(len, page_sz) != len ||
			((len / page_sz) != n_pages && iova_addrs != NULL) ||
			!rte_is_aligned(va_addr, page_sz)) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/* make sure the segment doesn't already exist */
	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
		rte_errno = EEXIST;
		ret = -1;
		goto unlock;
	}

	/* get next available socket ID */
	socket_id = mcfg->next_socket_id;
	if (socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		ret = -1;
		goto unlock;
	}

	/* we can create a new memseg list */
	n = len / page_sz;
	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
			page_sz, "extmem", socket_id) == NULL) {
		ret = -1;
		goto unlock;
	}

	/* memseg list successfully created - increment next socket ID */
	mcfg->next_socket_id++;
unlock:
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
	return ret;
}
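
/*
 * Usage sketch (illustrative; "ext_va" and "ext_iova" are hypothetical, and
 * ext_va must be page-aligned): register 16 externally allocated 2 MB pages
 * with DPDK.
 *
 *	size_t page_sz = RTE_PGSIZE_2M;
 *	unsigned int n_pages = 16;
 *	// ext_va: buffer of n_pages * page_sz bytes
 *	// ext_iova: n_pages IOVA entries, or NULL if unknown
 *	if (rte_extmem_register(ext_va, n_pages * page_sz, ext_iova,
 *			n_pages, page_sz) != 0)
 *		return -1; // rte_errno is set (EINVAL, EEXIST, ENOSPC, ...)
 */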

int __rte_experimental
rte_extmem_unregister(void *va_addr, size_t len)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}

	ret = malloc_heap_destroy_external_seg(msl);
unlock:
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
	return ret;
}

static int
sync_memory(void *va_addr, size_t len, bool attach)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}
	if (attach)
		ret = rte_fbarray_attach(&msl->memseg_arr);
	else
		ret = rte_fbarray_detach(&msl->memseg_arr);

unlock:
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
	return ret;
}

int __rte_experimental
rte_extmem_attach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, true);
}

int __rte_experimental
rte_extmem_detach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, false);
}
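
/*
 * Note (informal): rte_extmem_attach()/rte_extmem_detach() only attach to or
 * detach from the fbarray backing an already registered external segment,
 * e.g. in a secondary process that wants to use memory registered by the
 * primary.
 */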

/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int retval;
	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);

	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
	return -1;
}