/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/queue.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_errno.h>
#include <rte_log.h>

#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memcfg.h"
#include "malloc_heap.h"

/*
 * Try to mmap *size bytes of anonymous memory. If successful, return the
 * pointer to the mmap'd area and keep *size unmodified. Else, retry
 * with a smaller zone: decrease *size by page_sz until it reaches
 * 0. In this case, return NULL. Note: this function returns an address
 * which is a multiple of page size.
 */

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static void *next_baseaddr;
static uint64_t system_page_sz;

#ifdef RTE_EXEC_ENV_LINUX
#define RTE_DONTDUMP MADV_DONTDUMP
#elif defined RTE_EXEC_ENV_FREEBSD
#define RTE_DONTDUMP MADV_NOCORE
#else
#error "madvise is not supported on this OS"
#endif

#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
		size_t page_sz, int flags, int mmap_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;
	uint8_t try = 0;

	if (system_page_sz == 0)
		system_page_sz = sysconf(_SC_PAGESIZE);

	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) internal_config.base_virtaddr;

#ifdef RTE_ARCH_64
	if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) eal_get_baseaddr();
#endif
	if (requested_addr == NULL && next_baseaddr != NULL) {
		requested_addr = next_baseaddr;
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* we don't need alignment of the resulting pointer in the following
	 * cases:
	 *
	 * 1. page size is equal to system page size
	 * 2. we have a requested address, and it is page-aligned, and we will
	 *    be discarding the address if we get a different one.
	 *
	 * for all other cases, alignment is potentially necessary.
	 */
	no_align = (requested_addr != NULL &&
		requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
		!addr_is_hint) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;
		if (map_sz > SIZE_MAX) {
			RTE_LOG(ERR, EAL, "Map size too big\n");
			rte_errno = E2BIG;
			return NULL;
		}

		mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
				mmap_flags, -1, 0);
		if (mapped_addr == MAP_FAILED && allow_shrink)
			*size -= page_sz;

		if (mapped_addr != MAP_FAILED && addr_is_hint &&
				mapped_addr != requested_addr) {
			try++;
			next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
			if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
				/* hint was not used. Try with another offset */
				munmap(mapped_addr, map_sz);
				mapped_addr = MAP_FAILED;
				requested_addr = next_baseaddr;
			}
		}
	} while ((allow_shrink || addr_is_hint) &&
		mapped_addr == MAP_FAILED && *size > 0);

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			strerror(errno));
		rte_errno = errno;
		return NULL;
	} else if (mapped_addr == MAP_FAILED) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			strerror(errno));
		/* pass errno up the call chain */
		rte_errno = errno;
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		munmap(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
			requested_addr, aligned_addr);
		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
	} else if (next_baseaddr != NULL) {
		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
	}

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	if (unmap) {
		munmap(mapped_addr, map_sz);
	} else if (!no_align) {
		void *map_end, *aligned_end;
		size_t before_len, after_len;

		/* when we reserve space with alignment, we add alignment to
		 * mapping size. On 32-bit, if 1GB alignment was requested, this
		 * would waste 1GB of address space, which is a luxury we cannot
		 * afford. so, if alignment was performed, check if any unneeded
		 * address space can be unmapped back.
		 */

		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
		aligned_end = RTE_PTR_ADD(aligned_addr, *size);

		/* unmap space before aligned mmap address */
		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
		if (before_len > 0)
			munmap(mapped_addr, before_len);

		/* unmap space after aligned end mmap address */
		after_len = RTE_PTR_DIFF(map_end, aligned_end);
		if (after_len > 0)
			munmap(aligned_end, after_len);
	}

	if (!unmap) {
		/* Exclude these pages from a core dump. */
		if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
			RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
				strerror(errno));
	}

	return aligned_addr;
}

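/*
 * Illustrative sketch (not part of the original file): how an EAL-internal
 * caller might reserve a 2 MB-aligned address range with the function above.
 * The sizes are hypothetical; the flags are the ones handled above. With
 * EAL_VIRTUAL_AREA_UNMAP set, the mapping is released again before returning,
 * so only the address range itself is reserved by convention.
 *
 *	size_t sz = RTE_PGSIZE_2M * 16;
 *	void *va = eal_get_virtual_area(NULL, &sz, RTE_PGSIZE_2M,
 *			EAL_VIRTUAL_AREA_ALLOW_SHRINK |
 *			EAL_VIRTUAL_AREA_UNMAP, 0);
 *	if (va == NULL)
 *		;	// reservation failed; rte_errno holds the cause
 */
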
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, msl->len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}

static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;
		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start, msl->len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}

struct virtiova {
	rte_iova_t iova;
	void *virt;
};
static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;

	memset(&vi, 0, sizeof(vi));

	vi.iova = iova;
	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_config.legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}

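/*
 * Illustrative sketch (not part of the original file): round-tripping an
 * address through the IOVA-to-VA lookup above. `buf` is a hypothetical
 * pointer into DPDK-managed memory; rte_mem_virt2iova() is the companion
 * lookup from rte_memory.h.
 *
 *	rte_iova_t iova = rte_mem_virt2iova(buf);
 *	void *va = rte_mem_iova2virt(iova);
 *	// va == buf for any address inside a registered segment
 */
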
struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}

static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	if (msl->external)
		return 0;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}

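/*
 * Illustrative sketch (not part of the original file): reporting the total
 * memory tracked by internal (non-external) memseg lists. Both stdio.h and
 * inttypes.h are already included above.
 *
 *	uint64_t sz = rte_eal_get_physmem_size();
 *	printf("DPDK manages %" PRIu64 " bytes of memory\n", sz);
 */
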
static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx, fd;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32" fd:%i\n",
			msl_idx, ms_idx,
			ms->iova,
			ms->len,
			ms->addr,
			ms->socket_id,
			ms->hugepage_sz,
			ms->nchannel,
			ms->nrank,
			fd);

	return 0;
}

/*
 * Defining here because declared in rte_memory.h, but the actual implementation
 * is in eal_common_memalloc.c, like all other memalloc internals.
 */
int
rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
		void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_register(name, clb, arg);
}

int
rte_mem_event_callback_unregister(const char *name, void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_unregister(name, arg);
}

int
rte_mem_alloc_validator_register(const char *name,
		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
			limit);
}

int
rte_mem_alloc_validator_unregister(const char *name, int socket_id)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}

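/*
 * Illustrative sketch (not part of the original file): despite the "on
 * console" comment above, any stream works, e.g. a log file (path below is
 * hypothetical).
 *
 *	rte_dump_physmem_layout(stdout);
 *
 *	FILE *log = fopen("/tmp/memseg-layout.txt", "w");
 *	if (log != NULL) {
 *		rte_dump_physmem_layout(log);
 *		fclose(log);
 *	}
 */
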
static int
check_iova(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	uint64_t *mask = arg;
	rte_iova_t iova;

	/* highest address within segment */
	iova = (ms->iova + ms->len) - 1;
	if (!(iova & *mask))
		return 0;

	RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
			ms->iova, ms->len);

	RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
	return 1;
}

#define MAX_DMA_MASK_BITS 63

/* check memseg iovas are within the required range based on dma mask */
static int
check_dma_mask(uint8_t maskbits, bool thread_unsafe)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint64_t mask;
	int ret;

	/* Sanity check: we only check that the mask width can be managed
	 * with 64-bit variables, as any higher value is likely wrong. */
	if (maskbits > MAX_DMA_MASK_BITS) {
		RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
				maskbits, MAX_DMA_MASK_BITS);
		return -1;
	}

	/* create dma mask */
	mask = ~((1ULL << maskbits) - 1);

	if (thread_unsafe)
		ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
	else
		ret = rte_memseg_walk(check_iova, &mask);

	if (ret)
		/*
		 * The dma mask precludes hugepage usage.
		 * This device can not be used and we do not need to keep
		 * the dma mask.
		 */
		return 1;

	/*
	 * we need to keep the more restricted maskbit for checking
	 * potential dynamic memory allocation in the future.
	 */
	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			     RTE_MIN(mcfg->dma_maskbits, maskbits);

	return 0;
}

int
rte_mem_check_dma_mask(uint8_t maskbits)
{
	return check_dma_mask(maskbits, false);
}

int
rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
{
	return check_dma_mask(maskbits, true);
}

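/*
 * Illustrative sketch (not part of the original file): a driver for a
 * hypothetical device that can only address 40 bits would verify that all
 * memseg IOVAs fit before using the memory. A nonzero return means at least
 * one segment lies above the 40-bit boundary (or maskbits was invalid).
 *
 *	if (rte_mem_check_dma_mask(40) != 0)
 *		;	// device cannot address all segments; fail init
 */
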
/*
 * Set the dma mask to use once memory initialization is done.
 *
 * This function should ONLY be used by code executed before the memory
 * initialization. PMDs should use rte_mem_check_dma_mask if the device
 * has addressing limitations.
 */
void
rte_mem_set_dma_mask(uint8_t maskbits)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;

	mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
			     RTE_MIN(mcfg->dma_maskbits, maskbits);
}

/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_config.force_nchannel;
	config->mem_config->nrank = internal_config.force_nrank;

	return 0;
}

/* Lock page in physical memory and prevent it from being swapped out. */
int
rte_mem_lock_page(const void *virt)
{
	unsigned long virtual = (unsigned long)virt;
	int page_size = getpagesize();
	unsigned long aligned = (virtual & ~(page_size - 1));
	return mlock((void *)aligned, page_size);
}

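/*
 * Illustrative sketch (not part of the original file): pinning the page
 * that backs an arbitrary object. `obj` is a hypothetical pointer; note
 * that the whole surrounding system page is locked, not just the object.
 *
 *	if (rte_mem_lock_page(obj) != 0)
 *		;	// mlock() failed, e.g. RLIMIT_MEMLOCK exceeded
 */
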
int
rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}

int
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}

int
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

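/*
 * Illustrative sketch (not part of the original file): a walk callback that
 * counts used segments. Returning 0 continues the walk; a nonzero return
 * stops it early and is propagated, as the loops above show.
 *
 *	static int
 *	count_cb(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return 0;
 *	}
 *
 *	int n_segs = 0;
 *	rte_memseg_walk(count_cb, &n_segs);
 */
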
int
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret)
			return ret;
	}
	return 0;
}

int
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_mcfg_mem_read_lock();
	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd(const struct rte_memseg *ms)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_thread_unsafe(ms);
	rte_mcfg_mem_read_unlock();

	return ret;
}

int
rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
		size_t *offset)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct rte_fbarray *arr;
	int msl_idx, seg_idx, ret;

	if (ms == NULL || offset == NULL) {
		rte_errno = EINVAL;
		return -1;
	}

	msl = rte_mem_virt2memseg_list(ms->addr);
	if (msl == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	arr = &msl->memseg_arr;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = rte_fbarray_find_idx(arr, ms);

	if (!rte_fbarray_is_used(arr, seg_idx)) {
		rte_errno = ENOENT;
		return -1;
	}

	/* segment fd API is not supported for external segments */
	if (msl->external) {
		rte_errno = ENOTSUP;
		return -1;
	}

	ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
	if (ret < 0) {
		rte_errno = -ret;
		ret = -1;
	}
	return ret;
}

int
rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
{
	int ret;

	rte_mcfg_mem_read_lock();
	ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
	rte_mcfg_mem_read_unlock();

	return ret;
}

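/*
 * Illustrative sketch (not part of the original file): recovering the file
 * and offset backing a segment, e.g. to hand the mapping to another process.
 * `ms` is a hypothetical memseg obtained via rte_mem_virt2memseg().
 *
 *	size_t offset;
 *	int fd = rte_memseg_get_fd(ms);
 *	if (fd >= 0 && rte_memseg_get_fd_offset(ms, &offset) == 0)
 *		;	// ms->addr is backed by fd at this offset
 */
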
int
rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int socket_id, n;
	int ret = 0;

	if (va_addr == NULL || page_sz == 0 || len == 0 ||
			!rte_is_power_of_2(page_sz) ||
			RTE_ALIGN(len, page_sz) != len ||
			((len / page_sz) != n_pages && iova_addrs != NULL) ||
			!rte_is_aligned(va_addr, page_sz)) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* make sure the segment doesn't already exist */
	if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
		rte_errno = EEXIST;
		ret = -1;
		goto unlock;
	}

	/* get next available socket ID */
	socket_id = mcfg->next_socket_id;
	if (socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		ret = -1;
		goto unlock;
	}

	/* we can create a new memseg */
	n = len / page_sz;
	if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
			page_sz, "extmem", socket_id) == NULL) {
		ret = -1;
		goto unlock;
	}

	/* memseg list successfully created - increment next socket ID */
	mcfg->next_socket_id++;
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

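/*
 * Illustrative sketch (not part of the original file): registering an
 * externally allocated, page-aligned buffer with DPDK. `ext_buf` is a
 * hypothetical pointer; as the validation above suggests, a NULL IOVA
 * table is accepted and simply leaves the segments without IOVAs.
 *
 *	size_t page_sz = RTE_PGSIZE_2M;
 *	size_t len = 16 * page_sz;
 *	if (rte_extmem_register(ext_buf, len, NULL, len / page_sz,
 *			page_sz) != 0)
 *		;	// registration failed; rte_errno holds the cause
 */
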
int
rte_extmem_unregister(void *va_addr, size_t len)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}

	ret = malloc_heap_destroy_external_seg(msl);
unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

static int
sync_memory(void *va_addr, size_t len, bool attach)
{
	struct rte_memseg_list *msl;
	int ret = 0;

	if (va_addr == NULL || len == 0) {
		rte_errno = EINVAL;
		return -1;
	}
	rte_mcfg_mem_write_lock();

	/* find our segment */
	msl = malloc_heap_find_external_seg(va_addr, len);
	if (msl == NULL) {
		rte_errno = ENOENT;
		ret = -1;
		goto unlock;
	}
	if (attach)
		ret = rte_fbarray_attach(&msl->memseg_arr);
	else
		ret = rte_fbarray_detach(&msl->memseg_arr);

unlock:
	rte_mcfg_mem_write_unlock();
	return ret;
}

int
rte_extmem_attach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, true);
}

int
rte_extmem_detach(void *va_addr, size_t len)
{
	return sync_memory(va_addr, len, false);
}

/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int retval;
	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_mcfg_mem_read_lock();

	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_mcfg_mem_read_unlock();
	return -1;
}