/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_memory.h>
#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_atomic.h>
#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
#include "malloc_mp.h"

/* start external socket IDs at a very high number */
#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */
#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))

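/*
 * Check whether an element backed by pages of size 'hugepage_sz' satisfies the
 * RTE_MEMZONE_*MB/GB size flags. Returns non-zero on a match (or when no
 * specific page size was requested), zero otherwise.
 */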
static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
	unsigned check_flag = 0;

	if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY))
		return 1;

	switch (hugepage_sz) {
	case RTE_PGSIZE_256K:
		check_flag = RTE_MEMZONE_256KB;
		break;
	case RTE_PGSIZE_2M:
		check_flag = RTE_MEMZONE_2MB;
		break;
	case RTE_PGSIZE_16M:
		check_flag = RTE_MEMZONE_16MB;
		break;
	case RTE_PGSIZE_256M:
		check_flag = RTE_MEMZONE_256MB;
		break;
	case RTE_PGSIZE_512M:
		check_flag = RTE_MEMZONE_512MB;
		break;
	case RTE_PGSIZE_1G:
		check_flag = RTE_MEMZONE_1GB;
		break;
	case RTE_PGSIZE_4G:
		check_flag = RTE_MEMZONE_4GB;
		break;
	case RTE_PGSIZE_16G:
		check_flag = RTE_MEMZONE_16GB;
	}

	return check_flag & flags;
}

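/* map a socket ID to the index of the heap that serves it, or -1 if none */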
int
malloc_socket_to_heap_id(unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i;

	for (i = 0; i < RTE_MAX_HEAPS; i++) {
		struct malloc_heap *heap = &mcfg->malloc_heaps[i];

		if (heap->socket_id == socket_id)
			return i;
	}
	return -1;
}

/*
 * Expand the heap with a memory area.
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
		void *start, size_t len)
{
	struct malloc_elem *elem = start;

	malloc_elem_init(elem, heap, msl, len, elem, len);

	malloc_elem_insert(elem);

	elem = malloc_elem_join_adjacent_free(elem);

	malloc_elem_free_list_insert(elem);

	return elem;
}

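/*
 * rte_memseg_contig_walk() callback: add an internal (non-external) contiguous
 * memseg area to the malloc heap that serves its socket.
 */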
static int
malloc_add_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct malloc_heap *heap;
	int msl_idx, heap_idx;

	if (msl->external)
		return 0;

	heap_idx = malloc_socket_to_heap_id(msl->socket_id);
	if (heap_idx < 0) {
		RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
		return -1;
	}
	heap = &mcfg->malloc_heaps[heap_idx];

	/* msl is const, so find it */
	msl_idx = msl - mcfg->memsegs;

	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	found_msl = &mcfg->memsegs[msl_idx];

	malloc_heap_add_memory(heap, found_msl, ms->addr, len);

	heap->total_size += len;

	RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
			msl->socket_id);
	return 0;
}

/*
 * Iterates through the freelist for a heap to find a free element
 * which can store data of the required size and with the requested alignment.
 * If size is 0, find the biggest available elem.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	size_t idx;
	struct malloc_elem *elem, *alt_elem = NULL;

	for (idx = malloc_elem_free_list_index(size);
			idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			if (malloc_elem_can_hold(elem, size, align, bound,
					contig)) {
				if (check_hugepage_sz(flags,
						elem->msl->page_sz))
					return elem;
				if (alt_elem == NULL)
					alt_elem = elem;
			}
		}
	}

	if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
		return alt_elem;

	return NULL;
}

/*
 * Iterates through the freelist for a heap to find a free element with the
 * biggest size and requested alignment. Will also set *size to the size of the
 * element that was found.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_biggest_element(struct malloc_heap *heap, size_t *size,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem, *max_elem = NULL;
	size_t idx, max_size = 0;

	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			size_t cur_size;
			if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
					!check_hugepage_sz(flags,
						elem->msl->page_sz))
				continue;
			if (contig) {
				cur_size =
					malloc_elem_find_max_iova_contig(elem,
							align);
			} else {
				void *data_start = RTE_PTR_ADD(elem,
						MALLOC_ELEM_HEADER_LEN);
				void *data_end = RTE_PTR_ADD(elem, elem->size -
						MALLOC_ELEM_TRAILER_LEN);
				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
						align);
				/* check if aligned data start is beyond end */
				if (aligned >= data_end)
					continue;
				cur_size = RTE_PTR_DIFF(data_end, aligned);
			}
			if (cur_size > max_size) {
				max_size = cur_size;
				max_elem = elem;
			}
		}
	}

	*size = max_size;
	return max_elem;
}

/*
 * Main function to allocate a block of memory from the heap.
 * Scans the free lists for a suitable element and reserves it; the caller
 * holds the heap lock and is responsible for expanding the heap if the scan
 * fails.
 */
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct malloc_elem *elem;

	size = RTE_CACHE_LINE_ROUNDUP(size);
	align = RTE_CACHE_LINE_ROUNDUP(align);

	elem = find_suitable_element(heap, size, flags, align, bound, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, bound, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

static void *
heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem;
	size_t size;

	align = RTE_CACHE_LINE_ROUNDUP(align);

	elem = find_biggest_element(heap, &size, flags, align, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, 0, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

/* this function is exposed in malloc_mp.h */
void
rollback_expand_heap(struct rte_memseg **ms, int n_segs,
		struct malloc_elem *elem, void *map_addr, size_t map_len)
{
	if (elem != NULL) {
		malloc_elem_free_list_remove(elem);
		malloc_elem_hide_region(elem, map_addr, map_len);
	}

	eal_memalloc_free_seg_bulk(ms, n_segs);
}

/* this function is exposed in malloc_mp.h */
struct malloc_elem *
alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig, struct rte_memseg **ms, int n_segs)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct malloc_elem *elem = NULL;
	size_t alloc_sz;
	int allocd_pages;
	void *ret, *map_addr;

	alloc_sz = (size_t)pg_sz * n_segs;

	/* first, check if we're allowed to allocate this memory */
	if (eal_memalloc_mem_alloc_validate(socket,
			heap->total_size + alloc_sz) < 0) {
		RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
		return NULL;
	}

	allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
			socket, true);

	/* make sure we've allocated our pages... */
	if (allocd_pages < 0)
		return NULL;

	map_addr = ms[0]->addr;
	msl = rte_mem_virt2memseg_list(map_addr);

	/* check if we wanted contiguous memory but didn't get it */
	if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
		RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
				__func__);
		goto fail;
	}

	/*
	 * Once we have all the memseg lists configured, if there is a DMA mask
	 * set, check that IOVA addresses are not out of range. Otherwise the
	 * device setting the DMA mask could have problems with the mapped
	 * memory.
	 *
	 * There are two situations when this can happen:
	 * 1) memory initialization
	 * 2) dynamic memory allocation
	 *
	 * For 1), an error when checking the DMA mask means the app cannot be
	 * executed. For 2), it means the new memory cannot be added.
	 */
	if (mcfg->dma_maskbits &&
	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		/*
		 * Currently this can only happen if IOMMU is enabled
		 * and the address width supported by the IOMMU hw is
		 * not enough for using the memory mapped IOVAs.
		 *
		 * If IOVA is VA, advise trying '--iova-mode pa', which
		 * could solve some situations when IOVA VA is not
		 * really needed.
		 */
		RTE_LOG(ERR, EAL,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n",
			__func__);

		/*
		 * If IOVA is VA and it is possible to run with IOVA PA,
		 * because the user is root, give advice on how to solve
		 * the problem.
		 */
		if ((rte_eal_iova_mode() == RTE_IOVA_VA) &&
				rte_eal_using_phys_addrs())
			RTE_LOG(ERR, EAL,
				"%s(): Please try initializing EAL with --iova-mode=pa parameter\n",
				__func__);
		goto fail;
	}

	/* add newly minted memsegs to malloc heap */
	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);

	/* try once more, as now we have allocated new memory */
	ret = find_suitable_element(heap, elt_size, flags, align, bound,
			contig);

	if (ret == NULL)
		goto fail;

	return elem;

fail:
	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
	return NULL;
}

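/*
 * Primary-process path for growing a heap: allocate enough pages to satisfy
 * the request, publish the new memory map to other processes via
 * request_sync(), and roll everything back if synchronization fails.
 */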
static int
try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct malloc_elem *elem;
	struct rte_memseg **ms;
	void *map_addr;
	size_t alloc_sz;
	int n_segs;
	bool callback_triggered = false;

	alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
			MALLOC_ELEM_TRAILER_LEN, pg_sz);
	n_segs = alloc_sz / pg_sz;

	/* we can't know in advance how many pages we'll need, so we malloc */
	ms = malloc(sizeof(*ms) * n_segs);
	if (ms == NULL)
		return -1;
	memset(ms, 0, sizeof(*ms) * n_segs);

	elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
			bound, contig, ms, n_segs);

	if (elem == NULL)
		goto free_ms;

	map_addr = ms[0]->addr;

	/* notify user about changes in memory map */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz);

	/* notify other processes that this has happened */
	if (request_sync()) {
		/* we couldn't ensure all processes have mapped memory,
		 * so free it back and notify everyone that it's been
		 * freed back.
		 *
		 * technically, we could've avoided adding memory addresses to
		 * the map, but that would've led to inconsistent behavior
		 * between primary and secondary processes, as those get
		 * callbacks during sync. therefore, force primary process to
		 * do alloc-and-rollback syncs as well.
		 */
		callback_triggered = true;
		goto free_elem;
	}
	heap->total_size += alloc_sz;

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
		socket, alloc_sz >> 20ULL);

	free(ms);

	return 0;

free_elem:
	if (callback_triggered)
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				map_addr, alloc_sz);

	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

	request_sync();
free_ms:
	free(ms);

	return -1;
}

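/*
 * Secondary-process path for growing a heap: forward the allocation request
 * to the primary process over the malloc multiprocess channel and wait for
 * the result.
 */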
static int
try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct malloc_mp_req req;
	int req_result;

	memset(&req, 0, sizeof(req));

	req.t = REQ_TYPE_ALLOC;
	req.alloc_req.align = align;
	req.alloc_req.bound = bound;
	req.alloc_req.contig = contig;
	req.alloc_req.flags = flags;
	req.alloc_req.elt_size = elt_size;
	req.alloc_req.page_sz = pg_sz;
	req.alloc_req.socket = socket;
	req.alloc_req.heap = heap; /* it's in shared memory */

	req_result = request_to_primary(&req);

	if (req_result != 0)
		return -1;

	if (req.result != REQ_RESULT_SUCCESS)
		return -1;

	return 0;
}

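/*
 * Grow the heap by pages of size 'pg_sz' so that an element of 'elt_size' can
 * be allocated. Takes the memory hotplug write lock and dispatches to the
 * primary- or secondary-process implementation.
 */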
static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret;

	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	} else {
		ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	}

	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
	return ret;
}

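/* qsort() comparator: order memseg lists by ascending page size */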
static int
compare_pagesz(const void *a, const void *b)
{
	const struct rte_memseg_list * const*mpa = a;
	const struct rte_memseg_list * const*mpb = b;
	const struct rte_memseg_list *msla = *mpa;
	const struct rte_memseg_list *mslb = *mpb;
	uint64_t pg_sz_a = msla->page_sz;
	uint64_t pg_sz_b = mslb->page_sz;

	if (pg_sz_a < pg_sz_b)
		return -1;
	if (pg_sz_a > pg_sz_b)
		return 1;
	return 0;
}

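/*
 * Request more memory for a heap on the given socket. Page sizes explicitly
 * requested via flags are tried first (smallest first); if that fails and the
 * size-hint flag is set, any other available page size is tried as a fallback.
 */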
static int
alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
	struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
	uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t prev_pg_sz;
	int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
	bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	void *ret;

	memset(requested_msls, 0, sizeof(requested_msls));
	memset(other_msls, 0, sizeof(other_msls));
	memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
	memset(other_pg_sz, 0, sizeof(other_pg_sz));

	/*
	 * go through the memseg lists and take note of all the page sizes
	 * available, and whether any of them were specifically requested by
	 * the user.
	 */
	n_requested_msls = 0;
	n_other_msls = 0;
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->socket_id != socket)
			continue;

		if (msl->base_va == NULL)
			continue;

		/* if pages of specific size were requested */
		if (size_flags != 0 && check_hugepage_sz(size_flags,
				msl->page_sz))
			requested_msls[n_requested_msls++] = msl;
		else if (size_flags == 0 || size_hint)
			other_msls[n_other_msls++] = msl;
	}

	/* sort the lists, smallest first */
	qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
			compare_pagesz);
	qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
			compare_pagesz);

	/* now, extract page sizes we are supposed to try */
	prev_pg_sz = 0;
	n_requested_pg_sz = 0;
	for (i = 0; i < n_requested_msls; i++) {
		uint64_t pg_sz = requested_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			requested_pg_sz[n_requested_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
	prev_pg_sz = 0;
	n_other_pg_sz = 0;
	for (i = 0; i < n_other_msls; i++) {
		uint64_t pg_sz = other_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			other_pg_sz[n_other_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}

	/* finally, try allocating memory of specified page sizes, starting from
	 * the smallest sizes
	 */
	for (i = 0; i < n_requested_pg_sz; i++) {
		uint64_t pg_sz = requested_pg_sz[i];

		/*
		 * do not pass the size hint here, as user expects other page
		 * sizes first, before resorting to best effort allocation.
		 */
		if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
				align, bound, contig))
			return 0;
	}
	if (n_other_pg_sz == 0)
		return -1;

	/* now, check if we can reserve anything with size hint */
	ret = find_suitable_element(heap, size, flags, align, bound, contig);
	if (ret != NULL)
		return 0;

	/*
	 * we still couldn't reserve memory, so try expanding heap with other
	 * page sizes, if there are any
	 */
	for (i = 0; i < n_other_pg_sz; i++) {
		uint64_t pg_sz = other_pg_sz[i];

		if (!try_expand_heap(heap, pg_sz, size, socket, flags,
				align, bound, contig))
			return 0;
	}
	return -1;
}

/* this will try lower page sizes first */
static void *
malloc_heap_alloc_on_heap_id(const char *type, size_t size,
		unsigned int heap_id, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	int socket_id;
	void *ret;

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	/* for legacy mode, try once and with all flags */
	if (internal_config.legacy_mem) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);
		goto alloc_unlock;
	}

	/*
	 * we do not pass the size hint here, because even if allocation fails,
	 * we may still be able to allocate memory from appropriate page sizes,
	 * we just need to request more memory first.
	 */

	socket_id = rte_socket_id_by_idx(heap_id);
	/*
	 * if socket ID is negative, we cannot find a socket ID for this heap -
	 * which means it's an external heap. those can have unexpected page
	 * sizes, so if the user asked to allocate from there - assume user
	 * knows what they're doing, and allow allocating from there with any
	 * page size flags.
	 */
	if (socket_id < 0)
		size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;

	ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
	if (ret != NULL)
		goto alloc_unlock;

	/* if socket ID is invalid, this is an external heap */
	if (socket_id < 0)
		goto alloc_unlock;

	if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
			bound, contig)) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);

		/* this should have succeeded */
		if (ret == NULL)
			RTE_LOG(ERR, EAL, "Error allocating from heap\n");
	}
alloc_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

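/*
 * Allocate from the heap selected by socket ID; with SOCKET_ID_ANY, fall back
 * to the other native DPDK heaps if the first attempt fails.
 */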
void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	int socket, heap_id, i;
	void *ret;

	/* return NULL if size is 0 or alignment is not power-of-2 */
	if (size == 0 || (align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
			bound, contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps. we are only iterating through native DPDK sockets,
	 * so external heaps won't be included.
	 */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		if (i == heap_id)
			continue;
		ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
				bound, contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

static void *
heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
		unsigned int flags, size_t align, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	void *ret;

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	ret = heap_alloc_biggest(heap, type, flags, align, contig);

	rte_spinlock_unlock(&(heap->lock));

	return ret;
}

void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
		size_t align, bool contig)
{
	int socket, i, cur_socket, heap_id;
	void *ret;

	/* return NULL if align is not power-of-2 */
	if ((align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages())
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
			contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		cur_socket = rte_socket_id_by_idx(i);
		if (cur_socket == socket)
			continue;
		ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
				contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

/* this function is exposed in malloc_mp.h */
int
malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
{
	int n_segs, seg_idx, max_seg_idx;
	struct rte_memseg_list *msl;
	size_t page_sz;

	msl = rte_mem_virt2memseg_list(aligned_start);
	if (msl == NULL)
		return -1;

	page_sz = (size_t)msl->page_sz;
	n_segs = aligned_len / page_sz;
	seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
	max_seg_idx = seg_idx + n_segs;

	for (; seg_idx < max_seg_idx; seg_idx++) {
		struct rte_memseg *ms;

		ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
		eal_memalloc_free_seg(ms);
	}
	return 0;
}

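/*
 * Free a previously allocated element back to its heap. Where possible (not in
 * legacy mode, not an external segment), any fully free, page-aligned span is
 * also returned to the system and other processes are notified.
 */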
int
malloc_heap_free(struct malloc_elem *elem)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap;
	void *start, *aligned_start, *end, *aligned_end;
	size_t len, aligned_len, page_sz;
	struct rte_memseg_list *msl;
	unsigned int i, n_segs, before_space, after_space;
	int ret;

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	/* elem may be merged with previous element, so keep heap address */
	heap = elem->heap;
	msl = elem->msl;
	page_sz = (size_t)msl->page_sz;

	rte_spinlock_lock(&(heap->lock));

	/* mark element as free */
	elem->state = ELEM_FREE;

	elem = malloc_elem_free(elem);

	/* anything after this is a bonus */
	ret = 0;

	/* ...of which we can't avail if we are in legacy mode, or if this is an
	 * externally allocated segment.
	 */
	if (internal_config.legacy_mem || (msl->external > 0))
		goto free_unlock;

	/* check if we can free any memory back to the system */
	if (elem->size < page_sz)
		goto free_unlock;

	/* if user requested to match allocations, the sizes must match - if not,
	 * we will defer freeing these hugepages until the entire original allocation
	 * can be freed
	 */
	if (internal_config.match_allocations && elem->size != elem->orig_size)
		goto free_unlock;

	/* probably, but let's make sure, as we may not be using up full page */
	start = elem;
	len = elem->size;
	aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz);
	end = RTE_PTR_ADD(elem, len);
	aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);

	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);

	/* can't free anything */
	if (aligned_len < page_sz)
		goto free_unlock;

	/* we can free something. however, some of these pages may be marked as
	 * unfreeable, so also check that as well
	 */
	n_segs = aligned_len / page_sz;
	for (i = 0; i < n_segs; i++) {
		const struct rte_memseg *tmp =
				rte_mem_virt2memseg(aligned_start, msl);

		if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			/* this is an unfreeable segment, so move start */
			aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len);
		}
	}

	/* recalculate length and number of segments */
	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
	n_segs = aligned_len / page_sz;

	/* check if we can still free some pages */
	if (n_segs == 0)
		goto free_unlock;

	/* We're not done yet. We also have to check if by freeing space we will
	 * be leaving free elements that are too small to store new elements.
	 * Check if we have enough space in the beginning and at the end, or if
	 * start/end are exactly page aligned.
	 */
	before_space = RTE_PTR_DIFF(aligned_start, elem);
	after_space = RTE_PTR_DIFF(end, aligned_end);
	if (before_space != 0 &&
			before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space before start, but we may be able to
		 * move the start forward by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move start */
		aligned_start = RTE_PTR_ADD(aligned_start, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}
	if (after_space != 0 && after_space <
			MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space after end, but we may be able to
		 * move the end backwards by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move end */
		aligned_end = RTE_PTR_SUB(aligned_end, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}

	/* now we can finally free us some pages */

	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

	/*
	 * we allow secondary processes to clear the heap of this allocated
	 * memory because it is safe to do so, as even if notifications about
	 * unmapped pages don't make it to other processes, heap is shared
	 * across all processes, and will become empty of this memory anyway,
	 * and nothing can allocate it back unless primary process will be able
	 * to deliver allocation message to every single running process.
	 */

	malloc_elem_free_list_remove(elem);

	malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

	heap->total_size -= aligned_len;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* notify user about changes in memory map */
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				aligned_start, aligned_len);

		/* don't care if any of this fails */
		malloc_heap_free_pages(aligned_start, aligned_len);

		request_sync();
	} else {
		struct malloc_mp_req req;

		memset(&req, 0, sizeof(req));

		req.t = REQ_TYPE_FREE;
		req.free_req.addr = aligned_start;
		req.free_req.len = aligned_len;

		/*
		 * we request primary to deallocate pages, but we don't do it
		 * in this thread. instead, we notify primary that we would like
		 * to deallocate pages, and this process will receive another
		 * request (in parallel) that will do it for us on another
		 * thread.
		 *
		 * we also don't really care if this succeeds - the data is
		 * already removed from the heap, so it is, for all intents and
		 * purposes, hidden from the rest of DPDK even if some other
		 * process (including this one) may have these pages mapped.
		 *
		 * notifications about deallocated memory happen during sync.
		 */
		request_to_primary(&req);
	}

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
		msl->socket_id, aligned_len >> 20ULL);

	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
free_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

int
malloc_heap_resize(struct malloc_elem *elem, size_t size)
{
	int ret;

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	rte_spinlock_lock(&(elem->heap->lock));

	ret = malloc_elem_resize(elem, size);

	rte_spinlock_unlock(&(elem->heap->lock));

	return ret;
}

/*
 * Function to retrieve allocation statistics for a given heap
 */
int
malloc_heap_get_stats(struct malloc_heap *heap,
		struct rte_malloc_socket_stats *socket_stats)
{
	size_t idx;
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	/* Initialise variables for heap */
	socket_stats->free_count = 0;
	socket_stats->heap_freesz_bytes = 0;
	socket_stats->greatest_free_size = 0;

	/* Iterate through free list */
	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
			!!elem; elem = LIST_NEXT(elem, free_list))
		{
			socket_stats->free_count++;
			socket_stats->heap_freesz_bytes += elem->size;
			if (elem->size > socket_stats->greatest_free_size)
				socket_stats->greatest_free_size = elem->size;
		}
	}
	/* Get stats on overall heap and allocated memory on this heap */
	socket_stats->heap_totalsz_bytes = heap->total_size;
	socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes -
			socket_stats->heap_freesz_bytes);
	socket_stats->alloc_count = heap->alloc_count;

	rte_spinlock_unlock(&heap->lock);
	return 0;
}

/*
 * Function to dump the contents of a given heap
 */
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
{
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
	fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);

	elem = heap->first;
	while (elem) {
		malloc_elem_dump(elem, f);
		elem = elem->next;
	}

	rte_spinlock_unlock(&heap->lock);
}

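/* remove an element backing an external segment from its heap */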
static int
destroy_elem(struct malloc_elem *elem, size_t len)
{
	struct malloc_heap *heap = elem->heap;

	/* notify all subscribers that a memory area is going to be removed */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);

	/* this element can be removed */
	malloc_elem_free_list_remove(elem);
	malloc_elem_hide_region(elem, elem, len);

	heap->total_size -= len;

	memset(elem, 0, sizeof(*elem));

	return 0;
}

struct rte_memseg_list *
malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz, const char *seg_name,
		unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	char fbarray_name[RTE_FBARRAY_NAME_LEN];
	struct rte_memseg_list *msl = NULL;
	struct rte_fbarray *arr;
	size_t seg_len = n_pages * page_sz;
	unsigned int i;

	/* first, find a free memseg list */
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *tmp = &mcfg->memsegs[i];
		if (tmp->base_va == NULL) {
			msl = tmp;
			break;
		}
	}
	if (msl == NULL) {
		RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
		rte_errno = ENOSPC;
		return NULL;
	}

	snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p",
			seg_name, va_addr);

	/* create the backing fbarray */
	if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
			sizeof(struct rte_memseg)) < 0) {
		RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
		return NULL;
	}
	arr = &msl->memseg_arr;

	/* fbarray created, fill it up */
	for (i = 0; i < n_pages; i++) {
		struct rte_memseg *ms;

		rte_fbarray_set_used(arr, i);
		ms = rte_fbarray_get(arr, i);
		ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
		ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
		ms->hugepage_sz = page_sz;
		ms->len = page_sz;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();
		ms->socket_id = socket_id;
	}

	/* set up the memseg list */
	msl->base_va = va_addr;
	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->len = seg_len;
	msl->version = 0;
	msl->external = 1;

	return msl;
}

struct extseg_walk_arg {
	void *va_addr;
	size_t len;
	struct rte_memseg_list *msl;
};

static int
extseg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct extseg_walk_arg *wa = arg;

	if (msl->base_va == wa->va_addr && msl->len == wa->len) {
		unsigned int found_idx;

		/* msl is const */
		found_idx = msl - mcfg->memsegs;
		wa->msl = &mcfg->memsegs[found_idx];
		return 1;
	}
	return 0;
}

struct rte_memseg_list *
malloc_heap_find_external_seg(void *va_addr, size_t len)
{
	struct extseg_walk_arg wa;
	int res;

	wa.va_addr = va_addr;
	wa.len = len;

	res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa);

	if (res != 1) {
		/* 0 means nothing was found, -1 shouldn't happen */
		if (res == 0)
			rte_errno = ENOENT;
		return NULL;
	}
	return wa.msl;
}

int
malloc_heap_destroy_external_seg(struct rte_memseg_list *msl)
{
	/* destroy the fbarray backing this memory */
	if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
		return -1;

	/* reset the memseg list */
	memset(msl, 0, sizeof(*msl));

	return 0;
}

int
malloc_heap_add_external_memory(struct malloc_heap *heap,
		struct rte_memseg_list *msl)
{
	/* erase contents of new memory */
	memset(msl->base_va, 0, msl->len);

	/* now, add newly minted memory to the malloc heap */
	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len);

	heap->total_size += msl->len;

	/* all done! */
	RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
			heap->name, msl->base_va);

	/* notify all subscribers that a new memory area has been added */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
			msl->base_va, msl->len);

	return 0;
}

int
malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
		size_t len)
{
	struct malloc_elem *elem = heap->first;

	/* find element with specified va address */
	while (elem != NULL && elem != va_addr) {
		elem = elem->next;
		/* stop if we've blown past our VA */
		if (elem > (struct malloc_elem *)va_addr) {
			rte_errno = ENOENT;
			return -1;
		}
	}
	/* check if element was found */
	if (elem == NULL || elem->msl->len != len) {
		rte_errno = ENOENT;
		return -1;
	}
	/* if element's size is not equal to segment len, segment is busy */
	if (elem->state == ELEM_BUSY || elem->size != len) {
		rte_errno = EBUSY;
		return -1;
	}
	return destroy_elem(elem, len);
}

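/*
 * Initialize an empty heap and assign it the next available pseudo socket ID.
 * Called with the memory hotplug write lock held.
 */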
int
malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint32_t next_socket_id = mcfg->next_socket_id;

	/* prevent overflow. did you really create 2 billion heaps??? */
	if (next_socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n");
		rte_errno = ENOSPC;
		return -1;
	}

	/* initialize empty heap */
	heap->alloc_count = 0;
	heap->first = NULL;
	heap->last = NULL;
	LIST_INIT(heap->free_head);
	rte_spinlock_init(&heap->lock);
	heap->total_size = 0;
	heap->socket_id = next_socket_id;

	/* we hold a global mem hotplug writelock, so it's safe to increment */
	mcfg->next_socket_id++;

	/* set up name */
	strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
	return 0;
}

int
malloc_heap_destroy(struct malloc_heap *heap)
{
	if (heap->alloc_count != 0) {
		RTE_LOG(ERR, EAL, "Heap is still in use\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->first != NULL || heap->last != NULL) {
		RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->total_size != 0)
		RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");

	/* after this, the lock will be dropped */
	memset(heap, 0, sizeof(*heap));

	return 0;
}

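/*
 * EAL initialization hook: name the per-socket heaps (primary process only),
 * register multiprocess malloc requests, and populate the heaps with all
 * IOVA-contiguous memory areas.
 */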
int
rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i;

	if (internal_config.match_allocations) {
		RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");
	}

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* assign min socket ID to external heaps */
		mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

		/* assign names to default DPDK heaps */
		for (i = 0; i < rte_socket_count(); i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];
			char heap_name[RTE_HEAP_NAME_MAX_LEN];
			int socket_id = rte_socket_id_by_idx(i);

			snprintf(heap_name, sizeof(heap_name) - 1,
					"socket_%i", socket_id);
			strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
			heap->socket_id = socket_id;
		}
	}

	if (register_mp_requests()) {
		RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
		rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
		return -1;
	}

	/* unlock mem hotplug here. it's safe for primary as no requests can
	 * even come before primary itself is fully initialized, and secondaries
	 * do not need to initialize the heap.
	 */
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	/* secondary process does not need to initialize anything */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* add all IOVA-contiguous areas to the heap */
	return rte_memseg_contig_walk(malloc_add_seg, NULL);
}