/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2018 Intel Corporation
 */

#define _FILE_OFFSET_BITS 64
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <signal.h>
#include <setjmp.h>
#ifdef RTE_ARCH_PPC_64
#include <sys/ioctl.h>
#endif
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#include <numaif.h>
#endif
#include <linux/falloc.h>
#include <linux/mman.h> /* for hugetlb-related mmap flags */

#include <rte_common.h>
#include <rte_log.h>
#include <rte_eal_memconfig.h>
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_spinlock.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_private.h"

const int anonymous_hugepages_supported =
#ifdef MAP_HUGE_SHIFT
		1;
#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
#else
		0;
#define RTE_MAP_HUGE_SHIFT 26
#endif

/*
 * not all kernel versions support fallocate on hugetlbfs, so fall back to
 * ftruncate and disallow deallocation if fallocate is not supported.
 */
static int fallocate_supported = -1; /* unknown */

/* for single-file segments, we need some kind of mechanism to keep track of
 * which hugepages can be freed back to the system, and which cannot. we cannot
 * use flock() because it doesn't allow locking parts of a file, and we cannot
 * use fcntl() due to issues with its semantics, so we will have to rely on a
 * bunch of lockfiles for each page.
 *
 * we cannot know how many pages a system will have in advance, but we do know
 * that they come in lists, and we know the lengths of these lists. so, simply
 * store a malloc'd array of fd's indexed by list and segment index.
 *
 * they will be initialized at startup, and filled as we allocate/deallocate
 * segments. also, use this to track the memseg list's own fd.
 */
static struct {
	int *fds; /**< dynamically allocated array of segment lock fd's */
	int memseg_list_fd; /**< memseg list fd */
	int len; /**< total length of the array */
	int count; /**< entries used in the array */
} lock_fds[RTE_MAX_MEMSEG_LISTS];

/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];

static sigjmp_buf huge_jmpenv;
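
/* SIGBUS handler: jump back to the point saved by sigsetjmp() before we first
 * touch a newly mapped hugepage, so that a fault-time allocation failure does
 * not kill the process.
 */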
static void __rte_unused
huge_sigbus_handler(int signo __rte_unused)
{
	siglongjmp(huge_jmpenv, 1);
}

/* Put setjmp into a wrap method to avoid a compile error. Any non-volatile,
 * non-static local variable in the stack frame calling sigsetjmp might be
 * clobbered by a call to longjmp.
 */
static int __rte_unused
huge_wrap_sigsetjmp(void)
{
	return sigsetjmp(huge_jmpenv, 1);
}

static struct sigaction huge_action_old;
static int huge_need_recover;

static void __rte_unused
huge_register_sigbus(void)
{
	sigset_t mask;
	struct sigaction action;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);

	action.sa_flags = 0;
	action.sa_mask = mask;
	action.sa_handler = huge_sigbus_handler;

	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
}

static void __rte_unused
huge_recover_sigbus(void)
{
	if (huge_need_recover) {
		sigaction(SIGBUS, &huge_action_old, NULL);
		huge_need_recover = 0;
	}
}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
static bool
check_numa(void)
{
	bool ret = true;
	/* Check if kernel supports NUMA. */
	if (numa_available() != 0) {
		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
		ret = false;
	}
	return ret;
}

static void
prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
{
	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
	if (get_mempolicy(oldpolicy, oldmask->maskp,
			oldmask->size + 1, 0, 0) < 0) {
		RTE_LOG(ERR, EAL,
			"Failed to get current mempolicy: %s. "
			"Assuming MPOL_DEFAULT.\n", strerror(errno));
		*oldpolicy = MPOL_DEFAULT;
	}
	RTE_LOG(DEBUG, EAL,
		"Setting policy MPOL_PREFERRED for socket %d\n",
		socket_id);
	numa_set_preferred(socket_id);
}

static void
restore_numa(int *oldpolicy, struct bitmask *oldmask)
{
	RTE_LOG(DEBUG, EAL,
		"Restoring previous memory policy: %d\n", *oldpolicy);
	if (*oldpolicy == MPOL_DEFAULT) {
		numa_set_localalloc();
	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
			oldmask->size + 1) < 0) {
		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
			strerror(errno));
		numa_set_localalloc();
	}
	numa_free_cpumask(oldmask);
}
#endif

/*
 * uses fstat to report the size of a file on disk
 */
static off_t
get_file_size(int fd)
{
	struct stat st;
	if (fstat(fd, &st) < 0)
		return 0;
	return st.st_size;
}

/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
	int ret;

	/* flock may be interrupted */
	do {
		ret = flock(fd, type | LOCK_NB);
	} while (ret && errno == EINTR);

	if (ret && errno == EWOULDBLOCK) {
		/* couldn't take the lock, but that is not an error */
		return 0;
	} else if (ret) {
		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
			__func__, strerror(errno));
		return -1;
	}
	/* lock was successful */
	return 1;
}
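
/* return (creating it if necessary) the lockfile fd for a given segment; a
 * shared lock on the lockfile marks the segment as in use in
 * single-file-segments mode.
 */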
static int get_segment_lock_fd(int list_idx, int seg_idx)
{
	char path[PATH_MAX] = {0};
	int fd;

	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
		return -1;
	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
		return -1;

	fd = lock_fds[list_idx].fds[seg_idx];
	/* does this lock already exist? */
	if (fd >= 0)
		return fd;

	eal_get_hugefile_lock_path(path, sizeof(path),
			list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);

	fd = open(path, O_CREAT | O_RDWR, 0660);
	if (fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
			__func__, path, strerror(errno));
		return -1;
	}
	/* take out a read lock */
	if (lock(fd, LOCK_SH) != 1) {
		RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
			__func__, path, strerror(errno));
		close(fd);
		return -1;
	}
	/* store it for future reference */
	lock_fds[list_idx].fds[seg_idx] = fd;
	lock_fds[list_idx].count++;
	return fd;
}
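
/* drop our reference to a segment's lockfile; if we manage to upgrade to an
 * exclusive lock, nobody else is using the segment and the lockfile itself can
 * be removed.
 */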
static int unlock_segment(int list_idx, int seg_idx)
{
	int fd, ret;

	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
		return -1;
	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
		return -1;

	fd = lock_fds[list_idx].fds[seg_idx];

	/* upgrade lock to exclusive to see if we can remove the lockfile */
	ret = lock(fd, LOCK_EX);
	if (ret == 1) {
		/* we've succeeded in taking exclusive lock, this lockfile may
		 * be removed.
		 */
		char path[PATH_MAX] = {0};
		eal_get_hugefile_lock_path(path, sizeof(path),
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
		if (unlink(path)) {
			RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
					__func__, path, strerror(errno));
		}
	}
	/* we don't want to leak the fd, so even if we fail to lock, close fd
	 * and remove it from list anyway.
	 */
	close(fd);
	lock_fds[list_idx].fds[seg_idx] = -1;
	lock_fds[list_idx].count--;

	if (ret < 0)
		return -1;
	return 0;
}
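
/* open (creating if needed) the hugepage file backing a segment and take out a
 * shared lock on it. in single-file-segments mode there is one file per memseg
 * list and its fd is cached in lock_fds.
 */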
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	int fd;

	if (internal_config.single_file_segments) {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);

		fd = lock_fds[list_idx].memseg_list_fd;

		if (fd < 0) {
			fd = open(path, O_CREAT | O_RDWR, 0600);
			if (fd < 0) {
				RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
			/* take out a read lock and keep it indefinitely */
			if (lock(fd, LOCK_SH) < 0) {
				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
					__func__, strerror(errno));
				close(fd);
				return -1;
			}
			lock_fds[list_idx].memseg_list_fd = fd;
		}
	} else {
		/* create a hugepage file path */
		eal_get_hugefile_path(path, buflen, hi->hugedir,
				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
		fd = open(path, O_CREAT | O_RDWR, 0600);
		if (fd < 0) {
			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
					strerror(errno));
			return -1;
		}
		/* take out a read lock */
		if (lock(fd, LOCK_SH) < 0) {
			RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
				__func__, strerror(errno));
			close(fd);
			return -1;
		}
	}
	return fd;
}
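
/* grow or shrink the file backing a single-file-segments memseg list: grow via
 * fallocate()/ftruncate(), shrink by punching a hole, using per-segment
 * lockfiles to decide whether a page may safely be returned to the system.
 */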
static int
resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
		uint64_t fa_offset, uint64_t page_sz, bool grow)
{
	bool again = false;

	do {
		if (fallocate_supported == 0) {
			/* we cannot deallocate memory if fallocate() is not
			 * supported, and hugepage file is already locked at
			 * creation, so no further synchronization needed.
			 */

			if (!grow) {
				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
					__func__);
				return -1;
			}
			uint64_t new_size = fa_offset + page_sz;
			uint64_t cur_size = get_file_size(fd);

			/* fallocate isn't supported, fall back to ftruncate */
			if (new_size > cur_size &&
					ftruncate(fd, new_size) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				return -1;
			}
		} else {
			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE;
			int ret, lock_fd;

			/* if fallocate() is supported, we need to take out a
			 * read lock on allocate (to prevent other processes
			 * from deallocating this page), and take out a write
			 * lock on deallocate (to ensure nobody else is using
			 * this page).
			 *
			 * read locks on page itself are already taken out at
			 * file creation, in get_seg_fd().
			 *
			 * we cannot rely on simple use of flock() call, because
			 * we need to be able to lock a section of the file,
			 * and we cannot use fcntl() locks, because of numerous
			 * problems with their semantics, so we will use
			 * deterministically named lock files for each section
			 * of the file.
			 *
			 * if we're shrinking the file, we want to upgrade our
			 * lock from shared to exclusive.
			 *
			 * lock_fd is an fd for a lockfile, not for the segment
			 * list.
			 */
			lock_fd = get_segment_lock_fd(list_idx, seg_idx);

			if (!grow) {
				/* we are using this lockfile to determine
				 * whether this particular page is locked, as we
				 * are in single file segments mode and thus
				 * cannot use regular flock() to get this info.
				 *
				 * we want to try and take out an exclusive lock
				 * on the lock file to determine if we're the
				 * last ones using this page, and if not, we
				 * won't be shrinking it, and will instead exit
				 * prematurely.
				 */
				ret = lock(lock_fd, LOCK_EX);

				/* drop the lock on the lockfile, so that even
				 * if we couldn't shrink the file ourselves, we
				 * are signalling to other processes that we're
				 * no longer using this page.
				 */
				if (unlock_segment(list_idx, seg_idx))
					RTE_LOG(ERR, EAL, "Could not unlock segment\n");

				/* additionally, if this was the last lock on
				 * this segment list, we can safely close the
				 * page file fd, so that one of the processes
				 * could then delete the file after shrinking.
				 */
				if (ret < 1 && lock_fds[list_idx].count == 0) {
					close(fd);
					lock_fds[list_idx].memseg_list_fd = -1;
				}

				if (ret < 0) {
					RTE_LOG(ERR, EAL, "Could not lock segment\n");
					return -1;
				}
				if (ret == 0)
					/* failed to lock, not an error. */
					return 0;
			}

			/* grow or shrink the file */
			ret = fallocate(fd, flags, fa_offset, page_sz);

			if (ret < 0) {
				if (fallocate_supported == -1 &&
						errno == ENOTSUP) {
					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
						__func__);
					again = true;
					fallocate_supported = 0;
				} else {
					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
						__func__,
						strerror(errno));
					return -1;
				}
			} else {
				fallocate_supported = 1;

				/* we've grown/shrunk the file, and we hold an
				 * exclusive lock now. check if there are no
				 * more segments active in this segment list,
				 * and remove the file if there aren't.
				 */
				if (!grow && lock_fds[list_idx].count == 0) {
					if (unlink(path))
						RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
							__func__, path,
							strerror(errno));
					close(fd);
					lock_fds[list_idx].memseg_list_fd = -1;
				}
			}
		}
	} while (again);
	return 0;
}
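
/* allocate and map one hugepage-backed segment at a fixed virtual address,
 * fault it in, check its IOVA and NUMA placement, and fill in the rte_memseg;
 * on failure, re-reserve the hole left in our virtual address space.
 */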
static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	size_t alloc_sz;
	int flags;
	void *new_addr;

	alloc_sz = hi->hugepage_sz;
	if (!internal_config.single_file_segments &&
			internal_config.in_memory &&
			anonymous_hugepages_supported) {
		int log2, flags;

		log2 = rte_log2_u32(alloc_sz);
		/* as per mmap() manpage, all page sizes are log2 of page size
		 * shifted by MAP_HUGE_SHIFT
		 */
		flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		fd = -1;
		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);

		/* single-file segments codepath will never be active because
		 * in-memory mode is incompatible with it and it's stopped at
		 * EAL initialization stage, however the compiler doesn't know
		 * that and complains about map_offset being used uninitialized
		 * on failure codepaths while having in-memory mode enabled. so,
		 * assign a value here.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_config.single_file_segments) {
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, path, list_idx, seg_idx,
					map_offset, alloc_sz, true);
			if (ret < 0)
				goto resized;
		} else {
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_config.hugepage_unlink) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}

		/*
		 * map the segment, and populate page tables, the kernel fills
		 * this segment with zeros if it's a new page.
		 */
		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
				map_offset);
	}

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. Kernel will send
	 * a SIGBUS signal. To avoid being killed, save the stack
	 * environment here; if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value, and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);

	if (cur_socket_id != socket_id) {
		RTE_LOG(DEBUG, EAL,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
				__func__, socket_id, cur_socket_id);
		goto mapped;
	}
#endif
	/* for non-single file segments that aren't in-memory, we can close fd
	 * here */
	if (!internal_config.single_file_segments && !internal_config.in_memory)
		close(fd);

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

	return 0;

mapped:
	munmap(addr, alloc_sz);
unmapped:
	flags = MAP_FIXED;
#ifdef RTE_ARCH_PPC_64
	flags |= MAP_HUGETLB;
#endif
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * override it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
resized:
	/* in-memory mode will never be single-file-segments mode */
	if (internal_config.single_file_segments) {
		resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
				alloc_sz, false);
		/* ignore failure, can't make it any worse */
	} else {
		/* only remove file if we can take out a write lock */
		if (internal_config.hugepage_unlink == 0 &&
				internal_config.in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
	}
	return -1;
}
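
/* give a segment's hugepage back: scrub its contents, remap the VA range as
 * anonymous memory so the address space stays reserved, and shrink or unlink
 * the backing file depending on the configured mode.
 */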
static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
		unsigned int list_idx, unsigned int seg_idx)
{
	uint64_t map_offset;
	char path[PATH_MAX];
	int fd, ret;

	/* erase page data */
	memset(ms->addr, 0, ms->len);

	if (mmap(ms->addr, ms->len, PROT_READ,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
				MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
		return -1;
	}

	/* if we've already unlinked the page, nothing needs to be done */
	if (internal_config.hugepage_unlink) {
		memset(ms, 0, sizeof(*ms));
		return 0;
	}

	/* if we are not in single file segments mode, we're going to unmap the
	 * segment and thus drop the lock on original fd, but hugepage dir is
	 * now locked so we can take out another one without races.
	 */
	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
	if (fd < 0)
		return -1;

	if (internal_config.single_file_segments) {
		map_offset = seg_idx * ms->len;
		if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
				ms->len, false))
			return -1;
		ret = 0;
	} else {
		/* if we're able to take out a write lock, we're the last one
		 * holding onto this page.
		 */
		ret = lock(fd, LOCK_EX);
		if (ret >= 0) {
			/* no one else is using this page */
			if (ret == 1)
				unlink(path);
		}
		/* closing fd will drop the lock */
		close(fd);
	}

	memset(ms, 0, sizeof(*ms));

	return ret < 0 ? -1 : 0;
}

struct alloc_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg **ms;
	size_t page_sz;
	unsigned int segs_allocated;
	unsigned int n_segs;
	int socket;
	bool exact;
};
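
/* memseg list walk callback: try to allocate wa->n_segs pages from a list that
 * matches the requested page size and socket, rolling back if an exact count
 * was requested and cannot be met. returns 1 on success, 0 to keep walking,
 * -1 on error.
 */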
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
	if (cur_idx < 0)
		return 0;
	start_idx = cur_idx;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up the segments we did allocate */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list of allocated segments */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	wa->segs_allocated = i;
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	return 1;
}

struct free_walk_param {
	struct hugepage_info *hi;
	struct rte_memseg *ms;
};
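
/* memseg list walk callback: find the list containing wa->ms, mark the segment
 * free in the shared map and release its hugepage.
 */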
static int
free_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct free_walk_param *wa = arg;
	uintptr_t start_addr, end_addr;
	int msl_idx, seg_idx, ret, dir_fd = -1;

	start_addr = (uintptr_t) msl->base_va;
	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;

	if ((uintptr_t)wa->ms->addr < start_addr ||
			(uintptr_t)wa->ms->addr >= end_addr)
		return 0;

	msl_idx = msl - mcfg->memsegs;
	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;

	/* msl is const, so find the writable copy in the shared config */
	found_msl = &mcfg->memsegs[msl_idx];

	/* do not allow any page allocations during the time we're freeing,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	found_msl->version++;

	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);

	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);

	if (dir_fd >= 0)
		close(dir_fd);

	if (ret < 0)
		return -1;

	return 1;
}
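
/* allocate n_segs pages of page_sz bytes on the given socket; when 'exact' is
 * set, either the full request succeeds or the allocation is rolled back.
 */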
int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
		int socket, bool exact)
{
	int i, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	bool have_numa = false;
	int oldpolicy;
	struct bitmask *oldmask;
#endif
	struct alloc_walk_param wa;
	struct hugepage_info *hi = NULL;

	memset(&wa, 0, sizeof(wa));

	/* dynamic allocation not supported in legacy mode */
	if (internal_config.legacy_mem)
		return -1;

	for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
		if (page_sz ==
				internal_config.hugepage_info[i].hugepage_sz) {
			hi = &internal_config.hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
			__func__);
		return -1;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (check_numa()) {
		oldmask = numa_allocate_nodemask();
		prepare_numa(&oldpolicy, oldmask, socket);
		have_numa = true;
	}
#endif

	wa.exact = exact;
	wa.hi = hi;
	wa.ms = ms;
	wa.n_segs = n_segs;
	wa.page_sz = page_sz;
	wa.socket = socket;
	wa.segs_allocated = 0;

	/* memalloc is locked, so it's safe to use thread-unsafe version */
	ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
	if (ret == 0) {
		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
			__func__);
		ret = -1;
	} else if (ret > 0) {
		ret = (int)wa.segs_allocated;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (have_numa)
		restore_numa(&oldpolicy, oldmask);
#endif
	return ret;
}

struct rte_memseg *
eal_memalloc_alloc_seg(size_t page_sz, int socket)
{
	struct rte_memseg *ms;

	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
		return NULL;
	/* return pointer to newly allocated memseg */
	return ms;
}
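
/* Illustrative sketch only (not part of the original file): an internal EAL
 * caller would typically allocate and release a single page along these lines,
 * assuming a 2M hugepage size and socket 0 are valid on the system:
 *
 *	struct rte_memseg *seg = eal_memalloc_alloc_seg(RTE_PGSIZE_2M, 0);
 *	if (seg != NULL)
 *		eal_memalloc_free_seg(seg);
 */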
int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
{
	int seg, ret = 0;

	/* dynamic free not supported in legacy mode */
	if (internal_config.legacy_mem)
		return -1;

	for (seg = 0; seg < n_segs; seg++) {
		struct rte_memseg *cur = ms[seg];
		struct hugepage_info *hi = NULL;
		struct free_walk_param wa;
		int i, walk_res;

		/* if this page is marked as unfreeable, fail */
		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
			ret = -1;
			continue;
		}

		memset(&wa, 0, sizeof(wa));

		for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
				i++) {
			hi = &internal_config.hugepage_info[i];
			if (cur->hugepage_sz == hi->hugepage_sz)
				break;
		}
		if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
			ret = -1;
			continue;
		}

		wa.ms = cur;
		wa.hi = hi;

		/* memalloc is locked, so it's safe to use thread-unsafe version
		 */
		walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
				&wa);
		if (walk_res == 1)
			continue;
		if (walk_res == 0)
			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
		ret = -1;
	}
	return ret;
}

int
eal_memalloc_free_seg(struct rte_memseg *ms)
{
	/* dynamic free not supported in legacy mode */
	if (internal_config.legacy_mem)
		return -1;

	return eal_memalloc_free_seg_bulk(&ms, 1);
}
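
/* synchronize one contiguous run of segments between the primary's memseg list
 * and our local copy, firing the alloc/free callbacks once per chunk rather
 * than per page. returns the number of segments processed, or -1 on error.
 */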
static int
sync_chunk(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used, int start, int end)
{
	struct rte_fbarray *l_arr, *p_arr;
	int i, ret, chunk_len, diff_len;

	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	/* we need to aggregate allocations/deallocations into bigger chunks,
	 * as we don't want to spam the user with per-page callbacks.
	 *
	 * to avoid any potential issues, we also want to trigger
	 * deallocation callbacks *before* we actually deallocate
	 * memory, so that the user application could wrap up its use
	 * before it goes away.
	 */

	chunk_len = end - start;

	/* find how many contiguous pages we can map/unmap for this chunk */
	diff_len = used ?
			rte_fbarray_find_contig_free(l_arr, start) :
			rte_fbarray_find_contig_used(l_arr, start);

	/* has to be at least one page */
	if (diff_len < 1)
		return -1;

	diff_len = RTE_MIN(chunk_len, diff_len);

	/* if we are freeing memory, notify the application */
	if (!used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				start_va, len);
	}

	for (i = 0; i < diff_len; i++) {
		struct rte_memseg *p_ms, *l_ms;
		int seg_idx = start + i;

		l_ms = rte_fbarray_get(l_arr, seg_idx);
		p_ms = rte_fbarray_get(p_arr, seg_idx);

		if (l_ms == NULL || p_ms == NULL)
			return -1;

		if (used) {
			ret = alloc_seg(l_ms, p_ms->addr,
					p_ms->socket_id, hi,
					msl_idx, seg_idx);
			if (ret < 0)
				return -1;
			rte_fbarray_set_used(l_arr, seg_idx);
		} else {
			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
			rte_fbarray_set_free(l_arr, seg_idx);
			if (ret < 0)
				return -1;
		}
	}

	/* if we just allocated memory, notify the application */
	if (used) {
		struct rte_memseg *ms;
		void *start_va;
		size_t len, page_sz;

		ms = rte_fbarray_get(l_arr, start);
		start_va = ms->addr;
		page_sz = (size_t)primary_msl->page_sz;
		len = page_sz * diff_len;

		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
				start_va, len);
	}

	/* calculate how much we can advance until next chunk */
	diff_len = used ?
			rte_fbarray_find_contig_used(l_arr, start) :
			rte_fbarray_find_contig_free(l_arr, start);
	ret = RTE_MIN(chunk_len, diff_len);

	return ret;
}
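
/* walk the primary's fbarray and fix every place where the local map disagrees
 * with it for the given state (used or free), aggregating contiguous
 * differences into chunks handled by sync_chunk().
 */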
static int
sync_status(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx, bool used)
{
	struct rte_fbarray *l_arr, *p_arr;
	int p_idx, l_chunk_len, p_chunk_len, ret;
	int start, end;

	/* this is a little bit tricky, but the basic idea is - walk both lists
	 * and spot any places where there are discrepancies. walking both lists
	 * and noting discrepancies in a single go is a hard problem, so we do
	 * it in two passes - first we spot any places where allocated segments
	 * mismatch (i.e. ensure that everything that's allocated in the primary
	 * is also allocated in the secondary), and then we do it by looking at
	 * free segments instead.
	 *
	 * we also need to aggregate changes into chunks, as we have to call
	 * callbacks per allocation, not per page.
	 */
	l_arr = &local_msl->memseg_arr;
	p_arr = &primary_msl->memseg_arr;

	if (used)
		p_idx = rte_fbarray_find_next_used(p_arr, 0);
	else
		p_idx = rte_fbarray_find_next_free(p_arr, 0);

	while (p_idx >= 0) {
		int next_chunk_search_idx;

		if (used) {
			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
					p_idx);
		} else {
			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
					p_idx);
			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
					p_idx);
		}
		/* best case scenario - no differences (or bigger, which will be
		 * fixed during next iteration), look for next chunk
		 */
		if (l_chunk_len >= p_chunk_len) {
			next_chunk_search_idx = p_idx + p_chunk_len;
			goto next_chunk;
		}

		/* if both chunks start at the same point, skip parts we know
		 * are identical, and sync the rest. each call to sync_chunk
		 * will only sync contiguous segments, so we need to call this
		 * until we are sure there are no more differences in this
		 * chunk.
		 */
		start = p_idx + l_chunk_len;
		end = p_idx + p_chunk_len;
		do {
			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
					used, start, end);
			start += ret;
		} while (start < end && ret >= 0);
		/* if ret is negative, something went wrong */
		if (ret < 0)
			return -1;

		next_chunk_search_idx = p_idx + p_chunk_len;
next_chunk:
		/* skip to end of this chunk */
		if (used) {
			p_idx = rte_fbarray_find_next_used(p_arr,
					next_chunk_search_idx);
		} else {
			p_idx = rte_fbarray_find_next_free(p_arr,
					next_chunk_search_idx);
		}
	}
	return 0;
}
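
/* bring one local memseg list fully in sync with the primary's copy while
 * holding a blocking write lock on the hugepage directory, then adopt the
 * primary's version number.
 */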
static int
sync_existing(struct rte_memseg_list *primary_msl,
		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
		unsigned int msl_idx)
{
	int ret, dir_fd;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 */
	dir_fd = open(hi->hugedir, O_RDONLY);
	if (dir_fd < 0) {
		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		return -1;
	}
	/* blocking writelock */
	if (flock(dir_fd, LOCK_EX)) {
		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
			hi->hugedir, strerror(errno));
		close(dir_fd);
		return -1;
	}

	/* ensure all allocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
	if (ret < 0)
		goto fail;

	/* ensure all unallocated space is the same in both lists */
	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
	if (ret < 0)
		goto fail;

	/* update version number */
	local_msl->version = primary_msl->version;

	close(dir_fd);

	return 0;
fail:
	close(dir_fd);
	return -1;
}

static int
sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	struct hugepage_info *hi = NULL;
	unsigned int i;
	int msl_idx;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
		uint64_t cur_sz =
			internal_config.hugepage_info[i].hugepage_sz;
		uint64_t msl_sz = primary_msl->page_sz;

		if (msl_sz == cur_sz) {
			hi = &internal_config.hugepage_info[i];
			break;
		}
	}
	if (!hi) {
		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
		return -1;
	}

	/* if versions don't match, synchronize everything */
	if (local_msl->version != primary_msl->version &&
			sync_existing(primary_msl, local_msl, hi, msl_idx))
		return -1;
	return 0;
}

int
eal_memalloc_sync_with_primary(void)
{
	/* nothing to be done in primary */
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		return 0;

	/* memalloc is locked, so it's safe to call thread-unsafe version */
	if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
		return -1;
	return 0;
}
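
/* memseg list walk callback run in secondary processes: create a process-local
 * fbarray mirroring each of the primary's memseg lists.
 */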
static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *primary_msl, *local_msl;
	char name[PATH_MAX];
	int msl_idx, ret;

	msl_idx = msl - mcfg->memsegs;
	primary_msl = &mcfg->memsegs[msl_idx];
	local_msl = &local_memsegs[msl_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		primary_msl->memseg_arr.name, getpid());

	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
		primary_msl->memseg_arr.len,
		primary_msl->memseg_arr.elt_sz);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}
	local_msl->base_va = primary_msl->base_va;

	return 0;
}
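
/* memseg list walk callback: allocate and reset the per-segment lock fd
 * bookkeeping for one memseg list (used in single-file-segments mode).
 */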
static int
secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i, len;
	int msl_idx;
	int *data;

	msl_idx = msl - mcfg->memsegs;
	len = msl->memseg_arr.len;

	/* ensure we have space to store lock fd per each possible segment */
	data = malloc(sizeof(int) * len);
	if (data == NULL) {
		RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
		return -1;
	}
	/* set all fd's as invalid */
	for (i = 0; i < len; i++)
		data[i] = -1;

	lock_fds[msl_idx].fds = data;
	lock_fds[msl_idx].len = len;
	lock_fds[msl_idx].count = 0;
	lock_fds[msl_idx].memseg_list_fd = -1;

	return 0;
}

int
eal_memalloc_init(void)
{
	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
			return -1;

	/* initialize all of the lock fd lists */
	if (internal_config.single_file_segments)
		if (rte_memseg_list_walk(secondary_lock_list_create_walk,
				NULL))
			return -1;
	return 0;
}