1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
20 */
21
22
23 /// \cond internal
24
25 //
26 // Seastar memory allocator
27 //
28 // This is a share-nothing allocator (memory allocated on one cpu must
29 // be freed on the same cpu).
30 //
31 // Inspired by gperftools' tcmalloc.
32 //
33 // Memory map:
34 //
35 // 0x0000'sccc'vvvv'vvvv
36 //
37 // 0000 - required by architecture (only 48 bits of address space)
38 // s - chosen to satisfy system allocator (1-7)
39 // ccc - cpu number (0-12 bits; how many bits are allocated varies by system)
40 // v - virtual address within cpu (32-44 bits, according to how much ccc
41 // leaves us)
42 //
43 // Each page has a page structure that describes it. Within a cpu's
44 // memory pool, the page array starts at offset 0, describing all pages
45 // within that pool. Page 0 does not describe a valid page.
46 //
47 // Each pool can contain at most 2^32 pages (or 44 address bits), so we can
48 // use a 32-bit integer to identify a page.
49 //
50 // Runs of pages are organized into spans. Free spans are organized into lists,
51 // by size. When spans are broken up or coalesced, they may move into new lists.
52 // Spans have a size that is a power-of-two and are naturally aligned (aka buddy
53 // allocator)
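//
// Illustrative decoding sketch (not part of the build): with cpu_id_shift == 36
// as defined below, object_cpu_id() recovers the owning cpu from the raw
// pointer bits:
//
//     uintptr_t raw = reinterpret_cast<uintptr_t>(ptr);
//     unsigned cpu = (raw >> 36) & 0xff;   // the 'ccc' field above
//
// e.g. an address such as 0x0000'5030'0000'1000 (s == 5) decodes to cpu 3.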
54
55 #include <seastar/core/cacheline.hh>
56 #include <seastar/core/memory.hh>
57 #include <seastar/core/print.hh>
58 #include <seastar/util/alloc_failure_injector.hh>
59 #include <seastar/util/memory_diagnostics.hh>
60 #include <seastar/util/std-compat.hh>
61 #include <seastar/util/log.hh>
62 #include <seastar/core/aligned_buffer.hh>
63 #include <unordered_set>
64 #include <iostream>
65 #include <thread>
66
67 #include <dlfcn.h>
68
69 namespace seastar {
70
71 extern seastar::logger seastar_logger;
72
73 void* internal::allocate_aligned_buffer_impl(size_t size, size_t align) {
74 void *ret;
75 auto r = posix_memalign(&ret, align, size);
76 if (r == ENOMEM) {
77 throw std::bad_alloc();
78 } else if (r == EINVAL) {
79 throw std::runtime_error(format("Invalid alignment of {:d}; allocating {:d} bytes", align, size));
80 } else {
81 assert(r == 0);
82 return ret;
83 }
84 }
85
86 namespace memory {
87
88 // We always create the logger object for memory diagnostics, even
89 // in SEASTAR_DEFAULT_ALLOCATOR builds, though it only logs when the
90 // seastar allocator is enabled.
91 seastar::logger seastar_memory_logger("seastar_memory");
92
93 static thread_local int abort_on_alloc_failure_suppressed = 0;
94
95 disable_abort_on_alloc_failure_temporarily::disable_abort_on_alloc_failure_temporarily() {
96 ++abort_on_alloc_failure_suppressed;
97 }
98
99 disable_abort_on_alloc_failure_temporarily::~disable_abort_on_alloc_failure_temporarily() noexcept {
100 --abort_on_alloc_failure_suppressed;
101 }
102
103 static std::pmr::polymorphic_allocator<char> static_malloc_allocator{std::pmr::get_default_resource()};
104 std::pmr::polymorphic_allocator<char>* malloc_allocator{&static_malloc_allocator};
105
106 namespace internal {
107
108 #ifdef __cpp_constinit
109 #define SEASTAR_CONSTINIT constinit
110 #else
111 #define SEASTAR_CONSTINIT
112 #endif
113
114 #ifdef SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
115
116 #ifdef __cpp_constinit
117 thread_local constinit volatile int critical_alloc_section = 0;
118 #else
119 __thread volatile int critical_alloc_section = 0;
120 #endif
121
122 #endif // SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
123
124 } // namespace internal
125
126 }
127
128 }
129
130 #ifndef SEASTAR_DEFAULT_ALLOCATOR
131
132 #include <seastar/core/bitops.hh>
133 #include <seastar/core/align.hh>
134 #include <seastar/core/posix.hh>
135 #include <seastar/core/shared_ptr.hh>
136 #include <new>
137 #include <cstdint>
138 #include <algorithm>
139 #include <limits>
140 #include <cassert>
141 #include <atomic>
142 #include <mutex>
143 #include <seastar/util/std-compat.hh>
144 #include <functional>
145 #include <cstring>
146 #include <boost/intrusive/list.hpp>
147 #include <sys/mman.h>
148 #include <seastar/util/backtrace.hh>
149
150 #ifdef SEASTAR_HAVE_NUMA
151 #include <numaif.h>
152 #endif
153
154 namespace seastar {
155
156 struct allocation_site {
157 mutable size_t count = 0; // number of live objects allocated at backtrace.
158 mutable size_t size = 0; // amount of bytes in live objects allocated at backtrace.
159 mutable const allocation_site* next = nullptr;
160 saved_backtrace backtrace;
161
162 bool operator==(const allocation_site& o) const {
163 return backtrace == o.backtrace;
164 }
165
166 bool operator!=(const allocation_site& o) const {
167 return !(*this == o);
168 }
169 };
170
171 }
172
173 namespace std {
174
175 template<>
176 struct hash<seastar::allocation_site> {
177 size_t operator()(const seastar::allocation_site& bi) const {
178 return std::hash<seastar::saved_backtrace>()(bi.backtrace);
179 }
180 };
181
182 }
183
184 #if FMT_VERSION >= 90000
185 namespace seastar::memory {
186 struct human_readable_value;
187 }
188 template <> struct fmt::formatter<struct seastar::memory::human_readable_value> : fmt::ostream_formatter {};
189 #endif
190
191 namespace seastar {
192
193 using allocation_site_ptr = const allocation_site*;
194
195 namespace memory {
196
197 [[gnu::unused]]
198 static allocation_site_ptr get_allocation_site();
199
200 static void on_allocation_failure(size_t size);
201
202 static constexpr unsigned cpu_id_shift = 36; // FIXME: make dynamic
203 static constexpr unsigned max_cpus = 256;
204 static constexpr uintptr_t cpu_id_and_mem_base_mask = ~((uintptr_t(1) << cpu_id_shift) - 1);
205
206 using pageidx = uint32_t;
207
208 struct page;
209 class page_list;
210
211 static std::atomic<bool> live_cpus[max_cpus];
212
213 using std::optional;
214
215 // is_reactor_thread is set to true when memory::configure() is called.
216 // It identifies seastar reactor threads, so that allocations made on
217 // other (non-reactor) threads can be routed to the system memory allocator.
218 static thread_local bool is_reactor_thread = false;
219
220
221 namespace alloc_stats {
222
223 enum class types { allocs, frees, cross_cpu_frees, reclaims, large_allocs, failed_allocs,
224 foreign_mallocs, foreign_frees, foreign_cross_frees, enum_size };
225
226 using stats_array = std::array<uint64_t, static_cast<std::size_t>(types::enum_size)>;
227 using stats_atomic_array = std::array<std::atomic_uint64_t, static_cast<std::size_t>(types::enum_size)>;
228
229 static thread_local SEASTAR_CONSTINIT stats_array stats{};
230 std::array<stats_atomic_array, max_cpus> alien_stats{};
231
232 static void increment_local(types stat_type, uint64_t size = 1) {
233 stats[static_cast<std::size_t>(stat_type)] += size;
234 }
235
236 static void increment(types stat_type, uint64_t size=1)
237 {
238 // fast path: reactor threads use thread-local statistics
239 if (is_reactor_thread) {
240 increment_local(stat_type, size);
241 } else {
242 auto hash = std::hash<std::thread::id>()(std::this_thread::get_id());
243 auto i = static_cast<std::size_t>(stat_type);
244 alien_stats[hash % alien_stats.size()][i].fetch_add(size, std::memory_order_relaxed);
245 }
246 }
247
248 static uint64_t get(types stat_type)
249 {
250 auto i = static_cast<std::size_t>(stat_type);
251 // fast path: reactor threads use thread-local statistics
252 if (is_reactor_thread) {
253 return stats[i];
254 } else {
255 auto hash = std::hash<std::thread::id>()(std::this_thread::get_id());
256 return alien_stats[hash % alien_stats.size()][i].load();
257 }
258 }
259
260 }
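// Summary of the routing above (for reference): reactor threads bump the
// thread-local `stats` array directly, while non-reactor ("alien") threads
// hash their std::thread::id into one of the max_cpus buckets of alien_stats
// and use relaxed atomic adds, roughly:
//
//     auto bucket = std::hash<std::thread::id>()(std::this_thread::get_id()) % max_cpus;
//     alien_stats[bucket][i].fetch_add(size, std::memory_order_relaxed);
//
// Several alien threads may share a bucket, so alien counters are aggregates
// per bucket rather than per thread.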
261
262 // original memory allocator support
263 // note: allocations made before these initializers run would use the seastar allocator
264 using malloc_func_type = void * (*)(size_t);
265 using free_func_type = void * (*)(void *);
266 using realloc_func_type = void * (*)(void *, size_t);
267 using aligned_alloc_type = void * (*)(size_t alignment, size_t size);
268 using malloc_trim_type = int (*)(size_t);
269 using malloc_usable_size_type = size_t (*)(void *);
270
271 malloc_func_type original_malloc_func = reinterpret_cast<malloc_func_type>(dlsym(RTLD_NEXT, "malloc"));
272 free_func_type original_free_func = reinterpret_cast<free_func_type>(dlsym(RTLD_NEXT, "free"));
273 realloc_func_type original_realloc_func = reinterpret_cast<realloc_func_type>(dlsym(RTLD_NEXT, "realloc"));
274 aligned_alloc_type original_aligned_alloc_func = reinterpret_cast<aligned_alloc_type>(dlsym(RTLD_NEXT, "aligned_alloc"));
275 malloc_trim_type original_malloc_trim_func = reinterpret_cast<malloc_trim_type>(dlsym(RTLD_NEXT, "malloc_trim"));
276 malloc_usable_size_type original_malloc_usable_size_func = reinterpret_cast<malloc_usable_size_type>(dlsym(RTLD_NEXT, "malloc_usable_size"));
277
278 using allocate_system_memory_fn
279 = std::function<mmap_area (void* where, size_t how_much)>;
280
281 namespace bi = boost::intrusive;
282
283 static thread_local uintptr_t local_expected_cpu_id = std::numeric_limits<uintptr_t>::max();
284
285 inline
286 unsigned object_cpu_id(const void* ptr) {
287 return (reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff;
288 }
289
290 class page_list_link {
291 uint32_t _prev;
292 uint32_t _next;
293 friend class page_list;
294 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
295 };
296
297 constexpr size_t mem_base_alloc = size_t(1) << 44;
298
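// mem_base() lazily reserves the virtual address range shared by all cpus.
// Since mmap() cannot be asked for a specific alignment, it over-reserves
// twice mem_base_alloc with PROT_NONE, then munmap()s the unaligned head and
// tail, leaving a mem_base_alloc-sized reservation aligned to mem_base_alloc.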
299 static char* mem_base() {
300 static char* known;
301 static std::once_flag flag;
302 std::call_once(flag, [] {
303 auto r = ::mmap(NULL, 2 * mem_base_alloc,
304 PROT_NONE,
305 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
306 -1, 0);
307 if (r == MAP_FAILED) {
308 abort();
309 }
310 ::madvise(r, 2 * mem_base_alloc, MADV_DONTDUMP);
311 auto cr = reinterpret_cast<char*>(r);
312 known = align_up(cr, mem_base_alloc);
313 ::munmap(cr, known - cr);
314 ::munmap(known + mem_base_alloc, cr + 2 * mem_base_alloc - (known + mem_base_alloc));
315 });
316 return known;
317 }
318
319 bool is_seastar_memory(void * ptr)
320 {
321 auto begin = mem_base();
322 auto end = begin + mem_base_alloc;
323 return ptr >= begin && ptr < end;
324 }
325
326 constexpr bool is_page_aligned(size_t size) {
327 return (size & (page_size - 1)) == 0;
328 }
329
330 constexpr size_t next_page_aligned(size_t size) {
331 return (size + (page_size - 1)) & ~(page_size - 1);
332 }
333
334 class small_pool;
335
336 struct free_object {
337 free_object* next;
338 };
339
340 struct page {
341 bool free;
342 uint8_t offset_in_span;
343 uint16_t nr_small_alloc;
344 uint32_t span_size; // in pages, if we're the head or the tail
345 page_list_link link;
346 small_pool* pool; // if used in a small_pool
347 free_object* freelist;
348 #ifdef SEASTAR_HEAPPROF
349 allocation_site_ptr alloc_site; // for objects whose size is multiple of page size, valid for head only
350 #endif
351 };
352
353 class page_list {
354 uint32_t _front = 0;
355 uint32_t _back = 0;
356 public:
357 page& front(page* ary) { return ary[_front]; }
358 page& back(page* ary) { return ary[_back]; }
359 bool empty() const { return !_front; }
360 void erase(page* ary, page& span) {
361 if (span.link._next) {
362 ary[span.link._next].link._prev = span.link._prev;
363 } else {
364 _back = span.link._prev;
365 }
366 if (span.link._prev) {
367 ary[span.link._prev].link._next = span.link._next;
368 } else {
369 _front = span.link._next;
370 }
371 }
372 void push_front(page* ary, page& span) {
373 auto idx = &span - ary;
374 if (_front) {
375 ary[_front].link._prev = idx;
376 } else {
377 _back = idx;
378 }
379 span.link._next = _front;
380 span.link._prev = 0;
381 _front = idx;
382 }
383 void pop_front(page* ary) {
384 if (ary[_front].link._next) {
385 ary[ary[_front].link._next].link._prev = 0;
386 } else {
387 _back = 0;
388 }
389 _front = ary[_front].link._next;
390 }
391 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
392 };
393
394 class small_pool {
395 struct span_sizes {
396 uint8_t preferred;
397 uint8_t fallback;
398 };
399 unsigned _object_size;
400 span_sizes _span_sizes;
401 free_object* _free = nullptr;
402 size_t _free_count = 0;
403 unsigned _min_free;
404 unsigned _max_free;
405 unsigned _pages_in_use = 0;
406 page_list _span_list;
407 static constexpr unsigned idx_frac_bits = 2;
408 public:
409 explicit small_pool(unsigned object_size) noexcept;
410 ~small_pool();
411 void* allocate();
412 void deallocate(void* object);
413 unsigned object_size() const { return _object_size; }
414 bool objects_page_aligned() const { return is_page_aligned(_object_size); }
415 static constexpr unsigned size_to_idx(unsigned size);
416 static constexpr unsigned idx_to_size(unsigned idx);
417 allocation_site_ptr& alloc_site_holder(void* ptr);
418 private:
419 void add_more_objects();
420 void trim_free_list();
421 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
422 };
423
424 // example: index 0b0001'0011 (19) -> size (1 << 4) + (0b11 << (4 - 2)) = 28
425
426 constexpr unsigned
427 small_pool::idx_to_size(unsigned idx) {
428 size_t s = (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
429 << (idx >> idx_frac_bits))
430 >> idx_frac_bits;
431 // If size is larger than max_align_t, force it to be a multiple of
432 // max_align_t. Clang relies on this property to use aligned mov
433 // instructions (e.g. movaps)
434 //
435 // Note this function is used at initialization time only, so it doesn't
436 // need to be especially fast.
437 if (s > alignof(std::max_align_t)) {
438 s = align_up(s, alignof(std::max_align_t));
439 }
440 return s;
441 }
442
443 constexpr unsigned
444 small_pool::size_to_idx(unsigned size) {
445 return ((log2floor(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
446 + ((size - 1) >> (log2floor(size) - idx_frac_bits));
447 }
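// Worked example of the size-class encoding above (with idx_frac_bits == 2,
// each power-of-two range is split into four classes):
//
//     idx 28 -> 128 bytes    idx 29 -> 160 bytes
//     idx 30 -> 192 bytes    idx 31 -> 224 bytes
//     idx 32 -> 256 bytes
//
// so size_to_idx() maps any size in (128, 160] to idx 29, any size in
// (160, 192] to idx 30, and so on.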
448
449 class small_pool_array {
450 public:
451 static constexpr unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
452 private:
453 union u {
454 small_pool a[nr_small_pools];
455 u() {
456 for (unsigned i = 0; i < nr_small_pools; ++i) {
457 new (&a[i]) small_pool(small_pool::idx_to_size(i));
458 }
459 }
460 ~u() {
461 // cannot really call destructor, since other
462 // objects may be freed after we are gone.
463 }
464 } _u;
465 public:
466 small_pool& operator[](unsigned idx) { return _u.a[idx]; }
467 };
468
469 static constexpr size_t max_small_allocation
470 = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);
471
472 constexpr size_t object_size_with_alloc_site(size_t size) {
473 #ifdef SEASTAR_HEAPPROF
474 // For page-aligned sizes, allocation_site* lives in page::alloc_site, not with the object.
475 static_assert(is_page_aligned(max_small_allocation), "assuming that max_small_allocation is page aligned so that we"
476 " don't need to add allocation_site_ptr to objects of size close to it");
477 size_t next_page_aligned_size = next_page_aligned(size);
478 if (next_page_aligned_size - size > sizeof(allocation_site_ptr)) {
479 size += sizeof(allocation_site_ptr);
480 } else {
481 return next_page_aligned_size;
482 }
483 #endif
484 return size;
485 }
486
487 #ifdef SEASTAR_HEAPPROF
488 // Ensure that object_size_with_alloc_site() does not exceed max_small_allocation
489 static_assert(object_size_with_alloc_site(max_small_allocation) == max_small_allocation, "");
490 static_assert(object_size_with_alloc_site(max_small_allocation - 1) == max_small_allocation, "");
491 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) + 1) == max_small_allocation, "");
492 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr)) == max_small_allocation, "");
493 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 1) == max_small_allocation - 1, "");
494 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 2) == max_small_allocation - 2, "");
495 #endif
496
497 struct cross_cpu_free_item {
498 cross_cpu_free_item* next;
499 };
500
501 struct cpu_pages {
502 uint32_t min_free_pages = 20000000 / page_size;
503 char* memory;
504 page* pages;
505 uint32_t nr_pages;
506 uint32_t nr_free_pages;
507 uint32_t current_min_free_pages = 0;
508 size_t large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
509 unsigned cpu_id = -1U;
510 std::function<void (std::function<void ()>)> reclaim_hook;
511 std::vector<reclaimer*> reclaimers;
512 static constexpr unsigned nr_span_lists = 32;
513 page_list free_spans[nr_span_lists]; // contains aligned spans with span_size == 2^idx
514 small_pool_array small_pools;
515 alignas(seastar::cache_line_size) std::atomic<cross_cpu_free_item*> xcpu_freelist;
516 static std::atomic<unsigned> cpu_id_gen;
517 static cpu_pages* all_cpus[max_cpus];
518 union asu {
519 using alloc_sites_type = std::unordered_set<allocation_site>;
520 asu() : alloc_sites{} {
521 }
522 ~asu() {} // alloc_sites live forever
523 alloc_sites_type alloc_sites;
524 } asu;
525 allocation_site_ptr alloc_site_list_head = nullptr; // For easy traversal of asu.alloc_sites from scylla-gdb.py
526 bool collect_backtrace = false;
527 char* mem() { return memory; }
528
529 void link(page_list& list, page* span);
530 void unlink(page_list& list, page* span);
531 struct trim {
532 unsigned offset;
533 unsigned nr_pages;
534 };
535 void maybe_reclaim();
536 void* allocate_large_and_trim(unsigned nr_pages);
537 void* allocate_large(unsigned nr_pages);
538 void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
539 page* find_and_unlink_span(unsigned nr_pages);
540 page* find_and_unlink_span_reclaiming(unsigned n_pages);
541 void free_large(void* ptr);
542 bool grow_span(pageidx& start, uint32_t& nr_pages, unsigned idx);
543 void free_span(pageidx start, uint32_t nr_pages);
544 void free_span_no_merge(pageidx start, uint32_t nr_pages);
545 void free_span_unaligned(pageidx start, uint32_t nr_pages);
546 void* allocate_small(unsigned size);
547 void free(void* ptr);
548 void free(void* ptr, size_t size);
549 static bool try_foreign_free(void* ptr);
550 void shrink(void* ptr, size_t new_size);
551 static void free_cross_cpu(unsigned cpu_id, void* ptr);
552 bool drain_cross_cpu_freelist();
553 size_t object_size(void* ptr);
554 page* to_page(void* p) {
555 return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
556 }
557
558 bool is_initialized() const;
559 bool initialize();
560 reclaiming_result run_reclaimers(reclaimer_scope, size_t pages_to_reclaim);
561 void schedule_reclaim();
562 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
563 void set_min_free_pages(size_t pages);
564 void resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
565 void do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
566 void replace_memory_backing(allocate_system_memory_fn alloc_sys_mem);
567 void check_large_allocation(size_t size);
568 void warn_large_allocation(size_t size);
569 memory::memory_layout memory_layout();
570 ~cpu_pages();
571 };
572
573 static thread_local cpu_pages cpu_mem;
574 std::atomic<unsigned> cpu_pages::cpu_id_gen;
575 cpu_pages* cpu_pages::all_cpus[max_cpus];
576
577 static cpu_pages& get_cpu_mem();
578
579 #ifdef SEASTAR_HEAPPROF
580
581 void set_heap_profiling_enabled(bool enable) {
582 bool is_enabled = get_cpu_mem().collect_backtrace;
583 if (enable) {
584 if (!is_enabled) {
585 seastar_logger.info("Enabling heap profiler");
586 }
587 } else {
588 if (is_enabled) {
589 seastar_logger.info("Disabling heap profiler");
590 }
591 }
592 get_cpu_mem().collect_backtrace = enable;
593 }
594
595 static thread_local int64_t scoped_heap_profiling_embed_count = 0;
596
597 scoped_heap_profiling::scoped_heap_profiling() noexcept {
598 ++scoped_heap_profiling_embed_count;
599 set_heap_profiling_enabled(true);
600 }
601
602 scoped_heap_profiling::~scoped_heap_profiling() {
603 if (!--scoped_heap_profiling_embed_count) {
604 set_heap_profiling_enabled(false);
605 }
606 }
607
608 #else
609
610 void set_heap_profiling_enabled(bool enable) {
611 seastar_logger.warn("Seastar compiled without heap profiling support, heap profiler not supported;"
612 " compile with the Seastar_HEAP_PROFILING=ON CMake option to add heap profiling support");
613 }
614
615 scoped_heap_profiling::scoped_heap_profiling() noexcept {
616 set_heap_profiling_enabled(true); // let it print the warning
617 }
618
619 scoped_heap_profiling::~scoped_heap_profiling() {
620 }
621
622 #endif
623
624 // Smallest index i such that all spans stored in the index are >= pages.
625 static inline
626 unsigned index_of(unsigned pages) {
627 if (pages == 1) {
628 return 0;
629 }
630 return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages - 1);
631 }
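// For example: index_of(1) == 0, index_of(3) == 2 and index_of(4) == 2, so a
// request for 3 pages is served from free_spans[2], whose spans are all
// 4 pages long.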
632
633 void
634 cpu_pages::unlink(page_list& list, page* span) {
635 list.erase(pages, *span);
636 }
637
638 void
639 cpu_pages::link(page_list& list, page* span) {
640 list.push_front(pages, *span);
641 }
642
643 void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
644 assert(nr_pages);
645 nr_free_pages += nr_pages;
646 auto span = &pages[span_start];
647 auto span_end = &pages[span_start + nr_pages - 1];
648 span->free = span_end->free = true;
649 span->span_size = span_end->span_size = nr_pages;
650 auto idx = index_of(nr_pages);
651 link(free_spans[idx], span);
652 }
653
654 bool cpu_pages::grow_span(uint32_t& span_start, uint32_t& nr_pages, unsigned idx) {
655 auto which = (span_start >> idx) & 1; // 0=lower, 1=upper
656 // locate first page of upper buddy or last page of lower buddy
657 // examples: span_start = 0x10 nr_pages = 0x08 -> buddy = 0x18 (which = 0)
658 // span_start = 0x18 nr_pages = 0x08 -> buddy = 0x17 (which = 1)
659 // delta = which ? -1u : nr_pages
660 auto delta = ((which ^ 1) << idx) | -which;
661 auto buddy = span_start + delta;
662 if (pages[buddy].free && pages[buddy].span_size == nr_pages) {
663 unlink(free_spans[idx], &pages[span_start ^ nr_pages]);
664 nr_free_pages -= nr_pages; // free_span_no_merge() will restore
665 span_start &= ~nr_pages;
666 nr_pages *= 2;
667 return true;
668 }
669 return false;
670 }
671
672 void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
673 auto idx = index_of(nr_pages);
674 while (grow_span(span_start, nr_pages, idx)) {
675 ++idx;
676 }
677 free_span_no_merge(span_start, nr_pages);
678 }
679
680 // Internal, used during startup. Span is not aligned so needs to be broken up
681 void cpu_pages::free_span_unaligned(uint32_t span_start, uint32_t nr_pages) {
682 while (nr_pages) {
683 auto start_nr_bits = span_start ? count_trailing_zeros(span_start) : 32;
684 auto size_nr_bits = count_trailing_zeros(nr_pages);
685 auto now = 1u << std::min(start_nr_bits, size_nr_bits);
686 free_span(span_start, now);
687 span_start += now;
688 nr_pages -= now;
689 }
690 }
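// Illustrative trace of the loop above: free_span_unaligned(5, 11) frees the
// naturally aligned pieces [5, 6) (1 page), [6, 8) (2 pages) and [8, 16)
// (8 pages); each iteration takes the largest power of two permitted by both
// the current start alignment and the remaining page count.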
691
692 page*
693 cpu_pages::find_and_unlink_span(unsigned n_pages) {
694 auto idx = index_of(n_pages);
695 if (n_pages >= (2u << idx)) {
696 return nullptr;
697 }
698 while (idx < nr_span_lists && free_spans[idx].empty()) {
699 ++idx;
700 }
701 if (idx == nr_span_lists) {
702 if (initialize()) {
703 return find_and_unlink_span(n_pages);
704 }
705 return nullptr;
706 }
707 auto& list = free_spans[idx];
708 page* span = &list.front(pages);
709 unlink(list, span);
710 return span;
711 }
712
713 page*
714 cpu_pages::find_and_unlink_span_reclaiming(unsigned n_pages) {
715 while (true) {
716 auto span = find_and_unlink_span(n_pages);
717 if (span) {
718 return span;
719 }
720 if (run_reclaimers(reclaimer_scope::sync, n_pages) == reclaiming_result::reclaimed_nothing) {
721 return nullptr;
722 }
723 }
724 }
725
726 void cpu_pages::maybe_reclaim() {
727 if (nr_free_pages < current_min_free_pages) {
728 drain_cross_cpu_freelist();
729 if (nr_free_pages < current_min_free_pages) {
730 run_reclaimers(reclaimer_scope::sync, current_min_free_pages - nr_free_pages);
731 }
732 if (nr_free_pages < current_min_free_pages) {
733 schedule_reclaim();
734 }
735 }
736 }
737
738 void*
739 cpu_pages::allocate_large_and_trim(unsigned n_pages) {
740 // Avoid exercising the reclaimers for requests we'll not be able to satisfy
741 // nr_pages might be zero during startup, so check for that too
742 if (nr_pages && n_pages >= nr_pages) {
743 return nullptr;
744 }
745 page* span = find_and_unlink_span_reclaiming(n_pages);
746 if (!span) {
747 return nullptr;
748 }
749 auto span_size = span->span_size;
750 auto span_idx = span - pages;
751 nr_free_pages -= span->span_size;
752 while (span_size >= n_pages * 2) {
753 span_size /= 2;
754 auto other_span_idx = span_idx + span_size;
755 free_span_no_merge(other_span_idx, span_size);
756 }
757 auto span_end = &pages[span_idx + span_size - 1];
758 span->free = span_end->free = false;
759 span->span_size = span_end->span_size = span_size;
760 span->pool = nullptr;
761 #ifdef SEASTAR_HEAPPROF
762 auto alloc_site = get_allocation_site();
763 span->alloc_site = alloc_site;
764 if (alloc_site) {
765 ++alloc_site->count;
766 alloc_site->size += span->span_size * page_size;
767 }
768 #endif
769 maybe_reclaim();
770 return mem() + span_idx * page_size;
771 }
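// Illustrative trace of the trimming loop above: a 3-page request satisfied
// from a 16-page span returns the upper 8 pages and then the upper 4 pages
// to the free lists, and hands out the remaining 4-page span, the smallest
// power-of-two span that still covers the request.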
772
773 void
774 cpu_pages::warn_large_allocation(size_t size) {
775 alloc_stats::increment_local(alloc_stats::types::large_allocs);
776 seastar_memory_logger.warn("oversized allocation: {} bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at {}", size, current_backtrace());
777 large_allocation_warning_threshold *= 1.618; // prevent spam
778 }
779
780 void
781 inline
782 cpu_pages::check_large_allocation(size_t size) {
783 if (size >= large_allocation_warning_threshold) {
784 warn_large_allocation(size);
785 }
786 }
787
788 void*
789 cpu_pages::allocate_large(unsigned n_pages) {
790 check_large_allocation(n_pages * page_size);
791 return allocate_large_and_trim(n_pages);
792 }
793
794 void*
795 cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
796 check_large_allocation(n_pages * page_size);
797 // buddy allocation is always aligned
798 return allocate_large_and_trim(n_pages);
799 }
800
801 disable_backtrace_temporarily::disable_backtrace_temporarily() {
802 _old = get_cpu_mem().collect_backtrace;
803 get_cpu_mem().collect_backtrace = false;
804 }
805
806 disable_backtrace_temporarily::~disable_backtrace_temporarily() {
807 get_cpu_mem().collect_backtrace = _old;
808 }
809
810 static
811 saved_backtrace get_backtrace() noexcept {
812 disable_backtrace_temporarily dbt;
813 return current_backtrace();
814 }
815
816 static
817 allocation_site_ptr get_allocation_site() {
818 if (!cpu_mem.is_initialized() || !cpu_mem.collect_backtrace) {
819 return nullptr;
820 }
821 disable_backtrace_temporarily dbt;
822 allocation_site new_alloc_site;
823 new_alloc_site.backtrace = get_backtrace();
824 auto insert_result = cpu_mem.asu.alloc_sites.insert(std::move(new_alloc_site));
825 allocation_site_ptr alloc_site = &*insert_result.first;
826 if (insert_result.second) {
827 alloc_site->next = cpu_mem.alloc_site_list_head;
828 cpu_mem.alloc_site_list_head = alloc_site;
829 }
830 return alloc_site;
831 }
832
833 #ifdef SEASTAR_HEAPPROF
834
835 allocation_site_ptr&
836 small_pool::alloc_site_holder(void* ptr) {
837 if (objects_page_aligned()) {
838 return get_cpu_mem().to_page(ptr)->alloc_site;
839 } else {
840 return *reinterpret_cast<allocation_site_ptr*>(reinterpret_cast<char*>(ptr) + _object_size - sizeof(allocation_site_ptr));
841 }
842 }
843
844 #endif
845
846 void*
847 cpu_pages::allocate_small(unsigned size) {
848 auto idx = small_pool::size_to_idx(size);
849 auto& pool = small_pools[idx];
850 assert(size <= pool.object_size());
851 auto ptr = pool.allocate();
852 #ifdef SEASTAR_HEAPPROF
853 if (!ptr) {
854 return nullptr;
855 }
856 allocation_site_ptr alloc_site = get_allocation_site();
857 if (alloc_site) {
858 ++alloc_site->count;
859 alloc_site->size += pool.object_size();
860 }
861 new (&pool.alloc_site_holder(ptr)) allocation_site_ptr{alloc_site};
862 #endif
863 return ptr;
864 }
865
866 void cpu_pages::free_large(void* ptr) {
867 pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
868 page* span = &pages[idx];
869 #ifdef SEASTAR_HEAPPROF
870 auto alloc_site = span->alloc_site;
871 if (alloc_site) {
872 --alloc_site->count;
873 alloc_site->size -= span->span_size * page_size;
874 }
875 #endif
876 free_span(idx, span->span_size);
877 }
878
879 size_t cpu_pages::object_size(void* ptr) {
880 page* span = to_page(ptr);
881 if (span->pool) {
882 auto s = span->pool->object_size();
883 #ifdef SEASTAR_HEAPPROF
884 // We must not allow the object to be extended onto the allocation_site_ptr field.
885 if (!span->pool->objects_page_aligned()) {
886 s -= sizeof(allocation_site_ptr);
887 }
888 #endif
889 return s;
890 } else {
891 return size_t(span->span_size) * page_size;
892 }
893 }
894
895 void cpu_pages::free_cross_cpu(unsigned cpu_id, void* ptr) {
896 if (!live_cpus[cpu_id].load(std::memory_order_relaxed)) {
897 // Thread was destroyed; leak object
898 // should only happen for boost unit-tests.
899 return;
900 }
901 auto p = reinterpret_cast<cross_cpu_free_item*>(ptr);
902 auto& list = all_cpus[cpu_id]->xcpu_freelist;
903 auto old = list.load(std::memory_order_relaxed);
904 do {
905 p->next = old;
906 } while (!list.compare_exchange_weak(old, p, std::memory_order_release, std::memory_order_relaxed));
907 alloc_stats::increment(alloc_stats::types::cross_cpu_frees);
908 }
909
910 bool cpu_pages::drain_cross_cpu_freelist() {
911 if (!xcpu_freelist.load(std::memory_order_relaxed)) {
912 return false;
913 }
914 auto p = xcpu_freelist.exchange(nullptr, std::memory_order_acquire);
915 while (p) {
916 auto n = p->next;
917 alloc_stats::increment_local(alloc_stats::types::frees);
918 free(p);
919 p = n;
920 }
921 return true;
922 }
923
924 void cpu_pages::free(void* ptr) {
925 page* span = to_page(ptr);
926 if (span->pool) {
927 small_pool& pool = *span->pool;
928 #ifdef SEASTAR_HEAPPROF
929 allocation_site_ptr alloc_site = pool.alloc_site_holder(ptr);
930 if (alloc_site) {
931 --alloc_site->count;
932 alloc_site->size -= pool.object_size();
933 }
934 #endif
935 pool.deallocate(ptr);
936 } else {
937 free_large(ptr);
938 }
939 }
940
941 void cpu_pages::free(void* ptr, size_t size) {
942 // match the size adjustment done in allocate() so we hit the right pool
943 if (size <= sizeof(free_object)) {
944 size = sizeof(free_object);
945 }
946 if (size <= max_small_allocation) {
947 size = object_size_with_alloc_site(size);
948 auto pool = &small_pools[small_pool::size_to_idx(size)];
949 #ifdef SEASTAR_HEAPPROF
950 allocation_site_ptr alloc_site = pool->alloc_site_holder(ptr);
951 if (alloc_site) {
952 --alloc_site->count;
953 alloc_site->size -= pool->object_size();
954 }
955 #endif
956 pool->deallocate(ptr);
957 } else {
958 free_large(ptr);
959 }
960 }
961
962 bool
963 cpu_pages::try_foreign_free(void* ptr) {
964 // fast path for local free
965 if (__builtin_expect((reinterpret_cast<uintptr_t>(ptr) & cpu_id_and_mem_base_mask) == local_expected_cpu_id, true)) {
966 return false;
967 }
968 if (!is_seastar_memory(ptr)) {
969 if (is_reactor_thread) {
970 alloc_stats::increment_local(alloc_stats::types::foreign_cross_frees);
971 } else {
972 alloc_stats::increment(alloc_stats::types::foreign_frees);
973 }
974 original_free_func(ptr);
975 return true;
976 }
977 free_cross_cpu(object_cpu_id(ptr), ptr);
978 return true;
979 }
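// Note on the fast path above: local_expected_cpu_id holds this shard's cpu
// id already shifted into place and OR'ed with mem_base(), so a single mask
// and compare, roughly
//
//     (reinterpret_cast<uintptr_t>(ptr) & cpu_id_and_mem_base_mask) == local_expected_cpu_id
//
// answers "was this pointer allocated by this shard?"; it only matches
// pointers carved out of this cpu's slice of the reserved address range.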
980
981 void cpu_pages::shrink(void* ptr, size_t new_size) {
982 auto obj_cpu = object_cpu_id(ptr);
983 assert(obj_cpu == cpu_id);
984 page* span = to_page(ptr);
985 if (span->pool) {
986 return;
987 }
988 auto old_size_pages = span->span_size;
989 size_t new_size_pages = old_size_pages;
990 while (new_size_pages / 2 * page_size >= new_size) {
991 new_size_pages /= 2;
992 }
993 if (new_size_pages == old_size_pages) {
994 return;
995 }
996 #ifdef SEASTAR_HEAPPROF
997 auto alloc_site = span->alloc_site;
998 if (alloc_site) {
999 alloc_site->size -= span->span_size * page_size;
1000 alloc_site->size += new_size_pages * page_size;
1001 }
1002 #endif
1003 span->span_size = new_size_pages;
1004 span[new_size_pages - 1].free = false;
1005 span[new_size_pages - 1].span_size = new_size_pages;
1006 pageidx idx = span - pages;
1007 free_span_unaligned(idx + new_size_pages, old_size_pages - new_size_pages);
1008 }
1009
1010 cpu_pages::~cpu_pages() {
1011 if (is_initialized()) {
1012 live_cpus[cpu_id].store(false, std::memory_order_relaxed);
1013 }
1014 }
1015
1016 bool cpu_pages::is_initialized() const {
1017 return bool(nr_pages);
1018 }
1019
1020 bool cpu_pages::initialize() {
1021 if (is_initialized()) {
1022 return false;
1023 }
1024 cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
1025 local_expected_cpu_id = (static_cast<uint64_t>(cpu_id) << cpu_id_shift)
1026 | reinterpret_cast<uintptr_t>(mem_base());
1027 assert(cpu_id < max_cpus);
1028 all_cpus[cpu_id] = this;
1029 auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
1030 auto size = 32 << 20; // Small size for bootstrap
1031 auto r = ::mmap(base, size,
1032 PROT_READ | PROT_WRITE,
1033 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
1034 -1, 0);
1035 if (r == MAP_FAILED) {
1036 abort();
1037 }
1038 ::madvise(base, size, MADV_HUGEPAGE);
1039 pages = reinterpret_cast<page*>(base);
1040 memory = base;
1041 nr_pages = size / page_size;
1042 // we reserve the end page so we don't have to special case
1043 // the last span.
1044 auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
1045 reserved = 1u << log2ceil(reserved);
1046 for (pageidx i = 0; i < reserved; ++i) {
1047 pages[i].free = false;
1048 }
1049 pages[nr_pages].free = false;
1050 free_span_unaligned(reserved, nr_pages - reserved);
1051 live_cpus[cpu_id].store(true, std::memory_order_relaxed);
1052 return true;
1053 }
1054
1055 mmap_area
1056 static allocate_anonymous_memory(void* where, size_t how_much) {
1057 return mmap_anonymous(where,
1058 how_much,
1059 PROT_READ | PROT_WRITE,
1060 MAP_PRIVATE | MAP_FIXED);
1061 }
1062
1063 mmap_area
1064 allocate_hugetlbfs_memory(file_desc& fd, void* where, size_t how_much) {
1065 auto pos = fd.size();
1066 fd.truncate(pos + how_much);
1067 auto ret = fd.map(
1068 how_much,
1069 PROT_READ | PROT_WRITE,
1070 MAP_SHARED | MAP_POPULATE | (where ? MAP_FIXED : 0),
1071 pos,
1072 where);
1073 return ret;
1074 }
1075
1076 void cpu_pages::replace_memory_backing(allocate_system_memory_fn alloc_sys_mem) {
1077 // We would like to use ::mremap() to atomically replace the old anonymous
1078 // memory with hugetlbfs backed memory, but mremap() does not support hugetlbfs
1079 // (for no reason at all). So we must copy the anonymous memory to some other
1080 // place, map hugetlbfs in place, and copy it back, without modifying it during
1081 // the operation.
1082 auto bytes = nr_pages * page_size;
1083 auto old_mem = mem();
1084 auto relocated_old_mem = mmap_anonymous(nullptr, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE);
1085 std::memcpy(relocated_old_mem.get(), old_mem, bytes);
1086 alloc_sys_mem(old_mem, bytes).release();
1087 std::memcpy(old_mem, relocated_old_mem.get(), bytes);
1088 }
1089
1090 void cpu_pages::do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem) {
1091 auto new_pages = new_size / page_size;
1092 if (new_pages <= nr_pages) {
1093 return;
1094 }
1095 auto old_size = nr_pages * page_size;
1096 auto mmap_start = memory + old_size;
1097 auto mmap_size = new_size - old_size;
1098 auto mem = alloc_sys_mem(mmap_start, mmap_size);
1099 mem.release();
1100 ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
1101 // one past last page structure is a sentinel
1102 auto new_page_array_pages = align_up(sizeof(page[new_pages + 1]), page_size) / page_size;
1103 auto new_page_array
1104 = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
1105 if (!new_page_array) {
1106 throw std::bad_alloc();
1107 }
1108 std::copy(pages, pages + nr_pages, new_page_array);
1109 // mark new one-past-last page as taken to avoid boundary conditions
1110 new_page_array[new_pages].free = false;
1111 auto old_pages = reinterpret_cast<char*>(pages);
1112 auto old_nr_pages = nr_pages;
1113 auto old_pages_size = align_up(sizeof(page[nr_pages + 1]), page_size);
1114 old_pages_size = size_t(1) << log2ceil(old_pages_size);
1115 pages = new_page_array;
1116 nr_pages = new_pages;
1117 auto old_pages_start = (old_pages - memory) / page_size;
1118 if (old_pages_start == 0) {
1119 // keep page 0 allocated
1120 old_pages_start = 1;
1121 old_pages_size -= page_size;
1122 }
1123 if (old_pages_size != 0) {
1124 free_span_unaligned(old_pages_start, old_pages_size / page_size);
1125 }
1126 free_span_unaligned(old_nr_pages, new_pages - old_nr_pages);
1127 }
1128
1129 void cpu_pages::resize(size_t new_size, allocate_system_memory_fn alloc_memory) {
1130 new_size = align_down(new_size, huge_page_size);
1131 while (nr_pages * page_size < new_size) {
1132 // don't reallocate all at once, since there might not
1133 // be enough free memory available to relocate the pages array
1134 auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
1135 do_resize(tmp_size, alloc_memory);
1136 }
1137 }
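// Illustrative example of the growth loop above: starting from the 32 MiB
// bootstrap mapping, resizing to 4 GiB proceeds in at-most-4x steps
// (32 MiB, 128 MiB, 512 MiB, 2 GiB, 4 GiB), so the enlarged pages[] array can
// always be relocated into memory that already exists.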
1138
1139 reclaiming_result cpu_pages::run_reclaimers(reclaimer_scope scope, size_t n_pages) {
1140 auto target = std::max<size_t>(nr_free_pages + n_pages, min_free_pages);
1141 reclaiming_result result = reclaiming_result::reclaimed_nothing;
1142 while (nr_free_pages < target) {
1143 bool made_progress = false;
1144 alloc_stats::increment_local(alloc_stats::types::reclaims);
1145 for (auto&& r : reclaimers) {
1146 if (r->scope() >= scope) {
1147 made_progress |= r->do_reclaim((target - nr_free_pages) * page_size) == reclaiming_result::reclaimed_something;
1148 }
1149 }
1150 if (!made_progress) {
1151 return result;
1152 }
1153 result = reclaiming_result::reclaimed_something;
1154 }
1155 return result;
1156 }
1157
1158 void cpu_pages::schedule_reclaim() {
1159 current_min_free_pages = 0;
1160 reclaim_hook([this] {
1161 if (nr_free_pages < min_free_pages) {
1162 try {
1163 run_reclaimers(reclaimer_scope::async, min_free_pages - nr_free_pages);
1164 } catch (...) {
1165 current_min_free_pages = min_free_pages;
1166 throw;
1167 }
1168 }
1169 current_min_free_pages = min_free_pages;
1170 });
1171 }
1172
1173 memory::memory_layout cpu_pages::memory_layout() {
1174 assert(is_initialized());
1175 return {
1176 reinterpret_cast<uintptr_t>(memory),
1177 reinterpret_cast<uintptr_t>(memory) + nr_pages * page_size
1178 };
1179 }
1180
1181 void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1182 reclaim_hook = hook;
1183 current_min_free_pages = min_free_pages;
1184 }
1185
1186 void cpu_pages::set_min_free_pages(size_t pages) {
1187 if (pages > std::numeric_limits<decltype(min_free_pages)>::max()) {
1188 throw std::runtime_error("Number of pages too large");
1189 }
1190 min_free_pages = pages;
1191 maybe_reclaim();
1192 }
1193
1194 small_pool::small_pool(unsigned object_size) noexcept
1195 : _object_size(object_size) {
1196 unsigned span_size = 1;
1197 auto span_bytes = [&] { return span_size * page_size; };
1198 auto waste = [&] { return (span_bytes() % _object_size) / (1.0 * span_bytes()); };
1199 while (object_size > span_bytes()) {
1200 ++span_size;
1201 }
1202 _span_sizes.fallback = span_size;
1203
1204 // Choose a preferred span size which keeps waste (internal fragmentation) below
1205 // 5% and fits at least 4 objects. If there is no span size (up to 32 pages) that
1206 // satisfies this, just go with the minimum waste out of the checked span sizes.
1207 float min_waste = std::numeric_limits<float>::max();
1208 unsigned min_waste_span_size = 0;
1209 for (span_size = 1; span_size <= 32; span_size *= 2) {
1210 if (span_bytes() / object_size >= 4) {
1211 auto w = waste();
1212 if (w < min_waste) {
1213 min_waste = w;
1214 min_waste_span_size = span_size;
1215 if (w < 0.05) {
1216 break;
1217 }
1218 }
1219 }
1220 }
1221 _span_sizes.preferred = min_waste_span_size ? min_waste_span_size : _span_sizes.fallback;
1222
1223 _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
1224 _min_free = _max_free / 2;
1225 }
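// Illustrative example of the span-size choice above, for a 1536-byte pool:
// a 1-page span fits only 2 objects (fewer than 4), 2-page and 4-page spans
// both waste 6.25% (more than 5%), and an 8-page span fits 21 objects while
// wasting 512/32768 (about 1.6%), so preferred == 8 pages; fallback == 1 page,
// the smallest span that can hold a single object.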
1226
1227 small_pool::~small_pool() {
1228 _min_free = _max_free = 0;
1229 trim_free_list();
1230 }
1231
1232 // Should not throw when running out of memory, to avoid infinite recursion,
1233 // because throwing std::bad_alloc requires allocation. __cxa_allocate_exception
1234 // falls back to the emergency pool in case malloc() returns nullptr.
1235 void*
1236 small_pool::allocate() {
1237 if (!_free) {
1238 add_more_objects();
1239 }
1240 if (!_free) {
1241 return nullptr;
1242 }
1243 auto* obj = _free;
1244 _free = _free->next;
1245 --_free_count;
1246 return obj;
1247 }
1248
1249 void
1250 small_pool::deallocate(void* object) {
1251 auto o = reinterpret_cast<free_object*>(object);
1252 o->next = _free;
1253 _free = o;
1254 ++_free_count;
1255 if (_free_count >= _max_free) {
1256 trim_free_list();
1257 }
1258 }
1259
1260 void
1261 small_pool::add_more_objects() {
1262 auto goal = (_min_free + _max_free) / 2;
1263 while (!_span_list.empty() && _free_count < goal) {
1264 page& span = _span_list.front(get_cpu_mem().pages);
1265 _span_list.pop_front(get_cpu_mem().pages);
1266 while (span.freelist) {
1267 auto obj = span.freelist;
1268 span.freelist = span.freelist->next;
1269 obj->next = _free;
1270 _free = obj;
1271 ++_free_count;
1272 ++span.nr_small_alloc;
1273 }
1274 }
1275 while (_free_count < goal) {
1276 disable_backtrace_temporarily dbt;
1277 auto span_size = _span_sizes.preferred;
1278 auto data = reinterpret_cast<char*>(get_cpu_mem().allocate_large(span_size));
1279 if (!data) {
1280 span_size = _span_sizes.fallback;
1281 data = reinterpret_cast<char*>(get_cpu_mem().allocate_large(span_size));
1282 if (!data) {
1283 return;
1284 }
1285 }
1286 auto span = get_cpu_mem().to_page(data);
1287 span_size = span->span_size;
1288 _pages_in_use += span_size;
1289 for (unsigned i = 0; i < span_size; ++i) {
1290 span[i].offset_in_span = i;
1291 span[i].pool = this;
1292 }
1293 span->nr_small_alloc = 0;
1294 span->freelist = nullptr;
1295 for (unsigned offset = 0; offset <= span_size * page_size - _object_size; offset += _object_size) {
1296 auto h = reinterpret_cast<free_object*>(data + offset);
1297 h->next = _free;
1298 _free = h;
1299 ++_free_count;
1300 ++span->nr_small_alloc;
1301 }
1302 }
1303 }
1304
1305 void
1306 small_pool::trim_free_list() {
1307 auto goal = (_min_free + _max_free) / 2;
1308 while (_free && _free_count > goal) {
1309 auto obj = _free;
1310 _free = _free->next;
1311 --_free_count;
1312 page* span = get_cpu_mem().to_page(obj);
1313 span -= span->offset_in_span;
1314 if (!span->freelist) {
1315 new (&span->link) page_list_link();
1316 _span_list.push_front(get_cpu_mem().pages, *span);
1317 }
1318 obj->next = span->freelist;
1319 span->freelist = obj;
1320 if (--span->nr_small_alloc == 0) {
1321 _pages_in_use -= span->span_size;
1322 _span_list.erase(get_cpu_mem().pages, *span);
1323 get_cpu_mem().free_span(span - get_cpu_mem().pages, span->span_size);
1324 }
1325 }
1326 }
1327
1328 void
1329 abort_on_underflow(size_t size) {
1330 if (std::make_signed_t<size_t>(size) < 0) {
1331 // probably a logic error, stop hard
1332 abort();
1333 }
1334 }
1335
1336 void* allocate_large(size_t size) {
1337 abort_on_underflow(size);
1338 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1339 if ((size_t(size_in_pages) << page_bits) < size) {
1340 return nullptr; // (size + page_size - 1) caused an overflow
1341 }
1342 return get_cpu_mem().allocate_large(size_in_pages);
1343
1344 }
1345
1346 void* allocate_large_aligned(size_t align, size_t size) {
1347 abort_on_underflow(size);
1348 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1349 unsigned align_in_pages = std::max(align, page_size) >> page_bits;
1350 return get_cpu_mem().allocate_large_aligned(align_in_pages, size_in_pages);
1351 }
1352
1353 void free_large(void* ptr) {
1354 return get_cpu_mem().free_large(ptr);
1355 }
1356
1357 size_t object_size(void* ptr) {
1358 return cpu_pages::all_cpus[object_cpu_id(ptr)]->object_size(ptr);
1359 }
1360
1361 static thread_local cpu_pages* cpu_mem_ptr = nullptr;
1362
1363 // Mark as cold so that GCC8+ can move to .text.unlikely.
1364 [[gnu::cold]]
1365 static void init_cpu_mem() {
1366 cpu_mem_ptr = &cpu_mem;
1367 cpu_mem.initialize();
1368 }
1369
1370 [[gnu::always_inline]]
1371 static inline cpu_pages& get_cpu_mem()
1372 {
1373 // cpu_pages has a non-trivial constructor which means that the compiler
1374 // must make sure the instance local to the current thread has been
1375 // constructed before each access. So instead we access cpu_mem_ptr
1376 // which has been initialized by calls to init_cpu_mem() before it is
1377 // accessed.
1378 return *cpu_mem_ptr;
1379 }
1380
1381 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1382 static constexpr int debug_allocation_pattern = 0xab;
1383 #endif
1384
1385 void* allocate(size_t size) {
1386 if (!is_reactor_thread) {
1387 if (original_malloc_func) {
1388 alloc_stats::increment(alloc_stats::types::foreign_mallocs);
1389 return original_malloc_func(size);
1390 }
1391 // original_malloc_func might be null for allocations made before main(),
1392 // i.e. in constructors that run before original_malloc_func is initialized
1393 init_cpu_mem();
1394 }
1395 if (size <= sizeof(free_object)) {
1396 size = sizeof(free_object);
1397 }
1398 void* ptr;
1399 if (size <= max_small_allocation) {
1400 size = object_size_with_alloc_site(size);
1401 ptr = get_cpu_mem().allocate_small(size);
1402 } else {
1403 ptr = allocate_large(size);
1404 }
1405 if (!ptr) {
1406 on_allocation_failure(size);
1407 } else {
1408 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1409 std::memset(ptr, debug_allocation_pattern, size);
1410 #endif
1411 }
1412 alloc_stats::increment_local(alloc_stats::types::allocs);
1413 return ptr;
1414 }
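// Illustrative usage sketch (not part of this file): on a reactor thread
//
//     void* p = allocate(100);   // rounded up to the 112-byte small pool
//     free(p);                   // returned to the same pool's free list
//
// while the same call made on a non-reactor thread is forwarded to the libc
// malloc() captured in original_malloc_func at startup.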
1415
1416 void* allocate_aligned(size_t align, size_t size) {
1417 if (!is_reactor_thread) {
1418 if (original_aligned_alloc_func) {
1419 alloc_stats::increment(alloc_stats::types::foreign_mallocs);
1420 return original_aligned_alloc_func(align, size);
1421 }
1422 // original_aligned_alloc_func might be null for allocations made before main(),
1423 // i.e. in constructors that run before it is initialized
1424 init_cpu_mem();
1425 }
1426 if (size <= sizeof(free_object)) {
1427 size = std::max(sizeof(free_object), align);
1428 }
1429 void* ptr;
1430 if (size <= max_small_allocation && align <= page_size) {
1431 // Our small allocator only guarantees alignment for power-of-two
1432 // allocations which are not larger than a page.
1433 size = 1 << log2ceil(object_size_with_alloc_site(size));
1434 ptr = get_cpu_mem().allocate_small(size);
1435 } else {
1436 ptr = allocate_large_aligned(align, size);
1437 }
1438 if (!ptr) {
1439 on_allocation_failure(size);
1440 } else {
1441 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1442 std::memset(ptr, debug_allocation_pattern, size);
1443 #endif
1444 }
1445 alloc_stats::increment_local(alloc_stats::types::allocs);
1446 return ptr;
1447 }
1448
1449 void free(void* obj) {
1450 if (cpu_pages::try_foreign_free(obj)) {
1451 return;
1452 }
1453 alloc_stats::increment_local(alloc_stats::types::frees);
1454 get_cpu_mem().free(obj);
1455 }
1456
1457 void free(void* obj, size_t size) {
1458 if (cpu_pages::try_foreign_free(obj)) {
1459 return;
1460 }
1461 alloc_stats::increment_local(alloc_stats::types::frees);
1462 get_cpu_mem().free(obj, size);
1463 }
1464
1465 void free_aligned(void* obj, size_t align, size_t size) {
1466 if (size <= sizeof(free_object)) {
1467 size = sizeof(free_object);
1468 }
1469 if (size <= max_small_allocation && align <= page_size) {
1470 // Same adjustment as allocate_aligned()
1471 size = 1 << log2ceil(object_size_with_alloc_site(size));
1472 }
1473 free(obj, size);
1474 }
1475
1476 void shrink(void* obj, size_t new_size) {
1477 alloc_stats::increment_local(alloc_stats::types::frees);
1478 alloc_stats::increment_local(alloc_stats::types::allocs); // keep them balanced
1479 get_cpu_mem().shrink(obj, new_size);
1480 }
1481
1482 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1483 get_cpu_mem().set_reclaim_hook(hook);
1484 }
1485
1486 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope scope)
1487 : reclaimer([reclaim = std::move(reclaim)] (request) {
1488 return reclaim();
1489 }, scope) {
1490 }
1491
1492 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope scope)
1493 : _reclaim(std::move(reclaim))
1494 , _scope(scope) {
1495 get_cpu_mem().reclaimers.push_back(this);
1496 }
1497
1498 reclaimer::~reclaimer() {
1499 auto& r = get_cpu_mem().reclaimers;
1500 r.erase(std::find(r.begin(), r.end(), this));
1501 }
1502
1503 void set_large_allocation_warning_threshold(size_t threshold) {
1504 get_cpu_mem().large_allocation_warning_threshold = threshold;
1505 }
1506
1507 size_t get_large_allocation_warning_threshold() {
1508 return get_cpu_mem().large_allocation_warning_threshold;
1509 }
1510
1511 void disable_large_allocation_warning() {
1512 get_cpu_mem().large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
1513 }
1514
1515 void configure(std::vector<resource::memory> m, bool mbind,
1516 optional<std::string> hugetlbfs_path) {
1517 // we need to make sure cpu_mem is initialized since configure calls cpu_mem.resize
1518 // and we might reach configure without ever allocating, hence without ever calling
1519 // cpu_pages::initialize.
1520 // The correct solution is to add a condition inside cpu_mem.resize, but since all
1521 // other paths to cpu_pages::resize are already verifying initialize was called, we
1522 // verify that here.
1523 init_cpu_mem();
1524 is_reactor_thread = true;
1525 size_t total = 0;
1526 for (auto&& x : m) {
1527 total += x.bytes;
1528 }
1529 allocate_system_memory_fn sys_alloc = allocate_anonymous_memory;
1530 if (hugetlbfs_path) {
1531 // std::function is copyable, but file_desc is not, so we must use
1532 // an lw_shared_ptr to allow sys_alloc to be copied around
1533 auto fdp = make_lw_shared<file_desc>(file_desc::temporary(*hugetlbfs_path));
1534 sys_alloc = [fdp] (void* where, size_t how_much) {
1535 return allocate_hugetlbfs_memory(*fdp, where, how_much);
1536 };
1537 get_cpu_mem().replace_memory_backing(sys_alloc);
1538 }
1539 get_cpu_mem().resize(total, sys_alloc);
1540 size_t pos = 0;
1541 for (auto&& x : m) {
1542 #ifdef SEASTAR_HAVE_NUMA
1543 unsigned long nodemask = 1UL << x.nodeid;
1544 if (mbind) {
1545 auto r = ::mbind(get_cpu_mem().mem() + pos, x.bytes,
1546 MPOL_PREFERRED,
1547 &nodemask, std::numeric_limits<unsigned long>::digits,
1548 MPOL_MF_MOVE);
1549
1550 if (r == -1) {
1551 char err[1000] = {};
1552 strerror_r(errno, err, sizeof(err));
1553 std::cerr << "WARNING: unable to mbind shard memory; performance may suffer: "
1554 << err << std::endl;
1555 }
1556 }
1557 #endif
1558 pos += x.bytes;
1559 }
1560 }
1561
1562 statistics stats() {
1563 return statistics{alloc_stats::get(alloc_stats::types::allocs), alloc_stats::get(alloc_stats::types::frees), alloc_stats::get(alloc_stats::types::cross_cpu_frees),
1564 cpu_mem.nr_pages * page_size, cpu_mem.nr_free_pages * page_size, alloc_stats::get(alloc_stats::types::reclaims), alloc_stats::get(alloc_stats::types::large_allocs),
1565 alloc_stats::get(alloc_stats::types::failed_allocs), alloc_stats::get(alloc_stats::types::foreign_mallocs), alloc_stats::get(alloc_stats::types::foreign_frees),
1566 alloc_stats::get(alloc_stats::types::foreign_cross_frees)};
1567 }
1568
1569 size_t free_memory() {
1570 return get_cpu_mem().nr_free_pages * page_size;
1571 }
1572
1573 bool drain_cross_cpu_freelist() {
1574 return get_cpu_mem().drain_cross_cpu_freelist();
1575 }
1576
1577 memory_layout get_memory_layout() {
1578 return get_cpu_mem().memory_layout();
1579 }
1580
1581 size_t min_free_memory() {
1582 return get_cpu_mem().min_free_pages * page_size;
1583 }
1584
1585 void set_min_free_pages(size_t pages) {
1586 get_cpu_mem().set_min_free_pages(pages);
1587 }
1588
1589 static thread_local int report_on_alloc_failure_suppressed = 0;
1590
1591 class disable_report_on_alloc_failure_temporarily {
1592 public:
1593 disable_report_on_alloc_failure_temporarily() {
1594 ++report_on_alloc_failure_suppressed;
1595 };
1596 ~disable_report_on_alloc_failure_temporarily() noexcept {
1597 --report_on_alloc_failure_suppressed;
1598 }
1599 };
1600
1601 static std::atomic<bool> abort_on_allocation_failure{false};
1602 static std::atomic<alloc_failure_kind> dump_diagnostics_on_alloc_failure_kind{alloc_failure_kind::critical};
1603
1604 void enable_abort_on_allocation_failure() {
1605 abort_on_allocation_failure.store(true, std::memory_order_seq_cst);
1606 }
1607
1608 void set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind kind) {
1609 dump_diagnostics_on_alloc_failure_kind.store(kind, std::memory_order_seq_cst);
1610 }
1611
1612 void set_dump_memory_diagnostics_on_alloc_failure_kind(std::string_view str) {
1613 if (str == "none") {
1614 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::none);
1615 } else if (str == "critical") {
1616 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::critical);
1617 } else if (str == "all") {
1618 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::all);
1619 } else {
1620 seastar_logger.error("Ignoring invalid option '{}' for the allocation failure kind to dump seastar memory diagnostics for, valid options are: none, critical and all", str);
1621 }
1622 }
1623
1624 static thread_local noncopyable_function<void(memory_diagnostics_writer)> additional_diagnostics_producer;
1625
1626 void set_additional_diagnostics_producer(noncopyable_function<void(memory_diagnostics_writer)> producer) {
1627 additional_diagnostics_producer = std::move(producer);
1628 }
1629
1630 struct human_readable_value {
1631 uint16_t value; // [0, 1024)
1632 char suffix; // 0 -> no suffix
1633 };
1634
1635 std::ostream& operator<<(std::ostream& os, const human_readable_value& val) {
1636 os << val.value;
1637 if (val.suffix) {
1638 os << val.suffix;
1639 }
1640 return os;
1641 }
1642
1643 static human_readable_value to_human_readable_value(uint64_t value, uint64_t step, uint64_t precision, const std::array<char, 5>& suffixes) {
1644 if (!value) {
1645 return {0, suffixes[0]};
1646 }
1647
1648 uint64_t result = value;
1649 uint64_t remainder = 0;
1650 unsigned i = 0;
1651 // If there is no remainder we go below precision because we don't lose any.
1652 while (((!remainder && result >= step) || result >= precision)) {
1653 remainder = result % step;
1654 result /= step;
1655 if (i == suffixes.size()) {
1656 break;
1657 } else {
1658 ++i;
1659 }
1660 }
1661 return {uint16_t(remainder < (step / 2) ? result : result + 1), suffixes[i]};
1662 }
1663
1664 static human_readable_value to_hr_size(uint64_t size) {
1665 const std::array<char, 5> suffixes = {'B', 'K', 'M', 'G', 'T'};
1666 return to_human_readable_value(size, 1024, 8192, suffixes);
1667 }
1668
1669 static human_readable_value to_hr_number(uint64_t number) {
1670 const std::array<char, 5> suffixes = {'\0', 'k', 'm', 'b', 't'};
1671 return to_human_readable_value(number, 1000, 10000, suffixes);
1672 }
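// Worked examples for the helpers above: to_hr_size(8192) renders as "8K",
// to_hr_size(10485760) as "10M", and to_hr_number(12345) as "12k" (the 345
// remainder is below step / 2, so the result rounds down).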
1673
1674 seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator it) {
1675 auto free_mem = get_cpu_mem().nr_free_pages * page_size;
1676 auto total_mem = get_cpu_mem().nr_pages * page_size;
1677 it = fmt::format_to(it, "Dumping seastar memory diagnostics\n");
1678
1679 it = fmt::format_to(it, "Used memory: {}\n", to_hr_size(total_mem - free_mem));
1680 it = fmt::format_to(it, "Free memory: {}\n", to_hr_size(free_mem));
1681 it = fmt::format_to(it, "Total memory: {}\n", to_hr_size(total_mem));
1682 it = fmt::format_to(it, "Hard failures: {}\n\n", alloc_stats::get(alloc_stats::types::failed_allocs));
1683
1684 if (additional_diagnostics_producer) {
1685 additional_diagnostics_producer([&it] (std::string_view v) mutable {
1686 #if FMT_VERSION >= 80000
1687 it = fmt::format_to(it, fmt::runtime(v));
1688 #else
1689 it = fmt::format_to(it, v);
1690 #endif
1691 });
1692 }
1693
1694 it = fmt::format_to(it, "Small pools:\n");
1695 it = fmt::format_to(it, "objsz spansz usedobj memory unused wst%\n");
1696 for (unsigned i = 0; i < get_cpu_mem().small_pools.nr_small_pools; i++) {
1697 auto& sp = get_cpu_mem().small_pools[i];
1698 // We don't use pools too small to fit a free_object, so skip them;
1699 // they are always empty.
1700 if (sp.object_size() < sizeof(free_object)) {
1701 continue;
1702 }
1703
1704 // For the small pools, there are two types of free objects:
1705 // Pool freelist objects are pointed to by sp._free and their count is sp._free_count.
1706 // Span freelist objects are those removed from the pool freelist when that list
1707 // becomes too large: they are instead attached to the spans allocated to this
1708 // pool. To count this second category, we iterate over the spans below.
1709 uint32_t span_freelist_objs = 0;
1710 auto front = sp._span_list._front;
1711 while (front) {
1712 auto& span = get_cpu_mem().pages[front];
1713 auto capacity_in_objects = span.span_size * page_size / sp.object_size();
1714 span_freelist_objs += capacity_in_objects - span.nr_small_alloc;
1715 front = span.link._next;
1716 }
1717 const auto free_objs = sp._free_count + span_freelist_objs; // pool + span free objects
1718 const auto use_count = sp._pages_in_use * page_size / sp.object_size() - free_objs;
1719 auto memory = sp._pages_in_use * page_size;
1720 const auto unused = free_objs * sp.object_size();
1721 const auto wasted_percent = memory ? unused * 100 / memory : 0;
1722 it = fmt::format_to(it,
1723 "{:>5} {:>5} {:>5} {:>5} {:>5} {:>4}\n",
1724 sp.object_size(),
1725 to_hr_size(sp._span_sizes.preferred * page_size),
1726 to_hr_number(use_count),
1727 to_hr_size(memory),
1728 to_hr_size(unused),
1729 unsigned(wasted_percent));
1730 }
1731 it = fmt::format_to(it, "\nPage spans:\n");
1732 it = fmt::format_to(it, "index size free used spans\n");
1733
1734 std::array<uint32_t, cpu_pages::nr_span_lists> span_size_histogram;
1735 span_size_histogram.fill(0);
1736
1737 for (unsigned i = 0; i < get_cpu_mem().nr_pages;) {
1738 const auto span_size = get_cpu_mem().pages[i].span_size;
1739 if (!span_size) {
1740 ++i;
1741 continue;
1742 }
1743 ++span_size_histogram[log2ceil(span_size)];
1744 i += span_size;
1745 }
1746
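// Note: the histogram above buckets every span (free or allocated) by log2ceil of its
// size, and the table below treats each span in bucket i as exactly 2^i pages, so the
// "used" column is an estimate when span sizes are not exact powers of two.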
1747 for (unsigned i = 0; i < get_cpu_mem().nr_span_lists; i++) {
1748 auto& span_list = get_cpu_mem().free_spans[i];
1749 auto front = span_list._front;
1750 uint32_t free_pages = 0;
1751 while (front) {
1752 auto& span = get_cpu_mem().pages[front];
1753 free_pages += span.span_size;
1754 front = span.link._next;
1755 }
1756 const auto total_spans = span_size_histogram[i];
1757 const auto total_pages = total_spans * (1 << i);
1758 it = fmt::format_to(it,
1759 "{:>5} {:>5} {:>5} {:>5} {:>5}\n",
1760 i,
1761 to_hr_size((uint64_t(1) << i) * page_size),
1762 to_hr_size(free_pages * page_size),
1763 to_hr_size((total_pages - free_pages) * page_size),
1764 to_hr_number(total_spans));
1765 }
1766
1767 return it;
1768 }
1769
1770 void dump_memory_diagnostics(log_level lvl, logger::rate_limit& rate_limit) {
1771 logger::lambda_log_writer writer([] (seastar::internal::log_buf::inserter_iterator it) {
1772 return do_dump_memory_diagnostics(it);
1773 });
1774 seastar_memory_logger.log(lvl, rate_limit, writer);
1775 }
1776
1777 void internal::log_memory_diagnostics_report(log_level lvl) {
1778 logger::rate_limit rl{std::chrono::seconds(0)}; // never limit for explicit dump requests
1779 dump_memory_diagnostics(lvl, rl);
1780 }
1781
1782 void maybe_dump_memory_diagnostics(size_t size, bool is_aborting) {
1783 if (report_on_alloc_failure_suppressed) {
1784 return;
1785 }
1786
1787 disable_report_on_alloc_failure_temporarily guard;
1788 if (seastar_memory_logger.is_enabled(log_level::debug)) {
1789 seastar_memory_logger.debug("Failed to allocate {} bytes at {}", size, current_backtrace());
1790 }
1791
1792 auto lvl = log_level::debug;
1793 switch (dump_diagnostics_on_alloc_failure_kind.load(std::memory_order_relaxed)) {
1794 case alloc_failure_kind::none:
1795 lvl = log_level::debug;
1796 break;
1797 case alloc_failure_kind::critical:
1798 lvl = is_critical_alloc_section() ? log_level::error : log_level::debug;
1799 break;
1800 case alloc_failure_kind::all:
1801 lvl = log_level::error;
1802 break;
1803 }
1804
1805 if (is_aborting) {
1806 // if we are about to abort, always report the memory diagnostics at error level
1807 lvl = log_level::error;
1808 }
1809
1810 static thread_local logger::rate_limit rate_limit(std::chrono::seconds(10));
1811 dump_memory_diagnostics(lvl, rate_limit);
1814 }
1815
1816 void on_allocation_failure(size_t size) {
1817 alloc_stats::increment(alloc_stats::types::failed_allocs);
1818
1819 bool will_abort = !abort_on_alloc_failure_suppressed
1820 && abort_on_allocation_failure.load(std::memory_order_relaxed);
1821
1822 maybe_dump_memory_diagnostics(size, will_abort);
1823
1824 if (will_abort) {
1825 seastar_logger.error("Failed to allocate {} bytes", size);
1826 abort();
1827 }
1828 }
1829
1830 sstring generate_memory_diagnostics_report() {
1831 seastar::internal::log_buf buf;
1832 auto it = buf.back_insert_begin();
1833 do_dump_memory_diagnostics(it);
1834 return sstring(buf.data(), buf.size());
1835 }
1836
1837 static void trigger_error_injector() {
1838 on_alloc_point();
1839 }
1840
1841 static bool try_trigger_error_injector() {
1842 try {
1843 on_alloc_point();
1844 return false;
1845 } catch (...) {
1846 return true;
1847 }
1848 }
1849
1850 }
1851
1852 }
1853
1854 using namespace seastar::memory;
1855
1856 extern "C"
1857 [[gnu::visibility("default")]]
1858 [[gnu::used]]
1859 void* malloc(size_t n) throw () {
1860 if (try_trigger_error_injector()) {
1861 return nullptr;
1862 }
1863 return allocate(n);
1864 }
1865
1866 extern "C"
1867 [[gnu::alias("malloc")]]
1868 [[gnu::visibility("default")]]
1869 [[gnu::malloc]]
1870 [[gnu::alloc_size(1)]]
1871 #ifndef __clang__
1872 [[gnu::leaf]]
1873 #endif
1874 void* __libc_malloc(size_t n) throw ();
1875
1876 extern "C"
1877 [[gnu::visibility("default")]]
1878 [[gnu::used]]
1879 void free(void* ptr) {
1880 if (ptr) {
1881 seastar::memory::free(ptr);
1882 }
1883 }
1884
1885 extern "C"
1886 [[gnu::alias("free")]]
1887 [[gnu::visibility("default")]]
1888 #ifndef __clang__
1889 [[gnu::leaf]]
1890 #endif
1891 void __libc_free(void* obj) throw ();
1892
1893 extern "C"
1894 [[gnu::visibility("default")]]
1895 void* calloc(size_t nmemb, size_t size) {
1896 if (try_trigger_error_injector()) {
1897 return nullptr;
1898 }
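// Compute nmemb * size in 128-bit arithmetic so that overflow is caught by the
// assert below instead of silently allocating a truncated size.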
1899 auto s1 = __int128(nmemb) * __int128(size);
1900 assert(s1 == size_t(s1));
1901 size_t s = s1;
1902 auto p = malloc(s);
1903 if (p) {
1904 std::memset(p, 0, s);
1905 }
1906 return p;
1907 }
1908
1909 extern "C"
1910 [[gnu::alias("calloc")]]
1911 [[gnu::visibility("default")]]
1912 [[gnu::alloc_size(1, 2)]]
1913 [[gnu::malloc]]
1914 #ifndef __clang__
1915 [[gnu::leaf]]
1916 #endif
1917 void* __libc_calloc(size_t n, size_t m) throw ();
1918
1919 extern "C"
1920 [[gnu::visibility("default")]]
1921 void* realloc(void* ptr, size_t size) {
1922 if (try_trigger_error_injector()) {
1923 return nullptr;
1924 }
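// Dispatch: a null ptr behaves like malloc(); memory not owned by the seastar
// allocator is forwarded to the original realloc (never on a reactor thread);
// seastar-owned memory is shrunk in place or reallocated and copied below.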
1925 if (ptr == nullptr) {
1926 // https://en.cppreference.com/w/cpp/memory/c/realloc
1927 // If ptr is a null pointer, the behavior is the same as calling std::malloc(new_size).
1928 return malloc(size);
1929 } else if (!is_seastar_memory(ptr)) {
1930 // we can't realloc foreign memory on a shard
1931 if (is_reactor_thread) {
1932 abort();
1933 }
1934 // original_realloc_func might still be null if an earlier constructor allocated before it was resolved
1935 if (original_realloc_func) {
1936 return original_realloc_func(ptr, size);
1937 }
1938 }
1939 // if we're here, it's a non-null seastar memory ptr
1940 // or original functions aren't available.
1941 // at any rate, using the seastar allocator is OK now.
1942 auto old_size = ptr ? object_size(ptr) : 0;
1943 if (size == old_size) {
1944 return ptr;
1945 }
1946 if (size == 0) {
1947 ::free(ptr);
1948 return nullptr;
1949 }
1950 if (size < old_size) {
1951 seastar::memory::shrink(ptr, size);
1952 return ptr;
1953 }
1954 auto nptr = malloc(size);
1955 if (!nptr) {
1956 return nptr;
1957 }
1958 if (ptr) {
1959 std::memcpy(nptr, ptr, std::min(size, old_size));
1960 ::free(ptr);
1961 }
1962 return nptr;
1963 }
1964
1965 extern "C"
1966 [[gnu::alias("realloc")]]
1967 [[gnu::visibility("default")]]
1968 [[gnu::alloc_size(2)]]
1969 #ifndef __clang__
1970 [[gnu::leaf]]
1971 #endif
1972 void* __libc_realloc(void* obj, size_t size) throw ();
1973
1974 extern "C"
1975 [[gnu::visibility("default")]]
1976 [[gnu::used]]
1977 #ifndef __clang__
1978 [[gnu::leaf]]
1979 #endif
1980 [[gnu::nonnull(1)]]
1981 int posix_memalign(void** ptr, size_t align, size_t size) throw () {
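// posix_memalign() reports failure through its return value (ENOMEM here), not via an
// exception, so the error injector is mapped to ENOMEM as well.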
1982 if (try_trigger_error_injector()) {
1983 return ENOMEM;
1984 }
1985 *ptr = allocate_aligned(align, size);
1986 if (!*ptr) {
1987 return ENOMEM;
1988 }
1989 return 0;
1990 }
1991
1992 extern "C"
1993 [[gnu::alias("posix_memalign")]]
1994 [[gnu::visibility("default")]]
1995 #ifndef __clang__
1996 [[gnu::leaf]]
1997 #endif
1998 [[gnu::nonnull(1)]]
1999 int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();
2000
2001 extern "C"
2002 [[gnu::visibility("default")]]
2003 [[gnu::malloc]]
2004 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
2005 [[gnu::alloc_size(2)]]
2006 #endif
2007 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 35)
2008 [[gnu::alloc_align(1)]]
2009 #endif
2010 void* memalign(size_t align, size_t size) throw () {
2011 if (try_trigger_error_injector()) {
2012 return nullptr;
2013 }
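// Round the requested size up to a multiple of the alignment before delegating to
// allocate_aligned().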
2014 size = seastar::align_up(size, align);
2015 return allocate_aligned(align, size);
2016 }
2017
2018 extern "C"
2019 [[gnu::visibility("default")]]
2020 void *aligned_alloc(size_t align, size_t size) throw () {
2021 if (try_trigger_error_injector()) {
2022 return nullptr;
2023 }
2024 return allocate_aligned(align, size);
2025 }
2026
2027 extern "C"
2028 [[gnu::alias("memalign")]]
2029 [[gnu::visibility("default")]]
2030 [[gnu::malloc]]
2031 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
2032 [[gnu::alloc_size(2)]]
2033 #endif
2034 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 35)
2035 [[gnu::alloc_align(1)]]
2036 #endif
2037 void* __libc_memalign(size_t align, size_t size) throw ();
2038
2039 extern "C"
2040 [[gnu::visibility("default")]]
2041 void cfree(void* obj) throw () {
2042 return ::free(obj);
2043 }
2044
2045 extern "C"
2046 [[gnu::alias("cfree")]]
2047 [[gnu::visibility("default")]]
2048 void __libc_cfree(void* obj) throw ();
2049
2050 extern "C"
2051 [[gnu::visibility("default")]]
2052 size_t malloc_usable_size(void* obj) {
2053 if (!is_seastar_memory(obj)) {
2054 return original_malloc_usable_size_func(obj);
2055 }
2056 return object_size(obj);
2057 }
2058
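// malloc_trim() is forwarded to libc only on non-reactor threads; on reactor threads
// it is a no-op (returns 0, i.e. no memory released), since trimming the seastar pool
// is not supported here.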
2059 extern "C"
2060 [[gnu::visibility("default")]]
2061 int malloc_trim(size_t pad) {
2062 if (!is_reactor_thread) {
2063 return original_malloc_trim_func(pad);
2064 }
2065 return 0;
2066 }
2067
2068 static inline
2069 void* throw_if_null(void* ptr) {
2070 if (!ptr) {
2071 throw std::bad_alloc();
2072 }
2073 return ptr;
2074 }
2075
2076 [[gnu::visibility("default")]]
2077 void* operator new(size_t size) {
2078 trigger_error_injector();
2079 if (size == 0) {
2080 size = 1;
2081 }
2082 return throw_if_null(allocate(size));
2083 }
2084
2085 [[gnu::visibility("default")]]
2086 void* operator new[](size_t size) {
2087 trigger_error_injector();
2088 if (size == 0) {
2089 size = 1;
2090 }
2091 return throw_if_null(allocate(size));
2092 }
2093
2094 [[gnu::visibility("default")]]
2095 void operator delete(void* ptr) throw () {
2096 if (ptr) {
2097 seastar::memory::free(ptr);
2098 }
2099 }
2100
2101 [[gnu::visibility("default")]]
2102 void operator delete[](void* ptr) throw () {
2103 if (ptr) {
2104 seastar::memory::free(ptr);
2105 }
2106 }
2107
2108 [[gnu::visibility("default")]]
2109 void operator delete(void* ptr, size_t size) throw () {
2110 if (ptr) {
2111 seastar::memory::free(ptr, size);
2112 }
2113 }
2114
2115 [[gnu::visibility("default")]]
2116 void operator delete[](void* ptr, size_t size) throw () {
2117 if (ptr) {
2118 seastar::memory::free(ptr, size);
2119 }
2120 }
2121
2122 [[gnu::visibility("default")]]
2123 void* operator new(size_t size, std::nothrow_t) throw () {
2124 if (try_trigger_error_injector()) {
2125 return nullptr;
2126 }
2127 if (size == 0) {
2128 size = 1;
2129 }
2130 return allocate(size);
2131 }
2132
2133 [[gnu::visibility("default")]]
2134 void* operator new[](size_t size, std::nothrow_t) throw () {
if (try_trigger_error_injector()) {
return nullptr;
}
2135 if (size == 0) {
2136 size = 1;
2137 }
2138 return allocate(size);
2139 }
2140
2141 [[gnu::visibility("default")]]
2142 void operator delete(void* ptr, std::nothrow_t) throw () {
2143 if (ptr) {
2144 seastar::memory::free(ptr);
2145 }
2146 }
2147
2148 [[gnu::visibility("default")]]
2149 void operator delete[](void* ptr, std::nothrow_t) throw () {
2150 if (ptr) {
2151 seastar::memory::free(ptr);
2152 }
2153 }
2154
2155 [[gnu::visibility("default")]]
2156 void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
2157 if (ptr) {
2158 seastar::memory::free(ptr, size);
2159 }
2160 }
2161
2162 [[gnu::visibility("default")]]
2163 void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
2164 if (ptr) {
2165 seastar::memory::free(ptr, size);
2166 }
2167 }
2168
2169 #ifdef __cpp_aligned_new
2170
2171 [[gnu::visibility("default")]]
2172 void* operator new(size_t size, std::align_val_t a) {
2173 trigger_error_injector();
2174 auto ptr = allocate_aligned(size_t(a), size);
2175 return throw_if_null(ptr);
2176 }
2177
2178 [[gnu::visibility("default")]]
2179 void* operator new[](size_t size, std::align_val_t a) {
2180 trigger_error_injector();
2181 auto ptr = allocate_aligned(size_t(a), size);
2182 return throw_if_null(ptr);
2183 }
2184
2185 [[gnu::visibility("default")]]
2186 void* operator new(size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
2187 if (try_trigger_error_injector()) {
2188 return nullptr;
2189 }
2190 return allocate_aligned(size_t(a), size);
2191 }
2192
2193 [[gnu::visibility("default")]]
2194 void* operator new[](size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
2195 if (try_trigger_error_injector()) {
2196 return nullptr;
2197 }
2198 return allocate_aligned(size_t(a), size);
2199 }
2200
2201
2202 [[gnu::visibility("default")]]
2203 void operator delete(void* ptr, std::align_val_t a) noexcept {
2204 if (ptr) {
2205 seastar::memory::free(ptr);
2206 }
2207 }
2208
2209 [[gnu::visibility("default")]]
2210 void operator delete[](void* ptr, std::align_val_t a) noexcept {
2211 if (ptr) {
2212 seastar::memory::free(ptr);
2213 }
2214 }
2215
2216 [[gnu::visibility("default")]]
2217 void operator delete(void* ptr, size_t size, std::align_val_t a) noexcept {
2218 if (ptr) {
2219 seastar::memory::free_aligned(ptr, size_t(a), size);
2220 }
2221 }
2222
2223 [[gnu::visibility("default")]]
2224 void operator delete[](void* ptr, size_t size, std::align_val_t a) noexcept {
2225 if (ptr) {
2226 seastar::memory::free_aligned(ptr, size_t(a), size);
2227 }
2228 }
2229
2230 [[gnu::visibility("default")]]
2231 void operator delete(void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
2232 if (ptr) {
2233 seastar::memory::free(ptr);
2234 }
2235 }
2236
2237 [[gnu::visibility("default")]]
2238 void operator delete[](void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
2239 if (ptr) {
2240 seastar::memory::free(ptr);
2241 }
2242 }
2243
2244 #endif
2245
2246 namespace seastar {
2247
2248 #else
2249
2250 namespace seastar {
2251
2252 namespace memory {
2253
2254 disable_backtrace_temporarily::disable_backtrace_temporarily() {
2255 (void)_old;
2256 }
2257
2258 disable_backtrace_temporarily::~disable_backtrace_temporarily() {
2259 }
2260
2261 void set_heap_profiling_enabled(bool enabled) {
2262 seastar_logger.warn("Seastar compiled with default allocator, heap profiler not supported");
2263 }
2264
2265 scoped_heap_profiling::scoped_heap_profiling() noexcept {
2266 set_heap_profiling_enabled(true); // let it print the warning
2267 }
2268
2269 scoped_heap_profiling::~scoped_heap_profiling() {
2270 }
2271
2272 void enable_abort_on_allocation_failure() {
2273 seastar_logger.warn("Seastar compiled with default allocator, will not abort on bad_alloc");
2274 }
2275
2276 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope) {
2277 }
2278
2279 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope) {
2280 }
2281
2282 reclaimer::~reclaimer() {
2283 }
2284
2285 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
2286 }
2287
2288 void configure(std::vector<resource::memory> m, bool mbind, std::optional<std::string> hugepages_path) {
2289 }
2290
2291 statistics stats() {
2292 return statistics{0, 0, 0, 1 << 30, 1 << 30, 0, 0, 0, 0, 0, 0};
2293 }
2294
2295 size_t free_memory() {
2296 return stats().free_memory();
2297 }
2298
2299 bool drain_cross_cpu_freelist() {
2300 return false;
2301 }
2302
2303 memory_layout get_memory_layout() {
2304 throw std::runtime_error("get_memory_layout() not supported");
2305 }
2306
2307 size_t min_free_memory() {
2308 return 0;
2309 }
2310
2311 void set_min_free_pages(size_t pages) {
2312 // Ignore, reclaiming not supported for default allocator.
2313 }
2314
2315 void set_large_allocation_warning_threshold(size_t) {
2316 // Ignore, not supported for default allocator.
2317 }
2318
2319 size_t get_large_allocation_warning_threshold() {
2320 // Ignore, not supported for default allocator.
2321 return std::numeric_limits<size_t>::max();
2322 }
2323
2324 void disable_large_allocation_warning() {
2325 // Ignore, not supported for default allocator.
2326 }
2327
2328
2329 void set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind) {
2330 // Ignore, not supported for default allocator.
2331 }
2332
2333 void set_dump_memory_diagnostics_on_alloc_failure_kind(std::string_view) {
2334 // Ignore, not supported for default allocator.
2335 }
2336
2337 void set_additional_diagnostics_producer(noncopyable_function<void(memory_diagnostics_writer)>) {
2338 // Ignore, not supported for default allocator.
2339 }
2340
2341 sstring generate_memory_diagnostics_report() {
2342 // Ignore, not supported for default allocator.
2343 return {};
2344 }
2345
2346 }
2347
2348 }
2349
2350 namespace seastar {
2351
2352 #endif
2353
2354 /// \endcond
2355
2356 }