1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
20 */
21
22
23 /// \cond internal
24
25 //
26 // Seastar memory allocator
27 //
28 // This is a share-nothing allocator (memory allocated on one cpu must
29 // be freed on the same cpu).
30 //
31 // Inspired by gperftools' tcmalloc.
32 //
33 // Memory map:
34 //
35 // 0x0000'sccc'vvvv'vvvv
36 //
37 // 0000 - required by architecture (only 48 bits of address space)
38 // s - chosen to satisfy system allocator (1-7)
// ccc  - cpu number (0-12 bits; how many are allocated varies with the system)
// v    - virtual address within cpu (32-44 bits, according to how much ccc
//        leaves us)
42 //
43 // Each page has a page structure that describes it. Within a cpu's
44 // memory pool, the page array starts at offset 0, describing all pages
45 // within that pool. Page 0 does not describe a valid page.
46 //
47 // Each pool can contain at most 2^32 pages (or 44 address bits), so we can
48 // use a 32-bit integer to identify a page.
49 //
50 // Runs of pages are organized into spans. Free spans are organized into lists,
51 // by size. When spans are broken up or coalesced, they may move into new lists.
52 // Spans have a size that is a power-of-two and are naturally aligned (aka buddy
53 // allocator)
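//
// Worked example (illustrative; assumes the cpu_id_shift of 36 and the 8-bit
// cpu field configured below): cpu 3's pool starts at mem_base() + (3 << 36),
// so its addresses have the form 0x0000's030'0000'0000 .. 0x0000's03f'ffff'ffff
// (a 64 GiB window), and object_cpu_id() recovers the cpu number by shifting
// a pointer right by 36 bits and masking with 0xff.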
54
55 #include <seastar/core/cacheline.hh>
56 #include <seastar/core/memory.hh>
57 #include <seastar/core/reactor.hh>
58 #include <seastar/core/print.hh>
59 #include <seastar/util/alloc_failure_injector.hh>
60 #include <seastar/util/std-compat.hh>
61 #include <iostream>
62
63 namespace seastar {
64
65 void* internal::allocate_aligned_buffer_impl(size_t size, size_t align) {
66 void *ret;
67 auto r = posix_memalign(&ret, align, size);
68 if (r == ENOMEM) {
69 throw std::bad_alloc();
70 } else if (r == EINVAL) {
71 throw std::runtime_error(format("Invalid alignment of {:d}; allocating {:d} bytes", align, size));
72 } else {
73 assert(r == 0);
74 return ret;
75 }
76 }
77
78 namespace memory {
79
80 static thread_local int abort_on_alloc_failure_suppressed = 0;
81
82 disable_abort_on_alloc_failure_temporarily::disable_abort_on_alloc_failure_temporarily() {
83 ++abort_on_alloc_failure_suppressed;
84 }
85
86 disable_abort_on_alloc_failure_temporarily::~disable_abort_on_alloc_failure_temporarily() noexcept {
87 --abort_on_alloc_failure_suppressed;
88 }
89
static compat::polymorphic_allocator<char> static_malloc_allocator{compat::pmr_get_default_resource()};
91 compat::polymorphic_allocator<char>* malloc_allocator{&static_malloc_allocator};
92
93 }
94
95 }
96
97 #ifndef SEASTAR_DEFAULT_ALLOCATOR
98
99 #include <seastar/core/bitops.hh>
100 #include <seastar/core/align.hh>
101 #include <seastar/core/posix.hh>
102 #include <seastar/core/shared_ptr.hh>
103 #include <new>
104 #include <cstdint>
105 #include <algorithm>
106 #include <limits>
107 #include <cassert>
108 #include <atomic>
109 #include <mutex>
110 #include <seastar/util/std-compat.hh>
111 #include <functional>
112 #include <cstring>
113 #include <boost/intrusive/list.hpp>
114 #include <sys/mman.h>
115 #include <seastar/util/defer.hh>
116 #include <seastar/util/backtrace.hh>
117
118 #ifdef SEASTAR_HAVE_NUMA
119 #include <numaif.h>
120 #endif
121
122 namespace seastar {
123
124 struct allocation_site {
125 mutable size_t count = 0; // number of live objects allocated at backtrace.
126 mutable size_t size = 0; // amount of bytes in live objects allocated at backtrace.
127 mutable const allocation_site* next = nullptr;
128 saved_backtrace backtrace;
129
130 bool operator==(const allocation_site& o) const {
131 return backtrace == o.backtrace;
132 }
133
134 bool operator!=(const allocation_site& o) const {
135 return !(*this == o);
136 }
137 };
138
139 }
140
141 namespace std {
142
143 template<>
144 struct hash<seastar::allocation_site> {
145 size_t operator()(const seastar::allocation_site& bi) const {
146 return std::hash<seastar::saved_backtrace>()(bi.backtrace);
147 }
148 };
149
150 }
151
152 namespace seastar {
153
154 using allocation_site_ptr = const allocation_site*;
155
156 namespace memory {
157
158 seastar::logger seastar_memory_logger("seastar_memory");
159
160 [[gnu::unused]]
161 static allocation_site_ptr get_allocation_site();
162
163 static void on_allocation_failure(size_t size);
164
165 static constexpr unsigned cpu_id_shift = 36; // FIXME: make dynamic
166 static constexpr unsigned max_cpus = 256;
167
168 using pageidx = uint32_t;
169
170 struct page;
171 class page_list;
172
173 static std::atomic<bool> live_cpus[max_cpus];
174
175 static thread_local uint64_t g_allocs;
176 static thread_local uint64_t g_frees;
177 static thread_local uint64_t g_cross_cpu_frees;
178 static thread_local uint64_t g_reclaims;
179 static thread_local uint64_t g_large_allocs;
180
181 using compat::optional;
182
183 using allocate_system_memory_fn
184 = std::function<mmap_area (optional<void*> where, size_t how_much)>;
185
186 namespace bi = boost::intrusive;
187
188 inline
189 unsigned object_cpu_id(const void* ptr) {
190 return (reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff;
191 }
192
193 class page_list_link {
194 uint32_t _prev;
195 uint32_t _next;
196 friend class page_list;
197 friend void on_allocation_failure(size_t);
198 };
199
// Reserve (PROT_NONE) a 2^44-byte region aligned to 2^44 bytes; each cpu's
// pool is later carved out of it at mem_base() + (cpu_id << cpu_id_shift).
static char* mem_base() {
201 static char* known;
202 static std::once_flag flag;
203 std::call_once(flag, [] {
204 size_t alloc = size_t(1) << 44;
205 auto r = ::mmap(NULL, 2 * alloc,
206 PROT_NONE,
207 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
208 -1, 0);
209 if (r == MAP_FAILED) {
210 abort();
211 }
212 ::madvise(r, 2 * alloc, MADV_DONTDUMP);
213 auto cr = reinterpret_cast<char*>(r);
214 known = align_up(cr, alloc);
215 ::munmap(cr, known - cr);
216 ::munmap(known + alloc, cr + 2 * alloc - (known + alloc));
217 });
218 return known;
219 }
220
221 constexpr bool is_page_aligned(size_t size) {
222 return (size & (page_size - 1)) == 0;
223 }
224
225 constexpr size_t next_page_aligned(size_t size) {
226 return (size + (page_size - 1)) & ~(page_size - 1);
227 }
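
// A few illustrative compile-time checks of the alignment helpers above:
static_assert(next_page_aligned(1) == page_size, "");
static_assert(next_page_aligned(page_size) == page_size, "");
static_assert(is_page_aligned(2 * page_size) && !is_page_aligned(page_size + 1), "");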
228
229 class small_pool;
230
231 struct free_object {
232 free_object* next;
233 };
234
235 struct page {
236 bool free;
237 uint8_t offset_in_span;
238 uint16_t nr_small_alloc;
239 uint32_t span_size; // in pages, if we're the head or the tail
240 page_list_link link;
241 small_pool* pool; // if used in a small_pool
242 free_object* freelist;
243 #ifdef SEASTAR_HEAPPROF
    allocation_site_ptr alloc_site; // for objects whose size is a multiple of page size; valid for the head page only
245 #endif
246 };
247
248 class page_list {
249 uint32_t _front = 0;
250 uint32_t _back = 0;
251 public:
252 page& front(page* ary) { return ary[_front]; }
253 page& back(page* ary) { return ary[_back]; }
254 bool empty() const { return !_front; }
255 void erase(page* ary, page& span) {
256 if (span.link._next) {
257 ary[span.link._next].link._prev = span.link._prev;
258 } else {
259 _back = span.link._prev;
260 }
261 if (span.link._prev) {
262 ary[span.link._prev].link._next = span.link._next;
263 } else {
264 _front = span.link._next;
265 }
266 }
267 void push_front(page* ary, page& span) {
268 auto idx = &span - ary;
269 if (_front) {
270 ary[_front].link._prev = idx;
271 } else {
272 _back = idx;
273 }
274 span.link._next = _front;
275 span.link._prev = 0;
276 _front = idx;
277 }
278 void pop_front(page* ary) {
279 if (ary[_front].link._next) {
280 ary[ary[_front].link._next].link._prev = 0;
281 } else {
282 _back = 0;
283 }
284 _front = ary[_front].link._next;
285 }
286 friend void on_allocation_failure(size_t);
287 };
288
289 class small_pool {
290 struct span_sizes {
291 uint8_t preferred;
292 uint8_t fallback;
293 };
294 unsigned _object_size;
295 span_sizes _span_sizes;
296 free_object* _free = nullptr;
297 size_t _free_count = 0;
298 unsigned _min_free;
299 unsigned _max_free;
300 unsigned _pages_in_use = 0;
301 page_list _span_list;
302 static constexpr unsigned idx_frac_bits = 2;
303 public:
304 explicit small_pool(unsigned object_size) noexcept;
305 ~small_pool();
306 void* allocate();
307 void deallocate(void* object);
308 unsigned object_size() const { return _object_size; }
309 bool objects_page_aligned() const { return is_page_aligned(_object_size); }
310 static constexpr unsigned size_to_idx(unsigned size);
311 static constexpr unsigned idx_to_size(unsigned idx);
312 allocation_site_ptr& alloc_site_holder(void* ptr);
313 private:
314 void add_more_objects();
315 void trim_free_list();
316 friend void on_allocation_failure(size_t);
317 };
318
// index 0b0001'0011 -> size (1 << 4) + (0b11 << (4 - 2))
320
321 constexpr unsigned
322 small_pool::idx_to_size(unsigned idx) {
323 return (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
324 << (idx >> idx_frac_bits))
325 >> idx_frac_bits;
326 }
327
328 constexpr unsigned
329 small_pool::size_to_idx(unsigned size) {
330 return ((log2floor(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
331 + ((size - 1) >> (log2floor(size) - idx_frac_bits));
332 }
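
// Illustrative compile-time spot checks of the encoding above: size
// 28 == (1 << 4) + (0b11 << 2) corresponds to index 0b0001'0011 == 19, and a
// power-of-two size such as 1024 round-trips exactly.
static_assert(small_pool::idx_to_size(19) == 28, "");
static_assert(small_pool::size_to_idx(28) == 19, "");
static_assert(small_pool::idx_to_size(small_pool::size_to_idx(1024)) == 1024, "");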
333
334 class small_pool_array {
335 public:
336 static constexpr unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
337 private:
338 union u {
339 small_pool a[nr_small_pools];
340 u() {
341 for (unsigned i = 0; i < nr_small_pools; ++i) {
342 new (&a[i]) small_pool(small_pool::idx_to_size(i));
343 }
344 }
345 ~u() {
346 // cannot really call destructor, since other
347 // objects may be freed after we are gone.
348 }
349 } _u;
350 public:
351 small_pool& operator[](unsigned idx) { return _u.a[idx]; }
352 };
353
354 static constexpr size_t max_small_allocation
355 = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);
356
357 constexpr size_t object_size_with_alloc_site(size_t size) {
358 #ifdef SEASTAR_HEAPPROF
359 // For page-aligned sizes, allocation_site* lives in page::alloc_site, not with the object.
360 static_assert(is_page_aligned(max_small_allocation), "assuming that max_small_allocation is page aligned so that we"
361 " don't need to add allocation_site_ptr to objects of size close to it");
362 size_t next_page_aligned_size = next_page_aligned(size);
363 if (next_page_aligned_size - size > sizeof(allocation_site_ptr)) {
364 size += sizeof(allocation_site_ptr);
365 } else {
366 return next_page_aligned_size;
367 }
368 #endif
369 return size;
370 }
371
372 #ifdef SEASTAR_HEAPPROF
373 // Ensure that object_size_with_alloc_site() does not exceed max_small_allocation
374 static_assert(object_size_with_alloc_site(max_small_allocation) == max_small_allocation, "");
375 static_assert(object_size_with_alloc_site(max_small_allocation - 1) == max_small_allocation, "");
376 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) + 1) == max_small_allocation, "");
377 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr)) == max_small_allocation, "");
378 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 1) == max_small_allocation - 1, "");
379 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 2) == max_small_allocation - 2, "");
380 #endif
381
382 struct cross_cpu_free_item {
383 cross_cpu_free_item* next;
384 };
385
386 struct cpu_pages {
387 uint32_t min_free_pages = 20000000 / page_size;
388 char* memory;
389 page* pages;
390 uint32_t nr_pages;
391 uint32_t nr_free_pages;
392 uint32_t current_min_free_pages = 0;
393 size_t large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
394 unsigned cpu_id = -1U;
395 std::function<void (std::function<void ()>)> reclaim_hook;
396 std::vector<reclaimer*> reclaimers;
397 static constexpr unsigned nr_span_lists = 32;
398 page_list free_spans[nr_span_lists]; // contains aligned spans with span_size == 2^idx
399 small_pool_array small_pools;
400 alignas(seastar::cache_line_size) std::atomic<cross_cpu_free_item*> xcpu_freelist;
401 static std::atomic<unsigned> cpu_id_gen;
402 static cpu_pages* all_cpus[max_cpus];
403 union asu {
404 using alloc_sites_type = std::unordered_set<allocation_site>;
405 asu() : alloc_sites{} {
406 }
407 ~asu() {} // alloc_sites live forever
408 alloc_sites_type alloc_sites;
409 } asu;
410 allocation_site_ptr alloc_site_list_head = nullptr; // For easy traversal of asu.alloc_sites from scylla-gdb.py
411 bool collect_backtrace = false;
412 char* mem() { return memory; }
413
414 void link(page_list& list, page* span);
415 void unlink(page_list& list, page* span);
416 struct trim {
417 unsigned offset;
418 unsigned nr_pages;
419 };
420 void maybe_reclaim();
421 void* allocate_large_and_trim(unsigned nr_pages);
422 void* allocate_large(unsigned nr_pages);
423 void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
424 page* find_and_unlink_span(unsigned nr_pages);
425 page* find_and_unlink_span_reclaiming(unsigned n_pages);
426 void free_large(void* ptr);
427 bool grow_span(pageidx& start, uint32_t& nr_pages, unsigned idx);
428 void free_span(pageidx start, uint32_t nr_pages);
429 void free_span_no_merge(pageidx start, uint32_t nr_pages);
430 void free_span_unaligned(pageidx start, uint32_t nr_pages);
431 void* allocate_small(unsigned size);
432 void free(void* ptr);
433 void free(void* ptr, size_t size);
434 bool try_cross_cpu_free(void* ptr);
435 void shrink(void* ptr, size_t new_size);
436 void free_cross_cpu(unsigned cpu_id, void* ptr);
437 bool drain_cross_cpu_freelist();
438 size_t object_size(void* ptr);
439 page* to_page(void* p) {
440 return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
441 }
442
443 bool is_initialized() const;
444 bool initialize();
445 reclaiming_result run_reclaimers(reclaimer_scope, size_t pages_to_reclaim);
446 void schedule_reclaim();
447 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
448 void set_min_free_pages(size_t pages);
449 void resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
450 void do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
451 void replace_memory_backing(allocate_system_memory_fn alloc_sys_mem);
452 void check_large_allocation(size_t size);
453 void warn_large_allocation(size_t size);
454 memory::memory_layout memory_layout();
455 ~cpu_pages();
456 };
457
458 static thread_local cpu_pages cpu_mem;
459 std::atomic<unsigned> cpu_pages::cpu_id_gen;
460 cpu_pages* cpu_pages::all_cpus[max_cpus];
461
462 #ifdef SEASTAR_HEAPPROF
463
464 void set_heap_profiling_enabled(bool enable) {
465 bool is_enabled = cpu_mem.collect_backtrace;
466 if (enable) {
467 if (!is_enabled) {
468 seastar_logger.info("Enabling heap profiler");
469 }
470 } else {
471 if (is_enabled) {
472 seastar_logger.info("Disabling heap profiler");
473 }
474 }
475 cpu_mem.collect_backtrace = enable;
476 }
477
478 static thread_local int64_t scoped_heap_profiling_embed_count = 0;
479
480 scoped_heap_profiling::scoped_heap_profiling() noexcept {
481 ++scoped_heap_profiling_embed_count;
482 set_heap_profiling_enabled(true);
483 }
484
485 scoped_heap_profiling::~scoped_heap_profiling() {
486 if (!--scoped_heap_profiling_embed_count) {
487 set_heap_profiling_enabled(false);
488 }
489 }
490
491 #else
492
493 void set_heap_profiling_enabled(bool enable) {
494 seastar_logger.warn("Seastar compiled without heap profiling support, heap profiler not supported;"
495 " compile with the Seastar_HEAP_PROFILING=ON CMake option to add heap profiling support");
496 }
497
498 scoped_heap_profiling::scoped_heap_profiling() noexcept {
499 set_heap_profiling_enabled(true); // let it print the warning
500 }
501
502 scoped_heap_profiling::~scoped_heap_profiling() {
503 }
504
505 #endif
506
507 // Smallest index i such that all spans stored in the index are >= pages.
508 static inline
509 unsigned index_of(unsigned pages) {
510 if (pages == 1) {
511 return 0;
512 }
513 return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages - 1);
514 }
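
// For example: index_of(1) == 0, index_of(4) == 2 and index_of(5) == 3;
// free_spans[i] (below) only ever holds spans of exactly 1 << i pages, so
// searching from index_of(n) upwards always yields a span large enough.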
515
516 void
517 cpu_pages::unlink(page_list& list, page* span) {
518 list.erase(pages, *span);
519 }
520
521 void
522 cpu_pages::link(page_list& list, page* span) {
523 list.push_front(pages, *span);
524 }
525
526 void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
527 assert(nr_pages);
528 nr_free_pages += nr_pages;
529 auto span = &pages[span_start];
530 auto span_end = &pages[span_start + nr_pages - 1];
531 span->free = span_end->free = true;
532 span->span_size = span_end->span_size = nr_pages;
533 auto idx = index_of(nr_pages);
534 link(free_spans[idx], span);
535 }
536
537 bool cpu_pages::grow_span(uint32_t& span_start, uint32_t& nr_pages, unsigned idx) {
538 auto which = (span_start >> idx) & 1; // 0=lower, 1=upper
539 // locate first page of upper buddy or last page of lower buddy
540 // examples: span_start = 0x10 nr_pages = 0x08 -> buddy = 0x18 (which = 0)
541 // span_start = 0x18 nr_pages = 0x08 -> buddy = 0x17 (which = 1)
542 // delta = which ? -1u : nr_pages
543 auto delta = ((which ^ 1) << idx) | -which;
544 auto buddy = span_start + delta;
545 if (pages[buddy].free && pages[buddy].span_size == nr_pages) {
546 unlink(free_spans[idx], &pages[span_start ^ nr_pages]);
547 nr_free_pages -= nr_pages; // free_span_no_merge() will restore
548 span_start &= ~nr_pages;
549 nr_pages *= 2;
550 return true;
551 }
552 return false;
553 }
554
555 void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
556 auto idx = index_of(nr_pages);
557 while (grow_span(span_start, nr_pages, idx)) {
558 ++idx;
559 }
560 free_span_no_merge(span_start, nr_pages);
561 }
562
563 // Internal, used during startup. Span is not aligned so needs to be broken up
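// A worked example of the splitting below: span_start = 6, nr_pages = 10 is
// released as the naturally-aligned power-of-two spans [6, 8) and [8, 16).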
564 void cpu_pages::free_span_unaligned(uint32_t span_start, uint32_t nr_pages) {
565 while (nr_pages) {
566 auto start_nr_bits = span_start ? count_trailing_zeros(span_start) : 32;
567 auto size_nr_bits = count_trailing_zeros(nr_pages);
568 auto now = 1u << std::min(start_nr_bits, size_nr_bits);
569 free_span(span_start, now);
570 span_start += now;
571 nr_pages -= now;
572 }
573 }
574
575 page*
576 cpu_pages::find_and_unlink_span(unsigned n_pages) {
577 auto idx = index_of(n_pages);
578 if (n_pages >= (2u << idx)) {
579 return nullptr;
580 }
581 while (idx < nr_span_lists && free_spans[idx].empty()) {
582 ++idx;
583 }
584 if (idx == nr_span_lists) {
585 if (initialize()) {
586 return find_and_unlink_span(n_pages);
587 }
588 return nullptr;
589 }
590 auto& list = free_spans[idx];
591 page* span = &list.front(pages);
592 unlink(list, span);
593 return span;
594 }
595
596 page*
597 cpu_pages::find_and_unlink_span_reclaiming(unsigned n_pages) {
598 while (true) {
599 auto span = find_and_unlink_span(n_pages);
600 if (span) {
601 return span;
602 }
603 if (run_reclaimers(reclaimer_scope::sync, n_pages) == reclaiming_result::reclaimed_nothing) {
604 return nullptr;
605 }
606 }
607 }
608
609 void cpu_pages::maybe_reclaim() {
610 if (nr_free_pages < current_min_free_pages) {
611 drain_cross_cpu_freelist();
612 if (nr_free_pages < current_min_free_pages) {
613 run_reclaimers(reclaimer_scope::sync, current_min_free_pages - nr_free_pages);
614 }
615 if (nr_free_pages < current_min_free_pages) {
616 schedule_reclaim();
617 }
618 }
619 }
620
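// Allocate a span of at least n_pages and give back the unused buddies.
// For example, a 3-page request served from a free 16-page span splits off
// and re-frees the upper 8-page and 4-page buddies, returning a 4-page span.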
621 void*
622 cpu_pages::allocate_large_and_trim(unsigned n_pages) {
623 // Avoid exercising the reclaimers for requests we'll not be able to satisfy
624 // nr_pages might be zero during startup, so check for that too
625 if (nr_pages && n_pages >= nr_pages) {
626 return nullptr;
627 }
628 page* span = find_and_unlink_span_reclaiming(n_pages);
629 if (!span) {
630 return nullptr;
631 }
632 auto span_size = span->span_size;
633 auto span_idx = span - pages;
634 nr_free_pages -= span->span_size;
635 while (span_size >= n_pages * 2) {
636 span_size /= 2;
637 auto other_span_idx = span_idx + span_size;
638 free_span_no_merge(other_span_idx, span_size);
639 }
640 auto span_end = &pages[span_idx + span_size - 1];
641 span->free = span_end->free = false;
642 span->span_size = span_end->span_size = span_size;
643 span->pool = nullptr;
644 #ifdef SEASTAR_HEAPPROF
645 auto alloc_site = get_allocation_site();
646 span->alloc_site = alloc_site;
647 if (alloc_site) {
648 ++alloc_site->count;
649 alloc_site->size += span->span_size * page_size;
650 }
651 #endif
652 maybe_reclaim();
653 return mem() + span_idx * page_size;
654 }
655
656 void
657 cpu_pages::warn_large_allocation(size_t size) {
658 ++g_large_allocs;
659 seastar_memory_logger.warn("oversized allocation: {} bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at {}", size, current_backtrace());
660 large_allocation_warning_threshold *= 1.618; // prevent spam
661 }
662
663 void
664 inline
665 cpu_pages::check_large_allocation(size_t size) {
666 if (size > large_allocation_warning_threshold) {
667 warn_large_allocation(size);
668 }
669 }
670
671 void*
672 cpu_pages::allocate_large(unsigned n_pages) {
673 check_large_allocation(n_pages * page_size);
674 return allocate_large_and_trim(n_pages);
675 }
676
677 void*
678 cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
679 check_large_allocation(n_pages * page_size);
680 // buddy allocation is always aligned
681 return allocate_large_and_trim(n_pages);
682 }
683
684 #ifdef SEASTAR_HEAPPROF
685
686 class disable_backtrace_temporarily {
687 bool _old;
688 public:
689 disable_backtrace_temporarily() {
690 _old = cpu_mem.collect_backtrace;
691 cpu_mem.collect_backtrace = false;
692 }
693 ~disable_backtrace_temporarily() {
694 cpu_mem.collect_backtrace = _old;
695 }
696 };
697
698 #else
699
700 struct disable_backtrace_temporarily {
701 ~disable_backtrace_temporarily() {}
702 };
703
704 #endif
705
706 static
707 saved_backtrace get_backtrace() noexcept {
708 disable_backtrace_temporarily dbt;
709 return current_backtrace();
710 }
711
712 static
713 allocation_site_ptr get_allocation_site() {
714 if (!cpu_mem.is_initialized() || !cpu_mem.collect_backtrace) {
715 return nullptr;
716 }
717 disable_backtrace_temporarily dbt;
718 allocation_site new_alloc_site;
719 new_alloc_site.backtrace = get_backtrace();
720 auto insert_result = cpu_mem.asu.alloc_sites.insert(std::move(new_alloc_site));
721 allocation_site_ptr alloc_site = &*insert_result.first;
722 if (insert_result.second) {
723 alloc_site->next = cpu_mem.alloc_site_list_head;
724 cpu_mem.alloc_site_list_head = alloc_site;
725 }
726 return alloc_site;
727 }
728
729 #ifdef SEASTAR_HEAPPROF
730
731 allocation_site_ptr&
732 small_pool::alloc_site_holder(void* ptr) {
733 if (objects_page_aligned()) {
734 return cpu_mem.to_page(ptr)->alloc_site;
735 } else {
736 return *reinterpret_cast<allocation_site_ptr*>(reinterpret_cast<char*>(ptr) + _object_size - sizeof(allocation_site_ptr));
737 }
738 }
739
740 #endif
741
742 void*
743 cpu_pages::allocate_small(unsigned size) {
744 auto idx = small_pool::size_to_idx(size);
745 auto& pool = small_pools[idx];
746 assert(size <= pool.object_size());
747 auto ptr = pool.allocate();
748 #ifdef SEASTAR_HEAPPROF
749 if (!ptr) {
750 return nullptr;
751 }
752 allocation_site_ptr alloc_site = get_allocation_site();
753 if (alloc_site) {
754 ++alloc_site->count;
755 alloc_site->size += pool.object_size();
756 }
757 new (&pool.alloc_site_holder(ptr)) allocation_site_ptr{alloc_site};
758 #endif
759 return ptr;
760 }
761
762 void cpu_pages::free_large(void* ptr) {
763 pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
764 page* span = &pages[idx];
765 #ifdef SEASTAR_HEAPPROF
766 auto alloc_site = span->alloc_site;
767 if (alloc_site) {
768 --alloc_site->count;
769 alloc_site->size -= span->span_size * page_size;
770 }
771 #endif
772 free_span(idx, span->span_size);
773 }
774
775 size_t cpu_pages::object_size(void* ptr) {
776 page* span = to_page(ptr);
777 if (span->pool) {
778 auto s = span->pool->object_size();
779 #ifdef SEASTAR_HEAPPROF
780 // We must not allow the object to be extended onto the allocation_site_ptr field.
781 if (!span->pool->objects_page_aligned()) {
782 s -= sizeof(allocation_site_ptr);
783 }
784 #endif
785 return s;
786 } else {
787 return size_t(span->span_size) * page_size;
788 }
789 }
790
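// Cross-cpu frees: the freeing cpu pushes the object onto the owning cpu's
// lock-free xcpu_freelist with a CAS on the list head (a Treiber stack); the
// owner later drains the whole list at once via exchange(nullptr) in
// drain_cross_cpu_freelist().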
791 void cpu_pages::free_cross_cpu(unsigned cpu_id, void* ptr) {
792 if (!live_cpus[cpu_id].load(std::memory_order_relaxed)) {
793 // Thread was destroyed; leak object
794 // should only happen for boost unit-tests.
795 return;
796 }
797 auto p = reinterpret_cast<cross_cpu_free_item*>(ptr);
798 auto& list = all_cpus[cpu_id]->xcpu_freelist;
799 auto old = list.load(std::memory_order_relaxed);
800 do {
801 p->next = old;
802 } while (!list.compare_exchange_weak(old, p, std::memory_order_release, std::memory_order_relaxed));
803 ++g_cross_cpu_frees;
804 }
805
806 bool cpu_pages::drain_cross_cpu_freelist() {
807 if (!xcpu_freelist.load(std::memory_order_relaxed)) {
808 return false;
809 }
810 auto p = xcpu_freelist.exchange(nullptr, std::memory_order_acquire);
811 while (p) {
812 auto n = p->next;
813 ++g_frees;
814 free(p);
815 p = n;
816 }
817 return true;
818 }
819
820 void cpu_pages::free(void* ptr) {
821 page* span = to_page(ptr);
822 if (span->pool) {
823 small_pool& pool = *span->pool;
824 #ifdef SEASTAR_HEAPPROF
825 allocation_site_ptr alloc_site = pool.alloc_site_holder(ptr);
826 if (alloc_site) {
827 --alloc_site->count;
828 alloc_site->size -= pool.object_size();
829 }
830 #endif
831 pool.deallocate(ptr);
832 } else {
833 free_large(ptr);
834 }
835 }
836
837 void cpu_pages::free(void* ptr, size_t size) {
    // match the size adjustments made in allocate() so we hit the right pool
839 if (size <= sizeof(free_object)) {
840 size = sizeof(free_object);
841 }
842 if (size <= max_small_allocation) {
843 size = object_size_with_alloc_site(size);
844 auto pool = &small_pools[small_pool::size_to_idx(size)];
845 #ifdef SEASTAR_HEAPPROF
846 allocation_site_ptr alloc_site = pool->alloc_site_holder(ptr);
847 if (alloc_site) {
848 --alloc_site->count;
849 alloc_site->size -= pool->object_size();
850 }
851 #endif
852 pool->deallocate(ptr);
853 } else {
854 free_large(ptr);
855 }
856 }
857
858 bool
859 cpu_pages::try_cross_cpu_free(void* ptr) {
860 auto obj_cpu = object_cpu_id(ptr);
861 if (obj_cpu != cpu_id) {
862 free_cross_cpu(obj_cpu, ptr);
863 return true;
864 }
865 return false;
866 }
867
868 void cpu_pages::shrink(void* ptr, size_t new_size) {
869 auto obj_cpu = object_cpu_id(ptr);
870 assert(obj_cpu == cpu_id);
871 page* span = to_page(ptr);
872 if (span->pool) {
873 return;
874 }
875 auto old_size_pages = span->span_size;
876 size_t new_size_pages = old_size_pages;
877 while (new_size_pages / 2 * page_size >= new_size) {
878 new_size_pages /= 2;
879 }
880 if (new_size_pages == old_size_pages) {
881 return;
882 }
883 #ifdef SEASTAR_HEAPPROF
884 auto alloc_site = span->alloc_site;
885 if (alloc_site) {
886 alloc_site->size -= span->span_size * page_size;
887 alloc_site->size += new_size_pages * page_size;
888 }
889 #endif
890 span->span_size = new_size_pages;
891 span[new_size_pages - 1].free = false;
892 span[new_size_pages - 1].span_size = new_size_pages;
893 pageidx idx = span - pages;
894 free_span_unaligned(idx + new_size_pages, old_size_pages - new_size_pages);
895 }
896
897 cpu_pages::~cpu_pages() {
898 live_cpus[cpu_id].store(false, std::memory_order_relaxed);
899 }
900
901 bool cpu_pages::is_initialized() const {
902 return bool(nr_pages);
903 }
904
905 bool cpu_pages::initialize() {
906 if (is_initialized()) {
907 return false;
908 }
909 cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
910 assert(cpu_id < max_cpus);
911 all_cpus[cpu_id] = this;
912 auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
913 auto size = 32 << 20; // Small size for bootstrap
914 auto r = ::mmap(base, size,
915 PROT_READ | PROT_WRITE,
916 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
917 -1, 0);
918 if (r == MAP_FAILED) {
919 abort();
920 }
921 ::madvise(base, size, MADV_HUGEPAGE);
922 pages = reinterpret_cast<page*>(base);
923 memory = base;
924 nr_pages = size / page_size;
925 // we reserve the end page so we don't have to special case
926 // the last span.
927 auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
928 reserved = 1u << log2ceil(reserved);
929 for (pageidx i = 0; i < reserved; ++i) {
930 pages[i].free = false;
931 }
932 pages[nr_pages].free = false;
933 free_span_unaligned(reserved, nr_pages - reserved);
934 live_cpus[cpu_id].store(true, std::memory_order_relaxed);
935 return true;
936 }
937
938 mmap_area
939 allocate_anonymous_memory(compat::optional<void*> where, size_t how_much) {
940 return mmap_anonymous(where.value_or(nullptr),
941 how_much,
942 PROT_READ | PROT_WRITE,
943 MAP_PRIVATE | (where ? MAP_FIXED : 0));
944 }
945
946 mmap_area
947 allocate_hugetlbfs_memory(file_desc& fd, compat::optional<void*> where, size_t how_much) {
948 auto pos = fd.size();
949 fd.truncate(pos + how_much);
950 auto ret = fd.map(
951 how_much,
952 PROT_READ | PROT_WRITE,
953 MAP_SHARED | MAP_POPULATE | (where ? MAP_FIXED : 0),
954 pos,
955 where.value_or(nullptr));
956 return ret;
957 }
958
959 void cpu_pages::replace_memory_backing(allocate_system_memory_fn alloc_sys_mem) {
960 // We would like to use ::mremap() to atomically replace the old anonymous
961 // memory with hugetlbfs backed memory, but mremap() does not support hugetlbfs
962 // (for no reason at all). So we must copy the anonymous memory to some other
963 // place, map hugetlbfs in place, and copy it back, without modifying it during
964 // the operation.
965 auto bytes = nr_pages * page_size;
966 auto old_mem = mem();
967 auto relocated_old_mem = mmap_anonymous(nullptr, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE);
968 std::memcpy(relocated_old_mem.get(), old_mem, bytes);
969 alloc_sys_mem({old_mem}, bytes).release();
970 std::memcpy(old_mem, relocated_old_mem.get(), bytes);
971 }
972
973 void cpu_pages::do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem) {
974 auto new_pages = new_size / page_size;
975 if (new_pages <= nr_pages) {
976 return;
977 }
978 auto old_size = nr_pages * page_size;
979 auto mmap_start = memory + old_size;
980 auto mmap_size = new_size - old_size;
981 auto mem = alloc_sys_mem({mmap_start}, mmap_size);
982 mem.release();
983 ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
984 // one past last page structure is a sentinel
985 auto new_page_array_pages = align_up(sizeof(page[new_pages + 1]), page_size) / page_size;
986 auto new_page_array
987 = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
988 if (!new_page_array) {
989 throw std::bad_alloc();
990 }
991 std::copy(pages, pages + nr_pages, new_page_array);
992 // mark new one-past-last page as taken to avoid boundary conditions
993 new_page_array[new_pages].free = false;
994 auto old_pages = reinterpret_cast<char*>(pages);
995 auto old_nr_pages = nr_pages;
996 auto old_pages_size = align_up(sizeof(page[nr_pages + 1]), page_size);
997 old_pages_size = size_t(1) << log2ceil(old_pages_size);
998 pages = new_page_array;
999 nr_pages = new_pages;
1000 auto old_pages_start = (old_pages - memory) / page_size;
1001 if (old_pages_start == 0) {
1002 // keep page 0 allocated
1003 old_pages_start = 1;
1004 old_pages_size -= page_size;
1005 }
1006 if (old_pages_size != 0) {
1007 free_span_unaligned(old_pages_start, old_pages_size / page_size);
1008 }
1009 free_span_unaligned(old_nr_pages, new_pages - old_nr_pages);
1010 }
1011
1012 void cpu_pages::resize(size_t new_size, allocate_system_memory_fn alloc_memory) {
1013 new_size = align_down(new_size, huge_page_size);
1014 while (nr_pages * page_size < new_size) {
1015 // don't reallocate all at once, since there might not
1016 // be enough free memory available to relocate the pages array
1017 auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
1018 do_resize(tmp_size, alloc_memory);
1019 }
1020 }
1021
1022 reclaiming_result cpu_pages::run_reclaimers(reclaimer_scope scope, size_t n_pages) {
1023 auto target = std::max<size_t>(nr_free_pages + n_pages, min_free_pages);
1024 reclaiming_result result = reclaiming_result::reclaimed_nothing;
1025 while (nr_free_pages < target) {
1026 bool made_progress = false;
1027 ++g_reclaims;
1028 for (auto&& r : reclaimers) {
1029 if (r->scope() >= scope) {
1030 made_progress |= r->do_reclaim((target - nr_free_pages) * page_size) == reclaiming_result::reclaimed_something;
1031 }
1032 }
1033 if (!made_progress) {
1034 return result;
1035 }
1036 result = reclaiming_result::reclaimed_something;
1037 }
1038 return result;
1039 }
1040
1041 void cpu_pages::schedule_reclaim() {
1042 current_min_free_pages = 0;
1043 reclaim_hook([this] {
1044 if (nr_free_pages < min_free_pages) {
1045 try {
1046 run_reclaimers(reclaimer_scope::async, min_free_pages - nr_free_pages);
1047 } catch (...) {
1048 current_min_free_pages = min_free_pages;
1049 throw;
1050 }
1051 }
1052 current_min_free_pages = min_free_pages;
1053 });
1054 }
1055
1056 memory::memory_layout cpu_pages::memory_layout() {
1057 assert(is_initialized());
1058 return {
1059 reinterpret_cast<uintptr_t>(memory),
1060 reinterpret_cast<uintptr_t>(memory) + nr_pages * page_size
1061 };
1062 }
1063
1064 void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1065 reclaim_hook = hook;
1066 current_min_free_pages = min_free_pages;
1067 }
1068
1069 void cpu_pages::set_min_free_pages(size_t pages) {
1070 if (pages > std::numeric_limits<decltype(min_free_pages)>::max()) {
1071 throw std::runtime_error("Number of pages too large");
1072 }
1073 min_free_pages = pages;
1074 maybe_reclaim();
1075 }
1076
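// Choose the span sizes for this object size: "fallback" is the smallest span
// that fits one object; "preferred" additionally keeps per-span waste under
// ~5% (up to a 32-page cap) and holds at least 4 objects. For example,
// assuming 4 KiB pages, 1792-byte objects get fallback = 1 page and
// preferred = 4 pages (16384 % 1792 == 256, ~1.6% waste, 9 objects per span).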
1077 small_pool::small_pool(unsigned object_size) noexcept
1078 : _object_size(object_size) {
1079 unsigned span_size = 1;
1080 auto span_bytes = [&] { return span_size * page_size; };
1081 auto waste = [&] { return (span_bytes() % _object_size) / (1.0 * span_bytes()); };
1082 while (object_size > span_bytes()) {
1083 ++span_size;
1084 }
1085 _span_sizes.fallback = span_size;
1086 span_size = 1;
1087 while (_object_size > span_bytes()
1088 || (span_size < 32 && waste() > 0.05)
1089 || (span_bytes() / object_size < 4)) {
1090 ++span_size;
1091 }
1092 _span_sizes.preferred = span_size;
1093 _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
1094 _min_free = _max_free / 2;
1095 }
1096
1097 small_pool::~small_pool() {
1098 _min_free = _max_free = 0;
1099 trim_free_list();
1100 }
1101
1102 // Should not throw in case of running out of memory to avoid infinite recursion,
// because throwing std::bad_alloc requires allocation. __cxa_allocate_exception
1104 // falls back to the emergency pool in case malloc() returns nullptr.
1105 void*
1106 small_pool::allocate() {
1107 if (!_free) {
1108 add_more_objects();
1109 }
1110 if (!_free) {
1111 return nullptr;
1112 }
1113 auto* obj = _free;
1114 _free = _free->next;
1115 --_free_count;
1116 return obj;
1117 }
1118
1119 void
1120 small_pool::deallocate(void* object) {
1121 auto o = reinterpret_cast<free_object*>(object);
1122 o->next = _free;
1123 _free = o;
1124 ++_free_count;
1125 if (_free_count >= _max_free) {
1126 trim_free_list();
1127 }
1128 }
1129
1130 void
1131 small_pool::add_more_objects() {
1132 auto goal = (_min_free + _max_free) / 2;
1133 while (!_span_list.empty() && _free_count < goal) {
1134 page& span = _span_list.front(cpu_mem.pages);
1135 _span_list.pop_front(cpu_mem.pages);
1136 while (span.freelist) {
1137 auto obj = span.freelist;
1138 span.freelist = span.freelist->next;
1139 obj->next = _free;
1140 _free = obj;
1141 ++_free_count;
1142 ++span.nr_small_alloc;
1143 }
1144 }
1145 while (_free_count < goal) {
1146 disable_backtrace_temporarily dbt;
1147 auto span_size = _span_sizes.preferred;
1148 auto data = reinterpret_cast<char*>(cpu_mem.allocate_large(span_size));
1149 if (!data) {
1150 span_size = _span_sizes.fallback;
1151 data = reinterpret_cast<char*>(cpu_mem.allocate_large(span_size));
1152 if (!data) {
1153 return;
1154 }
1155 }
1156 auto span = cpu_mem.to_page(data);
1157 span_size = span->span_size;
1158 _pages_in_use += span_size;
1159 for (unsigned i = 0; i < span_size; ++i) {
1160 span[i].offset_in_span = i;
1161 span[i].pool = this;
1162 }
1163 span->nr_small_alloc = 0;
1164 span->freelist = nullptr;
1165 for (unsigned offset = 0; offset <= span_size * page_size - _object_size; offset += _object_size) {
1166 auto h = reinterpret_cast<free_object*>(data + offset);
1167 h->next = _free;
1168 _free = h;
1169 ++_free_count;
1170 ++span->nr_small_alloc;
1171 }
1172 }
1173 }
1174
1175 void
1176 small_pool::trim_free_list() {
1177 auto goal = (_min_free + _max_free) / 2;
1178 while (_free && _free_count > goal) {
1179 auto obj = _free;
1180 _free = _free->next;
1181 --_free_count;
1182 page* span = cpu_mem.to_page(obj);
1183 span -= span->offset_in_span;
1184 if (!span->freelist) {
1185 new (&span->link) page_list_link();
1186 _span_list.push_front(cpu_mem.pages, *span);
1187 }
1188 obj->next = span->freelist;
1189 span->freelist = obj;
1190 if (--span->nr_small_alloc == 0) {
1191 _pages_in_use -= span->span_size;
1192 _span_list.erase(cpu_mem.pages, *span);
1193 cpu_mem.free_span(span - cpu_mem.pages, span->span_size);
1194 }
1195 }
1196 }
1197
1198 void
1199 abort_on_underflow(size_t size) {
1200 if (std::make_signed_t<size_t>(size) < 0) {
1201 // probably a logic error, stop hard
1202 abort();
1203 }
1204 }
1205
1206 void* allocate_large(size_t size) {
1207 abort_on_underflow(size);
1208 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1209 if ((size_t(size_in_pages) << page_bits) < size) {
1210 return nullptr; // (size + page_size - 1) caused an overflow
1211 }
1212 return cpu_mem.allocate_large(size_in_pages);
1213
1214 }
1215
1216 void* allocate_large_aligned(size_t align, size_t size) {
1217 abort_on_underflow(size);
1218 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1219 unsigned align_in_pages = std::max(align, page_size) >> page_bits;
1220 return cpu_mem.allocate_large_aligned(align_in_pages, size_in_pages);
1221 }
1222
1223 void free_large(void* ptr) {
1224 return cpu_mem.free_large(ptr);
1225 }
1226
1227 size_t object_size(void* ptr) {
1228 return cpu_pages::all_cpus[object_cpu_id(ptr)]->object_size(ptr);
1229 }
1230
1231 // Mark as cold so that GCC8+ can move to .text.unlikely.
1232 [[gnu::cold]]
1233 static void init_cpu_mem_ptr(cpu_pages*& cpu_mem_ptr) {
1234 cpu_mem_ptr = &cpu_mem;
}
1236
1237 [[gnu::always_inline]]
1238 static inline cpu_pages& get_cpu_mem()
1239 {
1240 // cpu_pages has a non-trivial constructor which means that the compiler
1241 // must make sure the instance local to the current thread has been
1242 // constructed before each access.
1243 // Unfortunately, this means that GCC will emit an unconditional call
// to __tls_init(), which may incur a noticeable overhead in applications
1245 // that are heavy on memory allocations.
1246 // This can be solved by adding an easily predictable branch checking
1247 // whether the object has already been constructed.
1248 static thread_local cpu_pages* cpu_mem_ptr;
1249 if (__builtin_expect(!bool(cpu_mem_ptr), false)) {
1250 init_cpu_mem_ptr(cpu_mem_ptr);
1251 }
1252 return *cpu_mem_ptr;
1253 }
1254
1255 void* allocate(size_t size) {
1256 if (size <= sizeof(free_object)) {
1257 size = sizeof(free_object);
1258 }
1259 void* ptr;
1260 if (size <= max_small_allocation) {
1261 size = object_size_with_alloc_site(size);
1262 ptr = get_cpu_mem().allocate_small(size);
1263 } else {
1264 ptr = allocate_large(size);
1265 }
1266 if (!ptr) {
1267 on_allocation_failure(size);
1268 }
1269 ++g_allocs;
1270 return ptr;
1271 }
1272
1273 void* allocate_aligned(size_t align, size_t size) {
1274 if (size <= sizeof(free_object)) {
1275 size = std::max(sizeof(free_object), align);
1276 }
1277 void* ptr;
1278 if (size <= max_small_allocation && align <= page_size) {
1279 // Our small allocator only guarantees alignment for power-of-two
1280 // allocations which are not larger than a page.
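        // For example (without heap profiling), a 48-byte allocation with
        // 32-byte alignment is rounded up to the 64-byte pool; objects in that
        // pool sit at multiples of 64 bytes within a page-aligned span, so the
        // requested alignment holds.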
1281 size = 1 << log2ceil(object_size_with_alloc_site(size));
1282 ptr = get_cpu_mem().allocate_small(size);
1283 } else {
1284 ptr = allocate_large_aligned(align, size);
1285 }
1286 if (!ptr) {
1287 on_allocation_failure(size);
1288 }
1289 ++g_allocs;
1290 return ptr;
1291 }
1292
1293 void free(void* obj) {
1294 if (get_cpu_mem().try_cross_cpu_free(obj)) {
1295 return;
1296 }
1297 ++g_frees;
1298 get_cpu_mem().free(obj);
1299 }
1300
1301 void free(void* obj, size_t size) {
1302 if (get_cpu_mem().try_cross_cpu_free(obj)) {
1303 return;
1304 }
1305 ++g_frees;
1306 get_cpu_mem().free(obj, size);
1307 }
1308
1309 void free_aligned(void* obj, size_t align, size_t size) {
1310 if (size <= sizeof(free_object)) {
1311 size = sizeof(free_object);
1312 }
1313 free(obj, size);
1314 }
1315
1316 void shrink(void* obj, size_t new_size) {
1317 ++g_frees;
1318 ++g_allocs; // keep them balanced
1319 cpu_mem.shrink(obj, new_size);
1320 }
1321
1322 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1323 cpu_mem.set_reclaim_hook(hook);
1324 }
1325
1326 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope scope)
1327 : reclaimer([reclaim = std::move(reclaim)] (request) {
1328 return reclaim();
1329 }, scope) {
1330 }
1331
1332 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope scope)
1333 : _reclaim(std::move(reclaim))
1334 , _scope(scope) {
1335 cpu_mem.reclaimers.push_back(this);
1336 }
1337
1338 reclaimer::~reclaimer() {
1339 auto& r = cpu_mem.reclaimers;
1340 r.erase(std::find(r.begin(), r.end(), this));
1341 }
1342
1343 void set_large_allocation_warning_threshold(size_t threshold) {
1344 cpu_mem.large_allocation_warning_threshold = threshold;
1345 }
1346
1347 size_t get_large_allocation_warning_threshold() {
1348 return cpu_mem.large_allocation_warning_threshold;
1349 }
1350
1351 void disable_large_allocation_warning() {
1352 cpu_mem.large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
1353 }
1354
1355 void configure(std::vector<resource::memory> m, bool mbind,
1356 optional<std::string> hugetlbfs_path) {
1357 size_t total = 0;
1358 for (auto&& x : m) {
1359 total += x.bytes;
1360 }
1361 allocate_system_memory_fn sys_alloc = allocate_anonymous_memory;
1362 if (hugetlbfs_path) {
1363 // std::function is copyable, but file_desc is not, so we must use
1364 // a shared_ptr to allow sys_alloc to be copied around
1365 auto fdp = make_lw_shared<file_desc>(file_desc::temporary(*hugetlbfs_path));
1366 sys_alloc = [fdp] (optional<void*> where, size_t how_much) {
1367 return allocate_hugetlbfs_memory(*fdp, where, how_much);
1368 };
1369 cpu_mem.replace_memory_backing(sys_alloc);
1370 }
1371 cpu_mem.resize(total, sys_alloc);
1372 size_t pos = 0;
1373 for (auto&& x : m) {
1374 #ifdef SEASTAR_HAVE_NUMA
1375 unsigned long nodemask = 1UL << x.nodeid;
1376 if (mbind) {
1377 auto r = ::mbind(cpu_mem.mem() + pos, x.bytes,
1378 MPOL_PREFERRED,
1379 &nodemask, std::numeric_limits<unsigned long>::digits,
1380 MPOL_MF_MOVE);
1381
1382 if (r == -1) {
1383 char err[1000] = {};
1384 strerror_r(errno, err, sizeof(err));
1385 std::cerr << "WARNING: unable to mbind shard memory; performance may suffer: "
1386 << err << std::endl;
1387 }
1388 }
1389 #endif
1390 pos += x.bytes;
1391 }
1392 }
1393
1394 statistics stats() {
1395 return statistics{g_allocs, g_frees, g_cross_cpu_frees,
1396 cpu_mem.nr_pages * page_size, cpu_mem.nr_free_pages * page_size, g_reclaims, g_large_allocs};
1397 }
1398
1399 bool drain_cross_cpu_freelist() {
1400 return cpu_mem.drain_cross_cpu_freelist();
1401 }
1402
1403 memory_layout get_memory_layout() {
1404 return cpu_mem.memory_layout();
1405 }
1406
1407 size_t min_free_memory() {
1408 return cpu_mem.min_free_pages * page_size;
1409 }
1410
1411 void set_min_free_pages(size_t pages) {
1412 cpu_mem.set_min_free_pages(pages);
1413 }
1414
1415 static thread_local int report_on_alloc_failure_suppressed = 0;
1416
1417 class disable_report_on_alloc_failure_temporarily {
1418 public:
1419 disable_report_on_alloc_failure_temporarily() {
1420 ++report_on_alloc_failure_suppressed;
    }
1422 ~disable_report_on_alloc_failure_temporarily() noexcept {
1423 --report_on_alloc_failure_suppressed;
1424 }
1425 };
1426
1427 static std::atomic<bool> abort_on_allocation_failure{false};
1428
1429 void enable_abort_on_allocation_failure() {
1430 abort_on_allocation_failure.store(true, std::memory_order_seq_cst);
1431 }
1432
1433 void on_allocation_failure(size_t size) {
1434 if (!report_on_alloc_failure_suppressed &&
1435 // report even suppressed failures if trace level is enabled
1436 (seastar_memory_logger.is_enabled(seastar::log_level::trace) ||
1437 (seastar_memory_logger.is_enabled(seastar::log_level::debug) && !abort_on_alloc_failure_suppressed))) {
1438 disable_report_on_alloc_failure_temporarily guard;
1439 seastar_memory_logger.debug("Failed to allocate {} bytes at {}", size, current_backtrace());
1440 auto free_mem = cpu_mem.nr_free_pages * page_size;
1441 auto total_mem = cpu_mem.nr_pages * page_size;
1442 seastar_memory_logger.debug("Used memory: {} Free memory: {} Total memory: {}", total_mem - free_mem, free_mem, total_mem);
1443 seastar_memory_logger.debug("Small pools:");
1444 seastar_memory_logger.debug("objsz spansz usedobj memory wst%");
1445 for (unsigned i = 0; i < cpu_mem.small_pools.nr_small_pools; i++) {
1446 auto& sp = cpu_mem.small_pools[i];
1447 auto use_count = sp._pages_in_use * page_size / sp.object_size() - sp._free_count;
1448 auto memory = sp._pages_in_use * page_size;
1449 auto wasted_percent = memory ? sp._free_count * sp.object_size() * 100.0 / memory : 0;
1450 seastar_memory_logger.debug("{} {} {} {} {}", sp.object_size(), sp._span_sizes.preferred * page_size, use_count, memory, wasted_percent);
1451 }
1452 seastar_memory_logger.debug("Page spans:");
1453 seastar_memory_logger.debug("index size [B] free [B]");
1454 for (unsigned i = 0; i< cpu_mem.nr_span_lists; i++) {
1455 auto& span_list = cpu_mem.free_spans[i];
1456 auto front = span_list._front;
1457 uint32_t total = 0;
1458 while(front) {
1459 auto& span = cpu_mem.pages[front];
1460 total += span.span_size;
1461 front = span.link._next;
1462 }
1463 seastar_memory_logger.debug("{} {} {}", i, (1<<i) * page_size, total * page_size);
1464 }
1465 }
1466
1467 if (!abort_on_alloc_failure_suppressed
1468 && abort_on_allocation_failure.load(std::memory_order_relaxed)) {
1469 seastar_logger.error("Failed to allocate {} bytes", size);
1470 abort();
1471 }
1472 }
1473
1474 static void trigger_error_injector() {
1475 on_alloc_point();
1476 }
1477
1478 static bool try_trigger_error_injector() {
1479 try {
1480 on_alloc_point();
1481 return false;
1482 } catch (...) {
1483 return true;
1484 }
1485 }
1486
1487 }
1488
1489 }
1490
1491 using namespace seastar::memory;
1492
1493 extern "C"
1494 [[gnu::visibility("default")]]
1495 [[gnu::used]]
1496 void* malloc(size_t n) throw () {
1497 if (try_trigger_error_injector()) {
1498 return nullptr;
1499 }
1500 return allocate(n);
1501 }
1502
1503 extern "C"
1504 [[gnu::alias("malloc")]]
1505 [[gnu::visibility("default")]]
1506 [[gnu::malloc]]
1507 [[gnu::alloc_size(1)]]
1508 #ifndef __clang__
1509 [[gnu::leaf]]
1510 #endif
1511 void* __libc_malloc(size_t n) throw ();
1512
1513 extern "C"
1514 [[gnu::visibility("default")]]
1515 [[gnu::used]]
1516 void free(void* ptr) {
1517 if (ptr) {
1518 seastar::memory::free(ptr);
1519 }
1520 }
1521
1522 extern "C"
1523 [[gnu::alias("free")]]
1524 [[gnu::visibility("default")]]
1525 #ifndef __clang__
1526 [[gnu::leaf]]
1527 #endif
1528 void __libc_free(void* obj) throw ();
1529
1530 extern "C"
1531 [[gnu::visibility("default")]]
1532 void* calloc(size_t nmemb, size_t size) {
1533 if (try_trigger_error_injector()) {
1534 return nullptr;
1535 }
1536 auto s1 = __int128(nmemb) * __int128(size);
1537 assert(s1 == size_t(s1));
1538 size_t s = s1;
1539 auto p = malloc(s);
1540 if (p) {
1541 std::memset(p, 0, s);
1542 }
1543 return p;
1544 }
1545
1546 extern "C"
1547 [[gnu::alias("calloc")]]
1548 [[gnu::visibility("default")]]
1549 [[gnu::alloc_size(1, 2)]]
1550 [[gnu::malloc]]
1551 #ifndef __clang__
1552 [[gnu::leaf]]
1553 #endif
1554 void* __libc_calloc(size_t n, size_t m) throw ();
1555
1556 extern "C"
1557 [[gnu::visibility("default")]]
1558 void* realloc(void* ptr, size_t size) {
1559 if (try_trigger_error_injector()) {
1560 return nullptr;
1561 }
1562 auto old_size = ptr ? object_size(ptr) : 0;
1563 if (size == old_size) {
1564 return ptr;
1565 }
1566 if (size == 0) {
1567 ::free(ptr);
1568 return nullptr;
1569 }
1570 if (size < old_size) {
1571 seastar::memory::shrink(ptr, size);
1572 return ptr;
1573 }
1574 auto nptr = malloc(size);
1575 if (!nptr) {
1576 return nptr;
1577 }
1578 if (ptr) {
1579 std::memcpy(nptr, ptr, std::min(size, old_size));
1580 ::free(ptr);
1581 }
1582 return nptr;
1583 }
1584
1585 extern "C"
1586 [[gnu::alias("realloc")]]
1587 [[gnu::visibility("default")]]
1588 [[gnu::alloc_size(2)]]
1589 #ifndef __clang__
1590 [[gnu::leaf]]
1591 #endif
1592 void* __libc_realloc(void* obj, size_t size) throw ();
1593
1594 extern "C"
1595 [[gnu::visibility("default")]]
1596 [[gnu::used]]
1597 #ifndef __clang__
1598 [[gnu::leaf]]
1599 #endif
1600 [[gnu::nonnull(1)]]
1601 int posix_memalign(void** ptr, size_t align, size_t size) throw () {
1602 if (try_trigger_error_injector()) {
1603 return ENOMEM;
1604 }
1605 *ptr = allocate_aligned(align, size);
1606 if (!*ptr) {
1607 return ENOMEM;
1608 }
1609 return 0;
1610 }
1611
1612 extern "C"
1613 [[gnu::alias("posix_memalign")]]
1614 [[gnu::visibility("default")]]
1615 #ifndef __clang__
1616 [[gnu::leaf]]
1617 #endif
1618 [[gnu::nonnull(1)]]
1619 int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();
1620
1621 extern "C"
1622 [[gnu::visibility("default")]]
1623 [[gnu::malloc]]
1624 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
1625 [[gnu::alloc_size(2)]]
1626 #endif
1627 void* memalign(size_t align, size_t size) throw () {
1628 if (try_trigger_error_injector()) {
1629 return nullptr;
1630 }
1631 size = seastar::align_up(size, align);
1632 return allocate_aligned(align, size);
1633 }
1634
1635 extern "C"
1636 [[gnu::visibility("default")]]
1637 void *aligned_alloc(size_t align, size_t size) throw () {
1638 if (try_trigger_error_injector()) {
1639 return nullptr;
1640 }
1641 return allocate_aligned(align, size);
1642 }
1643
1644 extern "C"
1645 [[gnu::alias("memalign")]]
1646 [[gnu::visibility("default")]]
1647 [[gnu::malloc]]
1648 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
1649 [[gnu::alloc_size(2)]]
1650 #endif
1651 void* __libc_memalign(size_t align, size_t size) throw ();
1652
1653 extern "C"
1654 [[gnu::visibility("default")]]
1655 void cfree(void* obj) throw () {
1656 return ::free(obj);
1657 }
1658
1659 extern "C"
1660 [[gnu::alias("cfree")]]
1661 [[gnu::visibility("default")]]
1662 void __libc_cfree(void* obj) throw ();
1663
1664 extern "C"
1665 [[gnu::visibility("default")]]
1666 size_t malloc_usable_size(void* obj) {
1667 return object_size(obj);
1668 }
1669
1670 extern "C"
1671 [[gnu::visibility("default")]]
1672 int malloc_trim(size_t pad) {
1673 return 0;
1674 }
1675
1676 static inline
1677 void* throw_if_null(void* ptr) {
1678 if (!ptr) {
1679 throw std::bad_alloc();
1680 }
1681 return ptr;
1682 }
1683
1684 [[gnu::visibility("default")]]
1685 void* operator new(size_t size) {
1686 trigger_error_injector();
1687 if (size == 0) {
1688 size = 1;
1689 }
1690 return throw_if_null(allocate(size));
1691 }
1692
1693 [[gnu::visibility("default")]]
1694 void* operator new[](size_t size) {
1695 trigger_error_injector();
1696 if (size == 0) {
1697 size = 1;
1698 }
1699 return throw_if_null(allocate(size));
1700 }
1701
1702 [[gnu::visibility("default")]]
1703 void operator delete(void* ptr) throw () {
1704 if (ptr) {
1705 seastar::memory::free(ptr);
1706 }
1707 }
1708
1709 [[gnu::visibility("default")]]
1710 void operator delete[](void* ptr) throw () {
1711 if (ptr) {
1712 seastar::memory::free(ptr);
1713 }
1714 }
1715
1716 [[gnu::visibility("default")]]
1717 void operator delete(void* ptr, size_t size) throw () {
1718 if (ptr) {
1719 seastar::memory::free(ptr, size);
1720 }
1721 }
1722
1723 [[gnu::visibility("default")]]
1724 void operator delete[](void* ptr, size_t size) throw () {
1725 if (ptr) {
1726 seastar::memory::free(ptr, size);
1727 }
1728 }
1729
1730 [[gnu::visibility("default")]]
1731 void* operator new(size_t size, std::nothrow_t) throw () {
1732 if (try_trigger_error_injector()) {
1733 return nullptr;
1734 }
1735 if (size == 0) {
1736 size = 1;
1737 }
1738 return allocate(size);
1739 }
1740
1741 [[gnu::visibility("default")]]
1742 void* operator new[](size_t size, std::nothrow_t) throw () {
1743 if (size == 0) {
1744 size = 1;
1745 }
1746 return allocate(size);
1747 }
1748
1749 [[gnu::visibility("default")]]
1750 void operator delete(void* ptr, std::nothrow_t) throw () {
1751 if (ptr) {
1752 seastar::memory::free(ptr);
1753 }
1754 }
1755
1756 [[gnu::visibility("default")]]
1757 void operator delete[](void* ptr, std::nothrow_t) throw () {
1758 if (ptr) {
1759 seastar::memory::free(ptr);
1760 }
1761 }
1762
1763 [[gnu::visibility("default")]]
1764 void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
1765 if (ptr) {
1766 seastar::memory::free(ptr, size);
1767 }
1768 }
1769
1770 [[gnu::visibility("default")]]
1771 void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
1772 if (ptr) {
1773 seastar::memory::free(ptr, size);
1774 }
1775 }
1776
1777 #ifdef __cpp_aligned_new
1778
1779 [[gnu::visibility("default")]]
1780 void* operator new(size_t size, std::align_val_t a) {
1781 trigger_error_injector();
1782 auto ptr = allocate_aligned(size_t(a), size);
1783 return throw_if_null(ptr);
1784 }
1785
1786 [[gnu::visibility("default")]]
1787 void* operator new[](size_t size, std::align_val_t a) {
1788 trigger_error_injector();
1789 auto ptr = allocate_aligned(size_t(a), size);
1790 return throw_if_null(ptr);
1791 }
1792
1793 [[gnu::visibility("default")]]
1794 void* operator new(size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
1795 if (try_trigger_error_injector()) {
1796 return nullptr;
1797 }
1798 return allocate_aligned(size_t(a), size);
1799 }
1800
1801 [[gnu::visibility("default")]]
1802 void* operator new[](size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
1803 if (try_trigger_error_injector()) {
1804 return nullptr;
1805 }
1806 return allocate_aligned(size_t(a), size);
1807 }
1808
1809
1810 [[gnu::visibility("default")]]
1811 void operator delete(void* ptr, std::align_val_t a) noexcept {
1812 if (ptr) {
1813 seastar::memory::free(ptr);
1814 }
1815 }
1816
1817 [[gnu::visibility("default")]]
1818 void operator delete[](void* ptr, std::align_val_t a) noexcept {
1819 if (ptr) {
1820 seastar::memory::free(ptr);
1821 }
1822 }
1823
1824 [[gnu::visibility("default")]]
1825 void operator delete(void* ptr, size_t size, std::align_val_t a) noexcept {
1826 if (ptr) {
1827 seastar::memory::free_aligned(ptr, size_t(a), size);
1828 }
1829 }
1830
1831 [[gnu::visibility("default")]]
1832 void operator delete[](void* ptr, size_t size, std::align_val_t a) noexcept {
1833 if (ptr) {
1834 seastar::memory::free_aligned(ptr, size_t(a), size);
1835 }
1836 }
1837
1838 [[gnu::visibility("default")]]
1839 void operator delete(void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
1840 if (ptr) {
1841 seastar::memory::free(ptr);
1842 }
1843 }
1844
1845 [[gnu::visibility("default")]]
1846 void operator delete[](void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
1847 if (ptr) {
1848 seastar::memory::free(ptr);
1849 }
1850 }
1851
1852 #endif
1853
1854 namespace seastar {
1855
1856 #else
1857
1858 namespace seastar {
1859
1860 namespace memory {
1861
1862 void set_heap_profiling_enabled(bool enabled) {
1863 seastar_logger.warn("Seastar compiled with default allocator, heap profiler not supported");
1864 }
1865
1866 scoped_heap_profiling::scoped_heap_profiling() noexcept {
1867 set_heap_profiling_enabled(true); // let it print the warning
1868 }
1869
1870 scoped_heap_profiling::~scoped_heap_profiling() {
1871 }
1872
1873 void enable_abort_on_allocation_failure() {
1874 seastar_logger.warn("Seastar compiled with default allocator, will not abort on bad_alloc");
1875 }
1876
1877 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope) {
1878 }
1879
1880 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope) {
1881 }
1882
1883 reclaimer::~reclaimer() {
1884 }
1885
1886 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1887 }
1888
1889 void configure(std::vector<resource::memory> m, bool mbind, compat::optional<std::string> hugepages_path) {
1890 }
1891
1892 statistics stats() {
1893 return statistics{0, 0, 0, 1 << 30, 1 << 30, 0, 0};
1894 }
1895
1896 bool drain_cross_cpu_freelist() {
1897 return false;
1898 }
1899
1900 memory_layout get_memory_layout() {
1901 throw std::runtime_error("get_memory_layout() not supported");
1902 }
1903
1904 size_t min_free_memory() {
1905 return 0;
1906 }
1907
1908 void set_min_free_pages(size_t pages) {
1909 // Ignore, reclaiming not supported for default allocator.
1910 }
1911
1912 void set_large_allocation_warning_threshold(size_t) {
1913 // Ignore, not supported for default allocator.
1914 }
1915
1916 size_t get_large_allocation_warning_threshold() {
1917 // Ignore, not supported for default allocator.
1918 return std::numeric_limits<size_t>::max();
1919 }
1920
1921 void disable_large_allocation_warning() {
1922 // Ignore, not supported for default allocator.
1923 }
1924
1925 }
1926
1927 }
1928
1929 namespace seastar {
1930
1931 #endif
1932
1933 /// \endcond
1934
1935 }