1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
20 */
21
22
23 /// \cond internal
24
25 //
26 // Seastar memory allocator
27 //
28 // This is a share-nothing allocator (memory allocated on one cpu must
29 // be freed on the same cpu).
30 //
31 // Inspired by gperftools' tcmalloc.
32 //
33 // Memory map:
34 //
35 // 0x0000'sccc'vvvv'vvvv
36 //
37 // 0000 - required by architecture (only 48 bits of address space)
38 // s - chosen to satisfy system allocator (1-7)
39 // ccc - cpu number (0-12 bits; how many bits are allocated varies by system)
40 // v - virtual address within cpu (32-44 bits, according to how much ccc
41 // leaves us)
42 //
43 // Each page has a page structure that describes it. Within a cpu's
44 // memory pool, the page array starts at offset 0, describing all pages
45 // within that pool. Page 0 does not describe a valid page.
46 //
47 // Each pool can contain at most 2^32 pages (or 44 address bits), so we can
48 // use a 32-bit integer to identify a page.
49 //
50 // Runs of pages are organized into spans. Free spans are organized into lists,
51 // by size. When spans are broken up or coalesced, they may move into new lists.
52 // Spans have a size that is a power-of-two and are naturally aligned (aka buddy
53 // allocator)
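//
// Illustrative decoding sketch (not part of the build): with cpu_id_shift == 36
// as defined below, object_cpu_id() recovers the owning cpu from the raw
// pointer bits:
//
//     uintptr_t raw = reinterpret_cast<uintptr_t>(ptr);
//     unsigned cpu = (raw >> 36) & 0xff;   // the 'ccc' field above
//
// e.g. an address such as 0x0000'5030'0000'1000 (s == 5) decodes to cpu 3.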
54
55 #include <seastar/core/cacheline.hh>
56 #include <seastar/core/memory.hh>
57 #include <seastar/core/print.hh>
58 #include <seastar/util/alloc_failure_injector.hh>
59 #include <seastar/util/memory_diagnostics.hh>
60 #include <seastar/util/std-compat.hh>
61 #include <seastar/util/log.hh>
62 #include <seastar/core/aligned_buffer.hh>
63 #include <unordered_set>
64 #include <iostream>
65 #include <thread>
66
67 #include <dlfcn.h>
68
69 namespace seastar {
70
71 extern seastar::logger seastar_logger;
72
73 void* internal::allocate_aligned_buffer_impl(size_t size, size_t align) {
74 void *ret;
75 auto r = posix_memalign(&ret, align, size);
76 if (r == ENOMEM) {
77 throw std::bad_alloc();
78 } else if (r == EINVAL) {
79 throw std::runtime_error(format("Invalid alignment of {:d}; allocating {:d} bytes", align, size));
80 } else {
81 assert(r == 0);
82 return ret;
83 }
84 }
85
86 namespace memory {
87
88 // We always create the logger object for memory diagnostics, even
89 // in SEASTAR_DEFAULT_ALLOCATOR builds, though it only logs when the
90 // seastar allocator is enabled.
91 seastar::logger seastar_memory_logger("seastar_memory");
92
93 static thread_local int abort_on_alloc_failure_suppressed = 0;
94
95 disable_abort_on_alloc_failure_temporarily::disable_abort_on_alloc_failure_temporarily() {
96 ++abort_on_alloc_failure_suppressed;
97 }
98
99 disable_abort_on_alloc_failure_temporarily::~disable_abort_on_alloc_failure_temporarily() noexcept {
100 --abort_on_alloc_failure_suppressed;
101 }
102
103 static std::pmr::polymorphic_allocator<char> static_malloc_allocator{std::pmr::get_default_resource()};
104 std::pmr::polymorphic_allocator<char>* malloc_allocator{&static_malloc_allocator};
105
106 namespace internal {
107
108 #ifdef __cpp_constinit
109 #define SEASTAR_CONSTINIT constinit
110 #else
111 #define SEASTAR_CONSTINIT
112 #endif
113
114 #ifdef SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
115
116 #ifdef __cpp_constinit
117 thread_local constinit volatile int critical_alloc_section = 0;
118 #else
119 __thread volatile int critical_alloc_section = 0;
120 #endif
121
122 #endif // SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
123
124 } // namespace internal
125
126 }
127
128 }
129
130 #ifndef SEASTAR_DEFAULT_ALLOCATOR
131
132 #include <seastar/core/bitops.hh>
133 #include <seastar/core/align.hh>
134 #include <seastar/core/posix.hh>
135 #include <seastar/core/shared_ptr.hh>
136 #include <new>
137 #include <cstdint>
138 #include <algorithm>
139 #include <limits>
140 #include <cassert>
141 #include <atomic>
142 #include <mutex>
143 #include <seastar/util/std-compat.hh>
144 #include <functional>
145 #include <cstring>
146 #include <boost/intrusive/list.hpp>
147 #include <sys/mman.h>
148 #include <seastar/util/backtrace.hh>
149
150 #ifdef SEASTAR_HAVE_NUMA
151 #include <numaif.h>
152 #endif
153
154 namespace seastar {
155
156 struct allocation_site {
157 mutable size_t count = 0; // number of live objects allocated at backtrace.
158 mutable size_t size = 0; // amount of bytes in live objects allocated at backtrace.
159 mutable const allocation_site* next = nullptr;
160 saved_backtrace backtrace;
161
162 bool operator==(const allocation_site& o) const {
163 return backtrace == o.backtrace;
164 }
165
166 bool operator!=(const allocation_site& o) const {
167 return !(*this == o);
168 }
169 };
170
171 }
172
173 namespace std {
174
175 template<>
176 struct hash<seastar::allocation_site> {
177 size_t operator()(const seastar::allocation_site& bi) const {
178 return std::hash<seastar::saved_backtrace>()(bi.backtrace);
179 }
180 };
181
182 }
183
184 #if FMT_VERSION >= 90000
185 namespace seastar::memory {
186 struct human_readable_value;
187 }
188 template <> struct fmt::formatter<struct seastar::memory::human_readable_value> : fmt::ostream_formatter {};
189 #endif
190
191 namespace seastar {
192
193 using allocation_site_ptr = const allocation_site*;
194
195 namespace memory {
196
197 [[gnu::unused]]
198 static allocation_site_ptr get_allocation_site();
199
200 static void on_allocation_failure(size_t size);
201
202 static constexpr unsigned cpu_id_shift = 36; // FIXME: make dynamic
203 static constexpr unsigned max_cpus = 256;
204 static constexpr uintptr_t cpu_id_and_mem_base_mask = ~((uintptr_t(1) << cpu_id_shift) - 1);
205
206 using pageidx = uint32_t;
207
208 struct page;
209 class page_list;
210
211 static std::atomic<bool> live_cpus[max_cpus];
212
213 using std::optional;
214
215 // is_reactor_thread is set to true when memory::configure() is called.
216 // It identifies seastar reactor threads, so that allocations made on
217 // other (non-reactor) threads can be routed to the system memory allocator.
218 static thread_local bool is_reactor_thread = false;
219
220
221 namespace alloc_stats {
222
223 enum class types { allocs, frees, cross_cpu_frees, reclaims, large_allocs, failed_allocs,
224 foreign_mallocs, foreign_frees, foreign_cross_frees, enum_size };
225
226 using stats_array = std::array<uint64_t, static_cast<std::size_t>(types::enum_size)>;
227 using stats_atomic_array = std::array<std::atomic_uint64_t, static_cast<std::size_t>(types::enum_size)>;
228
229 static thread_local SEASTAR_CONSTINIT stats_array stats{};
230 std::array<stats_atomic_array, max_cpus> alien_stats{};
231
232 static void increment_local(types stat_type, uint64_t size = 1) {
233 stats[static_cast<std::size_t>(stat_type)] += size;
234 }
235
236 static void increment(types stat_type, uint64_t size=1)
237 {
238 // fast path: reactor threads use thread-local statistics
239 if (is_reactor_thread) {
240 increment_local(stat_type, size);
241 } else {
242 auto hash = std::hash<std::thread::id>()(std::this_thread::get_id());
243 auto i = static_cast<std::size_t>(stat_type);
244 alien_stats[hash % alien_stats.size()][i].fetch_add(size, std::memory_order_relaxed);
245 }
246 }
247
248 static uint64_t get(types stat_type)
249 {
250 auto i = static_cast<std::size_t>(stat_type);
251 // fast path: reactor threads use thread-local statistics
252 if (is_reactor_thread) {
253 return stats[i];
254 } else {
255 auto hash = std::hash<std::thread::id>()(std::this_thread::get_id());
256 return alien_stats[hash % alien_stats.size()][i].load();
257 }
258 }
259
260 }
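// Summary of the routing above (for reference): reactor threads bump the
// thread-local `stats` array directly, while non-reactor ("alien") threads
// hash their std::thread::id into one of the max_cpus buckets of alien_stats
// and use relaxed atomic adds, roughly:
//
//     auto bucket = std::hash<std::thread::id>()(std::this_thread::get_id()) % max_cpus;
//     alien_stats[bucket][i].fetch_add(size, std::memory_order_relaxed);
//
// Several alien threads may share a bucket, so alien counters are aggregates
// per bucket rather than per thread.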
261
262 // original memory allocator support
263 // note: allocations made before these initializers run would use the seastar allocator
264 using malloc_func_type = void * (*)(size_t);
265 using free_func_type = void * (*)(void *);
266 using realloc_func_type = void * (*)(void *, size_t);
267 using aligned_alloc_type = void * (*)(size_t alignment, size_t size);
268 using malloc_trim_type = int (*)(size_t);
269 using malloc_usable_size_type = size_t (*)(void *);
270
271 malloc_func_type original_malloc_func = reinterpret_cast<malloc_func_type>(dlsym(RTLD_NEXT, "malloc"));
272 free_func_type original_free_func = reinterpret_cast<free_func_type>(dlsym(RTLD_NEXT, "free"));
273 realloc_func_type original_realloc_func = reinterpret_cast<realloc_func_type>(dlsym(RTLD_NEXT, "realloc"));
274 aligned_alloc_type original_aligned_alloc_func = reinterpret_cast<aligned_alloc_type>(dlsym(RTLD_NEXT, "aligned_alloc"));
275 malloc_trim_type original_malloc_trim_func = reinterpret_cast<malloc_trim_type>(dlsym(RTLD_NEXT, "malloc_trim"));
276 malloc_usable_size_type original_malloc_usable_size_func = reinterpret_cast<malloc_usable_size_type>(dlsym(RTLD_NEXT, "malloc_usable_size"));
277
278 using allocate_system_memory_fn
279 = std::function<mmap_area (void* where, size_t how_much)>;
280
281 namespace bi = boost::intrusive;
282
283 static thread_local uintptr_t local_expected_cpu_id = std::numeric_limits<uintptr_t>::max();
284
285 inline
286 unsigned object_cpu_id(const void* ptr) {
287 return (reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff;
288 }
289
290 class page_list_link {
291 uint32_t _prev;
292 uint32_t _next;
293 friend class page_list;
294 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
295 };
296
297 constexpr size_t mem_base_alloc = size_t(1) << 44;
298
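// mem_base() lazily reserves the virtual address range shared by all cpus.
// Since mmap() cannot be asked for a specific alignment, it over-reserves
// twice mem_base_alloc with PROT_NONE, then munmap()s the unaligned head and
// tail, leaving a mem_base_alloc-sized reservation aligned to mem_base_alloc.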
299 static char* mem_base() {
300 static char* known;
301 static std::once_flag flag;
302 std::call_once(flag, [] {
303 auto r = ::mmap(NULL, 2 * mem_base_alloc,
304 PROT_NONE,
305 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
306 -1, 0);
307 if (r == MAP_FAILED) {
308 abort();
309 }
310 ::madvise(r, 2 * mem_base_alloc, MADV_DONTDUMP);
311 auto cr = reinterpret_cast<char*>(r);
312 known = align_up(cr, mem_base_alloc);
313 ::munmap(cr, known - cr);
314 ::munmap(known + mem_base_alloc, cr + 2 * mem_base_alloc - (known + mem_base_alloc));
315 });
316 return known;
317 }
318
319 bool is_seastar_memory(void * ptr)
320 {
321 auto begin = mem_base();
322 auto end = begin + mem_base_alloc;
323 return ptr >= begin && ptr < end;
324 }
325
326 constexpr bool is_page_aligned(size_t size) {
327 return (size & (page_size - 1)) == 0;
328 }
329
330 constexpr size_t next_page_aligned(size_t size) {
331 return (size + (page_size - 1)) & ~(page_size - 1);
332 }
333
334 class small_pool;
335
336 struct free_object {
337 free_object* next;
338 };
339
340 struct page {
341 bool free;
342 uint8_t offset_in_span;
343 uint16_t nr_small_alloc;
344 uint32_t span_size; // in pages, if we're the head or the tail
345 page_list_link link;
346 small_pool* pool; // if used in a small_pool
347 free_object* freelist;
348 #ifdef SEASTAR_HEAPPROF
349 allocation_site_ptr alloc_site; // for objects whose size is multiple of page size, valid for head only
350 #endif
351 };
352
353 class page_list {
354 uint32_t _front = 0;
355 uint32_t _back = 0;
356 public:
357 page& front(page* ary) { return ary[_front]; }
358 page& back(page* ary) { return ary[_back]; }
359 bool empty() const { return !_front; }
360 void erase(page* ary, page& span) {
361 if (span.link._next) {
362 ary[span.link._next].link._prev = span.link._prev;
363 } else {
364 _back = span.link._prev;
365 }
366 if (span.link._prev) {
367 ary[span.link._prev].link._next = span.link._next;
368 } else {
369 _front = span.link._next;
370 }
371 }
372 void push_front(page* ary, page& span) {
373 auto idx = &span - ary;
374 if (_front) {
375 ary[_front].link._prev = idx;
376 } else {
377 _back = idx;
378 }
379 span.link._next = _front;
380 span.link._prev = 0;
381 _front = idx;
382 }
383 void pop_front(page* ary) {
384 if (ary[_front].link._next) {
385 ary[ary[_front].link._next].link._prev = 0;
386 } else {
387 _back = 0;
388 }
389 _front = ary[_front].link._next;
390 }
391 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
392 };
393
394 class small_pool {
395 struct span_sizes {
396 uint8_t preferred;
397 uint8_t fallback;
398 };
399 unsigned _object_size;
400 span_sizes _span_sizes;
401 free_object* _free = nullptr;
402 size_t _free_count = 0;
403 unsigned _min_free;
404 unsigned _max_free;
405 unsigned _pages_in_use = 0;
406 page_list _span_list;
407 static constexpr unsigned idx_frac_bits = 2;
408 public:
409 explicit small_pool(unsigned object_size) noexcept;
410 ~small_pool();
411 void* allocate();
412 void deallocate(void* object);
413 unsigned object_size() const { return _object_size; }
414 bool objects_page_aligned() const { return is_page_aligned(_object_size); }
415 static constexpr unsigned size_to_idx(unsigned size);
416 static constexpr unsigned idx_to_size(unsigned idx);
417 allocation_site_ptr& alloc_site_holder(void* ptr);
418 private:
419 void add_more_objects();
420 void trim_free_list();
421 friend seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator);
422 };
423
424 // example: index 0b0001'0011 (19) -> size (1 << 4) + (0b11 << (4 - 2)) = 28
425
426 constexpr unsigned
427 small_pool::idx_to_size(unsigned idx) {
428 size_t s = (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
429 << (idx >> idx_frac_bits))
430 >> idx_frac_bits;
431 // If size is larger than max_align_t, force it to be a multiple of
432 // max_align_t. Clang relies on this property to use aligned mov
433 // instructions (e.g. movaps)
434 //
435 // Note this function is used at initialization time only, so it doesn't
436 // need to be especially fast.
437 if (s > alignof(std::max_align_t)) {
438 s = align_up(s, alignof(std::max_align_t));
439 }
440 return s;
441 }
442
443 constexpr unsigned
444 small_pool::size_to_idx(unsigned size) {
445 return ((log2floor(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
446 + ((size - 1) >> (log2floor(size) - idx_frac_bits));
447 }
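// Worked example of the size-class encoding above (with idx_frac_bits == 2,
// each power-of-two range is split into four classes):
//
//     idx 28 -> 128 bytes    idx 29 -> 160 bytes
//     idx 30 -> 192 bytes    idx 31 -> 224 bytes
//     idx 32 -> 256 bytes
//
// so size_to_idx() maps any size in (128, 160] to idx 29, any size in
// (160, 192] to idx 30, and so on.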
448
449 class small_pool_array {
450 public:
451 static constexpr unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
452 private:
453 union u {
454 small_pool a[nr_small_pools];
455 u() {
456 for (unsigned i = 0; i < nr_small_pools; ++i) {
457 new (&a[i]) small_pool(small_pool::idx_to_size(i));
458 }
459 }
460 ~u() {
461 // cannot really call destructor, since other
462 // objects may be freed after we are gone.
463 }
464 } _u;
465 public:
466 small_pool& operator[](unsigned idx) { return _u.a[idx]; }
467 };
468
469 static constexpr size_t max_small_allocation
470 = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);
471
472 constexpr size_t object_size_with_alloc_site(size_t size) {
473 #ifdef SEASTAR_HEAPPROF
474 // For page-aligned sizes, allocation_site* lives in page::alloc_site, not with the object.
475 static_assert(is_page_aligned(max_small_allocation), "assuming that max_small_allocation is page aligned so that we"
476 " don't need to add allocation_site_ptr to objects of size close to it");
477 size_t next_page_aligned_size = next_page_aligned(size);
478 if (next_page_aligned_size - size > sizeof(allocation_site_ptr)) {
479 size += sizeof(allocation_site_ptr);
480 } else {
481 return next_page_aligned_size;
482 }
483 #endif
484 return size;
485 }
486
487 #ifdef SEASTAR_HEAPPROF
488 // Ensure that object_size_with_alloc_site() does not exceed max_small_allocation
489 static_assert(object_size_with_alloc_site(max_small_allocation) == max_small_allocation, "");
490 static_assert(object_size_with_alloc_site(max_small_allocation - 1) == max_small_allocation, "");
491 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) + 1) == max_small_allocation, "");
492 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr)) == max_small_allocation, "");
493 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 1) == max_small_allocation - 1, "");
494 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 2) == max_small_allocation - 2, "");
495 #endif
496
497 struct cross_cpu_free_item {
498 cross_cpu_free_item* next;
499 };
500
501 struct cpu_pages {
502 uint32_t min_free_pages = 20000000 / page_size;
503 char* memory;
504 page* pages;
505 uint32_t nr_pages;
506 uint32_t nr_free_pages;
507 uint32_t current_min_free_pages = 0;
508 size_t large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
509 unsigned cpu_id = -1U;
510 std::function<void (std::function<void ()>)> reclaim_hook;
511 std::vector<reclaimer*> reclaimers;
512 static constexpr unsigned nr_span_lists = 32;
513 page_list free_spans[nr_span_lists]; // contains aligned spans with span_size == 2^idx
514 small_pool_array small_pools;
515 alignas(seastar::cache_line_size) std::atomic<cross_cpu_free_item*> xcpu_freelist;
516 static std::atomic<unsigned> cpu_id_gen;
517 static cpu_pages* all_cpus[max_cpus];
518 union asu {
519 using alloc_sites_type = std::unordered_set<allocation_site>;
520 asu() : alloc_sites{} {
521 }
522 ~asu() {} // alloc_sites live forever
523 alloc_sites_type alloc_sites;
524 } asu;
525 allocation_site_ptr alloc_site_list_head = nullptr; // For easy traversal of asu.alloc_sites from scylla-gdb.py
526 bool collect_backtrace = false;
527 char* mem() { return memory; }
528
529 void link(page_list& list, page* span);
530 void unlink(page_list& list, page* span);
531 struct trim {
532 unsigned offset;
533 unsigned nr_pages;
534 };
535 void maybe_reclaim();
536 void* allocate_large_and_trim(unsigned nr_pages);
537 void* allocate_large(unsigned nr_pages);
538 void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
539 page* find_and_unlink_span(unsigned nr_pages);
540 page* find_and_unlink_span_reclaiming(unsigned n_pages);
541 void free_large(void* ptr);
542 bool grow_span(pageidx& start, uint32_t& nr_pages, unsigned idx);
543 void free_span(pageidx start, uint32_t nr_pages);
544 void free_span_no_merge(pageidx start, uint32_t nr_pages);
545 void free_span_unaligned(pageidx start, uint32_t nr_pages);
546 void* allocate_small(unsigned size);
547 void free(void* ptr);
548 void free(void* ptr, size_t size);
549 static bool try_foreign_free(void* ptr);
550 void shrink(void* ptr, size_t new_size);
551 static void free_cross_cpu(unsigned cpu_id, void* ptr);
552 bool drain_cross_cpu_freelist();
553 size_t object_size(void* ptr);
554 page* to_page(void* p) {
555 return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
556 }
557
558 bool is_initialized() const;
559 bool initialize();
560 reclaiming_result run_reclaimers(reclaimer_scope, size_t pages_to_reclaim);
561 void schedule_reclaim();
562 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
563 void set_min_free_pages(size_t pages);
564 void resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
565 void do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
566 void replace_memory_backing(allocate_system_memory_fn alloc_sys_mem);
567 void check_large_allocation(size_t size);
568 void warn_large_allocation(size_t size);
569 memory::memory_layout memory_layout();
570 ~cpu_pages();
571 };
572
573 static thread_local cpu_pages cpu_mem;
574 std::atomic<unsigned> cpu_pages::cpu_id_gen;
575 cpu_pages* cpu_pages::all_cpus[max_cpus];
576
577 static cpu_pages& get_cpu_mem();
578
579 #ifdef SEASTAR_HEAPPROF
580
581 void set_heap_profiling_enabled(bool enable) {
582 bool is_enabled = get_cpu_mem().collect_backtrace;
583 if (enable) {
584 if (!is_enabled) {
585 seastar_logger.info("Enabling heap profiler");
586 }
587 } else {
588 if (is_enabled) {
589 seastar_logger.info("Disabling heap profiler");
590 }
591 }
592 get_cpu_mem().collect_backtrace = enable;
593 }
594
595 static thread_local int64_t scoped_heap_profiling_embed_count = 0;
596
597 scoped_heap_profiling::scoped_heap_profiling() noexcept {
598 ++scoped_heap_profiling_embed_count;
599 set_heap_profiling_enabled(true);
600 }
601
602 scoped_heap_profiling::~scoped_heap_profiling() {
603 if (!--scoped_heap_profiling_embed_count) {
604 set_heap_profiling_enabled(false);
605 }
606 }
607
608 #else
609
610 void set_heap_profiling_enabled(bool enable) {
611 seastar_logger.warn("Seastar compiled without heap profiling support, heap profiler not supported;"
612 " compile with the Seastar_HEAP_PROFILING=ON CMake option to add heap profiling support");
613 }
614
615 scoped_heap_profiling::scoped_heap_profiling() noexcept {
616 set_heap_profiling_enabled(true); // let it print the warning
617 }
618
619 scoped_heap_profiling::~scoped_heap_profiling() {
620 }
621
622 #endif
623
624 // Smallest index i such that all spans stored in the index are >= pages.
625 static inline
626 unsigned index_of(unsigned pages) {
627 if (pages == 1) {
628 return 0;
629 }
630 return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages - 1);
631 }
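// For example: index_of(1) == 0, index_of(3) == 2 and index_of(4) == 2, so a
// request for 3 pages is served from free_spans[2], whose spans are all
// 4 pages long.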
632
633 void
634 cpu_pages::unlink(page_list& list, page* span) {
635 list.erase(pages, *span);
636 }
637
638 void
639 cpu_pages::link(page_list& list, page* span) {
640 list.push_front(pages, *span);
641 }
642
643 void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
644 assert(nr_pages);
645 nr_free_pages += nr_pages;
646 auto span = &pages[span_start];
647 auto span_end = &pages[span_start + nr_pages - 1];
648 span->free = span_end->free = true;
649 span->span_size = span_end->span_size = nr_pages;
650 auto idx = index_of(nr_pages);
651 link(free_spans[idx], span);
652 }
653
654 bool cpu_pages::grow_span(uint32_t& span_start, uint32_t& nr_pages, unsigned idx) {
655 auto which = (span_start >> idx) & 1; // 0=lower, 1=upper
656 // locate first page of upper buddy or last page of lower buddy
657 // examples: span_start = 0x10 nr_pages = 0x08 -> buddy = 0x18 (which = 0)
658 // span_start = 0x18 nr_pages = 0x08 -> buddy = 0x17 (which = 1)
659 // delta = which ? -1u : nr_pages
660 auto delta = ((which ^ 1) << idx) | -which;
661 auto buddy = span_start + delta;
662 if (pages[buddy].free && pages[buddy].span_size == nr_pages) {
663 unlink(free_spans[idx], &pages[span_start ^ nr_pages]);
664 nr_free_pages -= nr_pages; // free_span_no_merge() will restore
665 span_start &= ~nr_pages;
666 nr_pages *= 2;
667 return true;
668 }
669 return false;
670 }
671
672 void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
673 auto idx = index_of(nr_pages);
674 while (grow_span(span_start, nr_pages, idx)) {
675 ++idx;
676 }
677 free_span_no_merge(span_start, nr_pages);
678 }
679
680 // Internal, used during startup. Span is not aligned so needs to be broken up
681 void cpu_pages::free_span_unaligned(uint32_t span_start, uint32_t nr_pages) {
682 while (nr_pages) {
683 auto start_nr_bits = span_start ? count_trailing_zeros(span_start) : 32;
684 auto size_nr_bits = count_trailing_zeros(nr_pages);
685 auto now = 1u << std::min(start_nr_bits, size_nr_bits);
686 free_span(span_start, now);
687 span_start += now;
688 nr_pages -= now;
689 }
690 }
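// Illustrative trace of the loop above: free_span_unaligned(5, 11) frees the
// naturally aligned pieces [5, 6) (1 page), [6, 8) (2 pages) and [8, 16)
// (8 pages); each iteration takes the largest power of two permitted by both
// the current start alignment and the remaining page count.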
691
692 page*
693 cpu_pages::find_and_unlink_span(unsigned n_pages) {
694 auto idx = index_of(n_pages);
695 if (n_pages >= (2u << idx)) {
696 return nullptr;
697 }
698 while (idx < nr_span_lists && free_spans[idx].empty()) {
699 ++idx;
700 }
701 if (idx == nr_span_lists) {
702 if (initialize()) {
703 return find_and_unlink_span(n_pages);
704 }
705 return nullptr;
706 }
707 auto& list = free_spans[idx];
708 page* span = &list.front(pages);
709 unlink(list, span);
710 return span;
711 }
712
713 page*
714 cpu_pages::find_and_unlink_span_reclaiming(unsigned n_pages) {
715 while (true) {
716 auto span = find_and_unlink_span(n_pages);
717 if (span) {
718 return span;
719 }
720 if (run_reclaimers(reclaimer_scope::sync, n_pages) == reclaiming_result::reclaimed_nothing) {
721 return nullptr;
722 }
723 }
724 }
725
726 void cpu_pages::maybe_reclaim() {
727 if (nr_free_pages < current_min_free_pages) {
728 drain_cross_cpu_freelist();
729 if (nr_free_pages < current_min_free_pages) {
730 run_reclaimers(reclaimer_scope::sync, current_min_free_pages - nr_free_pages);
731 }
732 if (nr_free_pages < current_min_free_pages) {
733 schedule_reclaim();
734 }
735 }
736 }
737
738 void*
739 cpu_pages::allocate_large_and_trim(unsigned n_pages) {
740 // Avoid exercising the reclaimers for requests we'll not be able to satisfy
741 // nr_pages might be zero during startup, so check for that too
742 if (nr_pages && n_pages >= nr_pages) {
743 return nullptr;
744 }
745 page* span = find_and_unlink_span_reclaiming(n_pages);
746 if (!span) {
747 return nullptr;
748 }
749 auto span_size = span->span_size;
750 auto span_idx = span - pages;
751 nr_free_pages -= span->span_size;
752 while (span_size >= n_pages * 2) {
753 span_size /= 2;
754 auto other_span_idx = span_idx + span_size;
755 free_span_no_merge(other_span_idx, span_size);
756 }
757 auto span_end = &pages[span_idx + span_size - 1];
758 span->free = span_end->free = false;
759 span->span_size = span_end->span_size = span_size;
760 span->pool = nullptr;
761 #ifdef SEASTAR_HEAPPROF
762 auto alloc_site = get_allocation_site();
763 span->alloc_site = alloc_site;
764 if (alloc_site) {
765 ++alloc_site->count;
766 alloc_site->size += span->span_size * page_size;
767 }
768 #endif
769 maybe_reclaim();
770 return mem() + span_idx * page_size;
771 }
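// Illustrative trace of the trimming loop above: a 3-page request satisfied
// from a 16-page span returns the upper 8 pages and then the upper 4 pages
// to the free lists, and hands out the remaining 4-page span, the smallest
// power-of-two span that still covers the request.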
772
773 void
774 cpu_pages::warn_large_allocation(size_t size) {
775 alloc_stats::increment_local(alloc_stats::types::large_allocs);
776 seastar_memory_logger.warn("oversized allocation: {} bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at {}", size, current_backtrace());
777 large_allocation_warning_threshold *= 1.618; // prevent spam
778 }
779
780 void
781 inline
782 cpu_pages::check_large_allocation(size_t size) {
783 if (size >= large_allocation_warning_threshold) {
784 warn_large_allocation(size);
785 }
786 }
787
788 void*
789 cpu_pages::allocate_large(unsigned n_pages) {
790 check_large_allocation(n_pages * page_size);
791 return allocate_large_and_trim(n_pages);
792 }
793
794 void*
795 cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
796 check_large_allocation(n_pages * page_size);
797 // buddy allocation is always aligned
798 return allocate_large_and_trim(n_pages);
799 }
800
801 disable_backtrace_temporarily::disable_backtrace_temporarily() {
802 _old = get_cpu_mem().collect_backtrace;
803 get_cpu_mem().collect_backtrace = false;
804 }
805
806 disable_backtrace_temporarily::~disable_backtrace_temporarily() {
807 get_cpu_mem().collect_backtrace = _old;
808 }
809
810 static
811 saved_backtrace get_backtrace() noexcept {
812 disable_backtrace_temporarily dbt;
813 return current_backtrace();
814 }
815
816 static
817 allocation_site_ptr get_allocation_site() {
818 if (!cpu_mem.is_initialized() || !cpu_mem.collect_backtrace) {
819 return nullptr;
820 }
821 disable_backtrace_temporarily dbt;
822 allocation_site new_alloc_site;
823 new_alloc_site.backtrace = get_backtrace();
824 auto insert_result = cpu_mem.asu.alloc_sites.insert(std::move(new_alloc_site));
825 allocation_site_ptr alloc_site = &*insert_result.first;
826 if (insert_result.second) {
827 alloc_site->next = cpu_mem.alloc_site_list_head;
828 cpu_mem.alloc_site_list_head = alloc_site;
829 }
830 return alloc_site;
831 }
832
833 #ifdef SEASTAR_HEAPPROF
834
835 allocation_site_ptr&
836 small_pool::alloc_site_holder(void* ptr) {
837 if (objects_page_aligned()) {
838 return get_cpu_mem().to_page(ptr)->alloc_site;
839 } else {
840 return *reinterpret_cast<allocation_site_ptr*>(reinterpret_cast<char*>(ptr) + _object_size - sizeof(allocation_site_ptr));
841 }
842 }
843
844 #endif
845
846 void*
847 cpu_pages::allocate_small(unsigned size) {
848 auto idx = small_pool::size_to_idx(size);
849 auto& pool = small_pools[idx];
850 assert(size <= pool.object_size());
851 auto ptr = pool.allocate();
852 #ifdef SEASTAR_HEAPPROF
853 if (!ptr) {
854 return nullptr;
855 }
856 allocation_site_ptr alloc_site = get_allocation_site();
857 if (alloc_site) {
858 ++alloc_site->count;
859 alloc_site->size += pool.object_size();
860 }
861 new (&pool.alloc_site_holder(ptr)) allocation_site_ptr{alloc_site};
862 #endif
863 return ptr;
864 }
865
866 void cpu_pages::free_large(void* ptr) {
867 pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
868 page* span = &pages[idx];
869 #ifdef SEASTAR_HEAPPROF
870 auto alloc_site = span->alloc_site;
871 if (alloc_site) {
872 --alloc_site->count;
873 alloc_site->size -= span->span_size * page_size;
874 }
875 #endif
876 free_span(idx, span->span_size);
877 }
878
879 size_t cpu_pages::object_size(void* ptr) {
880 page* span = to_page(ptr);
881 if (span->pool) {
882 auto s = span->pool->object_size();
883 #ifdef SEASTAR_HEAPPROF
884 // We must not allow the object to be extended onto the allocation_site_ptr field.
885 if (!span->pool->objects_page_aligned()) {
886 s -= sizeof(allocation_site_ptr);
887 }
888 #endif
889 return s;
890 } else {
891 return size_t(span->span_size) * page_size;
892 }
893 }
894
895 void cpu_pages::free_cross_cpu(unsigned cpu_id, void* ptr) {
896 if (!live_cpus[cpu_id].load(std::memory_order_relaxed)) {
897 // Thread was destroyed; leak object
898 // should only happen for boost unit-tests.
899 return;
900 }
901 auto p = reinterpret_cast<cross_cpu_free_item*>(ptr);
902 auto& list = all_cpus[cpu_id]->xcpu_freelist;
903 auto old = list.load(std::memory_order_relaxed);
904 do {
905 p->next = old;
906 } while (!list.compare_exchange_weak(old, p, std::memory_order_release, std::memory_order_relaxed));
907 alloc_stats::increment(alloc_stats::types::cross_cpu_frees);
908 }
909
910 bool cpu_pages::drain_cross_cpu_freelist() {
911 if (!xcpu_freelist.load(std::memory_order_relaxed)) {
912 return false;
913 }
914 auto p = xcpu_freelist.exchange(nullptr, std::memory_order_acquire);
915 while (p) {
916 auto n = p->next;
917 alloc_stats::increment_local(alloc_stats::types::frees);
918 free(p);
919 p = n;
920 }
921 return true;
922 }
923
924 void cpu_pages::free(void* ptr) {
925 page* span = to_page(ptr);
926 if (span->pool) {
927 small_pool& pool = *span->pool;
928 #ifdef SEASTAR_HEAPPROF
929 allocation_site_ptr alloc_site = pool.alloc_site_holder(ptr);
930 if (alloc_site) {
931 --alloc_site->count;
932 alloc_site->size -= pool.object_size();
933 }
934 #endif
935 pool.deallocate(ptr);
936 } else {
937 free_large(ptr);
938 }
939 }
940
941 void cpu_pages::free(void* ptr, size_t size) {
942 // match the size adjustment done in allocate() so we hit the right pool
943 if (size <= sizeof(free_object)) {
944 size = sizeof(free_object);
945 }
946 if (size <= max_small_allocation) {
947 size = object_size_with_alloc_site(size);
948 auto pool = &small_pools[small_pool::size_to_idx(size)];
949 #ifdef SEASTAR_HEAPPROF
950 allocation_site_ptr alloc_site = pool->alloc_site_holder(ptr);
951 if (alloc_site) {
952 --alloc_site->count;
953 alloc_site->size -= pool->object_size();
954 }
955 #endif
956 pool->deallocate(ptr);
957 } else {
958 free_large(ptr);
959 }
960 }
961
962 bool
963 cpu_pages::try_foreign_free(void* ptr) {
964 // fast path for local free
965 if (__builtin_expect((reinterpret_cast<uintptr_t>(ptr) & cpu_id_and_mem_base_mask) == local_expected_cpu_id, true)) {
966 return false;
967 }
968 if (!is_seastar_memory(ptr)) {
969 if (is_reactor_thread) {
970 alloc_stats::increment_local(alloc_stats::types::foreign_cross_frees);
971 } else {
972 alloc_stats::increment(alloc_stats::types::foreign_frees);
973 }
974 original_free_func(ptr);
975 return true;
976 }
977 free_cross_cpu(object_cpu_id(ptr), ptr);
978 return true;
979 }
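// Note on the fast path above: local_expected_cpu_id holds this shard's cpu
// id already shifted into place and OR'ed with mem_base(), so a single mask
// and compare, roughly
//
//     (reinterpret_cast<uintptr_t>(ptr) & cpu_id_and_mem_base_mask) == local_expected_cpu_id
//
// answers "was this pointer allocated by this shard?"; it only matches
// pointers carved out of this cpu's slice of the reserved address range.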
980
981 void cpu_pages::shrink(void* ptr, size_t new_size) {
982 auto obj_cpu = object_cpu_id(ptr);
983 assert(obj_cpu == cpu_id);
984 page* span = to_page(ptr);
985 if (span->pool) {
986 return;
987 }
988 auto old_size_pages = span->span_size;
989 size_t new_size_pages = old_size_pages;
990 while (new_size_pages / 2 * page_size >= new_size) {
991 new_size_pages /= 2;
992 }
993 if (new_size_pages == old_size_pages) {
994 return;
995 }
996 #ifdef SEASTAR_HEAPPROF
997 auto alloc_site = span->alloc_site;
998 if (alloc_site) {
999 alloc_site->size -= span->span_size * page_size;
1000 alloc_site->size += new_size_pages * page_size;
1001 }
1002 #endif
1003 span->span_size = new_size_pages;
1004 span[new_size_pages - 1].free = false;
1005 span[new_size_pages - 1].span_size = new_size_pages;
1006 pageidx idx = span - pages;
1007 free_span_unaligned(idx + new_size_pages, old_size_pages - new_size_pages);
1008 }
1009
1010 cpu_pages::~cpu_pages() {
1011 if (is_initialized()) {
1012 live_cpus[cpu_id].store(false, std::memory_order_relaxed);
1013 }
1014 }
1015
1016 bool cpu_pages::is_initialized() const {
1017 return bool(nr_pages);
1018 }
1019
1020 bool cpu_pages::initialize() {
1021 if (is_initialized()) {
1022 return false;
1023 }
1024 cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
1025 local_expected_cpu_id = (static_cast<uint64_t>(cpu_id) << cpu_id_shift)
1026 | reinterpret_cast<uintptr_t>(mem_base());
1027 assert(cpu_id < max_cpus);
1028 all_cpus[cpu_id] = this;
1029 auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
1030 auto size = 32 << 20; // Small size for bootstrap
1031 auto r = ::mmap(base, size,
1032 PROT_READ | PROT_WRITE,
1033 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
1034 -1, 0);
1035 if (r == MAP_FAILED) {
1036 abort();
1037 }
1038 ::madvise(base, size, MADV_HUGEPAGE);
1039 pages = reinterpret_cast<page*>(base);
1040 memory = base;
1041 nr_pages = size / page_size;
1042 // we reserve the end page so we don't have to special case
1043 // the last span.
1044 auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
1045 reserved = 1u << log2ceil(reserved);
1046 for (pageidx i = 0; i < reserved; ++i) {
1047 pages[i].free = false;
1048 }
1049 pages[nr_pages].free = false;
1050 free_span_unaligned(reserved, nr_pages - reserved);
1051 live_cpus[cpu_id].store(true, std::memory_order_relaxed);
1052 return true;
1053 }
1054
1055 mmap_area
1056 static allocate_anonymous_memory(void* where, size_t how_much) {
1057 return mmap_anonymous(where,
1058 how_much,
1059 PROT_READ | PROT_WRITE,
1060 MAP_PRIVATE | MAP_FIXED);
1061 }
1062
1063 mmap_area
1064 allocate_hugetlbfs_memory(file_desc& fd, void* where, size_t how_much) {
1065 auto pos = fd.size();
1066 fd.truncate(pos + how_much);
1067 auto ret = fd.map(
1068 how_much,
1069 PROT_READ | PROT_WRITE,
1070 MAP_SHARED | MAP_POPULATE | (where ? MAP_FIXED : 0),
1071 pos,
1072 where);
1073 return ret;
1074 }
1075
1076 void cpu_pages::replace_memory_backing(allocate_system_memory_fn alloc_sys_mem) {
1077 // We would like to use ::mremap() to atomically replace the old anonymous
1078 // memory with hugetlbfs backed memory, but mremap() does not support hugetlbfs
1079 // (for no reason at all). So we must copy the anonymous memory to some other
1080 // place, map hugetlbfs in place, and copy it back, without modifying it during
1081 // the operation.
1082 auto bytes = nr_pages * page_size;
1083 auto old_mem = mem();
1084 auto relocated_old_mem = mmap_anonymous(nullptr, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE);
1085 std::memcpy(relocated_old_mem.get(), old_mem, bytes);
1086 alloc_sys_mem(old_mem, bytes).release();
1087 std::memcpy(old_mem, relocated_old_mem.get(), bytes);
1088 }
1089
1090 void cpu_pages::do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem) {
1091 auto new_pages = new_size / page_size;
1092 if (new_pages <= nr_pages) {
1093 return;
1094 }
1095 auto old_size = nr_pages * page_size;
1096 auto mmap_start = memory + old_size;
1097 auto mmap_size = new_size - old_size;
1098 auto mem = alloc_sys_mem(mmap_start, mmap_size);
1099 mem.release();
1100 ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
1101 // one past last page structure is a sentinel
1102 auto new_page_array_pages = align_up(sizeof(page[new_pages + 1]), page_size) / page_size;
1103 auto new_page_array
1104 = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
1105 if (!new_page_array) {
1106 throw std::bad_alloc();
1107 }
1108 std::copy(pages, pages + nr_pages, new_page_array);
1109 // mark new one-past-last page as taken to avoid boundary conditions
1110 new_page_array[new_pages].free = false;
1111 auto old_pages = reinterpret_cast<char*>(pages);
1112 auto old_nr_pages = nr_pages;
1113 auto old_pages_size = align_up(sizeof(page[nr_pages + 1]), page_size);
1114 old_pages_size = size_t(1) << log2ceil(old_pages_size);
1115 pages = new_page_array;
1116 nr_pages = new_pages;
1117 auto old_pages_start = (old_pages - memory) / page_size;
1118 if (old_pages_start == 0) {
1119 // keep page 0 allocated
1120 old_pages_start = 1;
1121 old_pages_size -= page_size;
1122 }
1123 if (old_pages_size != 0) {
1124 free_span_unaligned(old_pages_start, old_pages_size / page_size);
1125 }
1126 free_span_unaligned(old_nr_pages, new_pages - old_nr_pages);
1127 }
1128
1129 void cpu_pages::resize(size_t new_size, allocate_system_memory_fn alloc_memory) {
1130 new_size = align_down(new_size, huge_page_size);
1131 while (nr_pages * page_size < new_size) {
1132 // don't reallocate all at once, since there might not
1133 // be enough free memory available to relocate the pages array
1134 auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
1135 do_resize(tmp_size, alloc_memory);
1136 }
1137 }
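// Illustrative example of the growth loop above: starting from the 32 MiB
// bootstrap mapping, resizing to 4 GiB proceeds in at-most-4x steps
// (32 MiB, 128 MiB, 512 MiB, 2 GiB, 4 GiB), so the enlarged pages[] array can
// always be relocated into memory that already exists.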
1138
1139 reclaiming_result cpu_pages::run_reclaimers(reclaimer_scope scope, size_t n_pages) {
1140 auto target = std::max<size_t>(nr_free_pages + n_pages, min_free_pages);
1141 reclaiming_result result = reclaiming_result::reclaimed_nothing;
1142 while (nr_free_pages < target) {
1143 bool made_progress = false;
1144 alloc_stats::increment_local(alloc_stats::types::reclaims);
1145 for (auto&& r : reclaimers) {
1146 if (r->scope() >= scope) {
1147 made_progress |= r->do_reclaim((target - nr_free_pages) * page_size) == reclaiming_result::reclaimed_something;
1148 }
1149 }
1150 if (!made_progress) {
1151 return result;
1152 }
1153 result = reclaiming_result::reclaimed_something;
1154 }
1155 return result;
1156 }
1157
1158 void cpu_pages::schedule_reclaim() {
1159 current_min_free_pages = 0;
1160 reclaim_hook([this] {
1161 if (nr_free_pages < min_free_pages) {
1162 try {
1163 run_reclaimers(reclaimer_scope::async, min_free_pages - nr_free_pages);
1164 } catch (...) {
1165 current_min_free_pages = min_free_pages;
1166 throw;
1167 }
1168 }
1169 current_min_free_pages = min_free_pages;
1170 });
1171 }
1172
1173 memory::memory_layout cpu_pages::memory_layout() {
1174 assert(is_initialized());
1175 return {
1176 reinterpret_cast<uintptr_t>(memory),
1177 reinterpret_cast<uintptr_t>(memory) + nr_pages * page_size
1178 };
1179 }
1180
1181 void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1182 reclaim_hook = hook;
1183 current_min_free_pages = min_free_pages;
1184 }
1185
1186 void cpu_pages::set_min_free_pages(size_t pages) {
1187 if (pages > std::numeric_limits<decltype(min_free_pages)>::max()) {
1188 throw std::runtime_error("Number of pages too large");
1189 }
1190 min_free_pages = pages;
1191 maybe_reclaim();
1192 }
1193
1194 small_pool::small_pool(unsigned object_size) noexcept
1195 : _object_size(object_size) {
1196 unsigned span_size = 1;
1197 auto span_bytes = [&] { return span_size * page_size; };
1198 auto waste = [&] { return (span_bytes() % _object_size) / (1.0 * span_bytes()); };
1199 while (object_size > span_bytes()) {
1200 ++span_size;
1201 }
1202 _span_sizes.fallback = span_size;
1203
1204 // Choose a preferred span size which keeps waste (internal fragmentation) below
1205 // 5% and fits at least 4 objects. If there is no span size (up to 32 pages) that
1206 // satisfies this, just go with the minimum waste out of the checked span sizes.
1207 float min_waste = std::numeric_limits<float>::max();
1208 unsigned min_waste_span_size = 0;
1209 for (span_size = 1; span_size <= 32; span_size *= 2) {
1210 if (span_bytes() / object_size >= 4) {
1211 auto w = waste();
1212 if (w < min_waste) {
1213 min_waste = w;
1214 min_waste_span_size = span_size;
1215 if (w < 0.05) {
1216 break;
1217 }
1218 }
1219 }
1220 }
1221 _span_sizes.preferred = min_waste_span_size ? min_waste_span_size : _span_sizes.fallback;
1222
1223 _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
1224 _min_free = _max_free / 2;
1225 }
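// Illustrative example of the span-size choice above, for a 1536-byte pool:
// a 1-page span fits only 2 objects (fewer than 4), 2-page and 4-page spans
// both waste 6.25% (more than 5%), and an 8-page span fits 21 objects while
// wasting 512/32768 (about 1.6%), so preferred == 8 pages; fallback == 1 page,
// the smallest span that can hold a single object.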
1226
1227 small_pool::~small_pool() {
1228 _min_free = _max_free = 0;
1229 trim_free_list();
1230 }
1231
1232 // Should not throw when running out of memory, to avoid infinite recursion,
1233 // because throwing std::bad_alloc requires allocation. __cxa_allocate_exception
1234 // falls back to the emergency pool in case malloc() returns nullptr.
1235 void*
1236 small_pool::allocate() {
1237 if (!_free) {
1238 add_more_objects();
1239 }
1240 if (!_free) {
1241 return nullptr;
1242 }
1243 auto* obj = _free;
1244 _free = _free->next;
1245 --_free_count;
1246 return obj;
1247 }
1248
1249 void
1250 small_pool::deallocate(void* object) {
1251 auto o = reinterpret_cast<free_object*>(object);
1252 o->next = _free;
1253 _free = o;
1254 ++_free_count;
1255 if (_free_count >= _max_free) {
1256 trim_free_list();
1257 }
1258 }
1259
1260 void
1261 small_pool::add_more_objects() {
1262 auto goal = (_min_free + _max_free) / 2;
1263 while (!_span_list.empty() && _free_count < goal) {
1264 page& span = _span_list.front(get_cpu_mem().pages);
1265 _span_list.pop_front(get_cpu_mem().pages);
1266 while (span.freelist) {
1267 auto obj = span.freelist;
1268 span.freelist = span.freelist->next;
1269 obj->next = _free;
1270 _free = obj;
1271 ++_free_count;
1272 ++span.nr_small_alloc;
1273 }
1274 }
1275 while (_free_count < goal) {
1276 disable_backtrace_temporarily dbt;
1277 auto span_size = _span_sizes.preferred;
1278 auto data = reinterpret_cast<char*>(get_cpu_mem().allocate_large(span_size));
1279 if (!data) {
1280 span_size = _span_sizes.fallback;
1281 data = reinterpret_cast<char*>(get_cpu_mem().allocate_large(span_size));
1282 if (!data) {
1283 return;
1284 }
1285 }
1286 auto span = get_cpu_mem().to_page(data);
1287 span_size = span->span_size;
1288 _pages_in_use += span_size;
1289 for (unsigned i = 0; i < span_size; ++i) {
1290 span[i].offset_in_span = i;
1291 span[i].pool = this;
1292 }
1293 span->nr_small_alloc = 0;
1294 span->freelist = nullptr;
1295 for (unsigned offset = 0; offset <= span_size * page_size - _object_size; offset += _object_size) {
1296 auto h = reinterpret_cast<free_object*>(data + offset);
1297 h->next = _free;
1298 _free = h;
1299 ++_free_count;
1300 ++span->nr_small_alloc;
1301 }
1302 }
1303 }
1304
1305 void
1306 small_pool::trim_free_list() {
1307 auto goal = (_min_free + _max_free) / 2;
1308 while (_free && _free_count > goal) {
1309 auto obj = _free;
1310 _free = _free->next;
1311 --_free_count;
1312 page* span = get_cpu_mem().to_page(obj);
1313 span -= span->offset_in_span;
1314 if (!span->freelist) {
1315 new (&span->link) page_list_link();
1316 _span_list.push_front(get_cpu_mem().pages, *span);
1317 }
1318 obj->next = span->freelist;
1319 span->freelist = obj;
1320 if (--span->nr_small_alloc == 0) {
1321 _pages_in_use -= span->span_size;
1322 _span_list.erase(get_cpu_mem().pages, *span);
1323 get_cpu_mem().free_span(span - get_cpu_mem().pages, span->span_size);
1324 }
1325 }
1326 }
1327
1328 void
1329 abort_on_underflow(size_t size) {
1330 if (std::make_signed_t<size_t>(size) < 0) {
1331 // probably a logic error, stop hard
1332 abort();
1333 }
1334 }
1335
1336 void* allocate_large(size_t size) {
1337 abort_on_underflow(size);
1338 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1339 if ((size_t(size_in_pages) << page_bits) < size) {
1340 return nullptr; // (size + page_size - 1) caused an overflow
1341 }
1342 return get_cpu_mem().allocate_large(size_in_pages);
1343
1344 }
1345
1346 void* allocate_large_aligned(size_t align, size_t size) {
1347 abort_on_underflow(size);
1348 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1349 unsigned align_in_pages = std::max(align, page_size) >> page_bits;
1350 return get_cpu_mem().allocate_large_aligned(align_in_pages, size_in_pages);
1351 }
1352
1353 void free_large(void* ptr) {
1354 return get_cpu_mem().free_large(ptr);
1355 }
1356
1357 size_t object_size(void* ptr) {
1358 return cpu_pages::all_cpus[object_cpu_id(ptr)]->object_size(ptr);
1359 }
1360
1361 static thread_local cpu_pages* cpu_mem_ptr = nullptr;
1362
1363 // Mark as cold so that GCC8+ can move to .text.unlikely.
1364 [[gnu::cold]]
1365 static void init_cpu_mem() {
1366 cpu_mem_ptr = &cpu_mem;
1367 cpu_mem.initialize();
1368 }
1369
1370 [[gnu::always_inline]]
1371 static inline cpu_pages& get_cpu_mem()
1372 {
1373 // cpu_pages has a non-trivial constructor which means that the compiler
1374 // must make sure the instance local to the current thread has been
1375 // constructed before each access. So instead we access cpu_mem_ptr
1376 // which has been initialized by calls to init_cpu_mem() before it is
1377 // accessed.
1378 return *cpu_mem_ptr;
1379 }
1380
1381 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1382 static constexpr int debug_allocation_pattern = 0xab;
1383 #endif
1384
1385 void* allocate(size_t size) {
1386 if (!is_reactor_thread) {
1387 if (original_malloc_func) {
1388 alloc_stats::increment(alloc_stats::types::foreign_mallocs);
1389 return original_malloc_func(size);
1390 }
1391 // original_malloc_func might be null for allocations made before main(),
1392 // i.e. in constructors that run before original_malloc_func is initialized
1393 init_cpu_mem();
1394 }
1395 if (size <= sizeof(free_object)) {
1396 size = sizeof(free_object);
1397 }
1398 void* ptr;
1399 if (size <= max_small_allocation) {
1400 size = object_size_with_alloc_site(size);
1401 ptr = get_cpu_mem().allocate_small(size);
1402 } else {
1403 ptr = allocate_large(size);
1404 }
1405 if (!ptr) {
1406 on_allocation_failure(size);
1407 } else {
1408 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1409 std::memset(ptr, debug_allocation_pattern, size);
1410 #endif
1411 }
1412 alloc_stats::increment_local(alloc_stats::types::allocs);
1413 return ptr;
1414 }
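// Illustrative usage sketch (not part of this file): on a reactor thread
//
//     void* p = allocate(100);   // rounded up to the 112-byte small pool
//     free(p);                   // returned to the same pool's free list
//
// while the same call made on a non-reactor thread is forwarded to the libc
// malloc() captured in original_malloc_func at startup.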
1415
1416 void* allocate_aligned(size_t align, size_t size) {
1417 if (!is_reactor_thread) {
1418 if (original_aligned_alloc_func) {
1419 alloc_stats::increment(alloc_stats::types::foreign_mallocs);
1420 return original_aligned_alloc_func(align, size);
1421 }
1422 // original_aligned_alloc_func might be null for allocations made before main(),
1423 // i.e. in constructors that run before it is initialized
1424 init_cpu_mem();
1425 }
1426 if (size <= sizeof(free_object)) {
1427 size = std::max(sizeof(free_object), align);
1428 }
1429 void* ptr;
1430 if (size <= max_small_allocation && align <= page_size) {
1431 // Our small allocator only guarantees alignment for power-of-two
1432 // allocations which are not larger than a page.
1433 size = 1 << log2ceil(object_size_with_alloc_site(size));
1434 ptr = get_cpu_mem().allocate_small(size);
1435 } else {
1436 ptr = allocate_large_aligned(align, size);
1437 }
1438 if (!ptr) {
1439 on_allocation_failure(size);
1440 } else {
1441 #ifdef SEASTAR_DEBUG_ALLOCATIONS
1442 std::memset(ptr, debug_allocation_pattern, size);
1443 #endif
1444 }
1445 alloc_stats::increment_local(alloc_stats::types::allocs);
1446 return ptr;
1447 }
1448
1449 void free(void* obj) {
1450 if (cpu_pages::try_foreign_free(obj)) {
1451 return;
1452 }
1453 alloc_stats::increment_local(alloc_stats::types::frees);
1454 get_cpu_mem().free(obj);
1455 }
1456
1457 void free(void* obj, size_t size) {
1458 if (cpu_pages::try_foreign_free(obj)) {
1459 return;
1460 }
1461 alloc_stats::increment_local(alloc_stats::types::frees);
1462 get_cpu_mem().free(obj, size);
1463 }
1464
1465 void free_aligned(void* obj, size_t align, size_t size) {
1466 if (size <= sizeof(free_object)) {
1467 size = sizeof(free_object);
1468 }
1469 if (size <= max_small_allocation && align <= page_size) {
1470 // Same adjustment as allocate_aligned()
1471 size = 1 << log2ceil(object_size_with_alloc_site(size));
1472 }
1473 free(obj, size);
1474 }
1475
1476 void shrink(void* obj, size_t new_size) {
1477 alloc_stats::increment_local(alloc_stats::types::frees);
1478 alloc_stats::increment_local(alloc_stats::types::allocs); // keep them balanced
1479 get_cpu_mem().shrink(obj, new_size);
1480 }
1481
1482 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1483 get_cpu_mem().set_reclaim_hook(hook);
1484 }
1485
1486 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope scope)
1487 : reclaimer([reclaim = std::move(reclaim)] (request) {
1488 return reclaim();
1489 }, scope) {
1490 }
1491
1492 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope scope)
1493 : _reclaim(std::move(reclaim))
1494 , _scope(scope) {
1495 get_cpu_mem().reclaimers.push_back(this);
1496 }
1497
1498 reclaimer::~reclaimer() {
1499 auto& r = get_cpu_mem().reclaimers;
1500 r.erase(std::find(r.begin(), r.end(), this));
1501 }
1502
1503 void set_large_allocation_warning_threshold(size_t threshold) {
1504 get_cpu_mem().large_allocation_warning_threshold = threshold;
1505 }
1506
1507 size_t get_large_allocation_warning_threshold() {
1508 return get_cpu_mem().large_allocation_warning_threshold;
1509 }
1510
1511 void disable_large_allocation_warning() {
1512 get_cpu_mem().large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
1513 }
1514
1515 void configure(std::vector<resource::memory> m, bool mbind,
1516 optional<std::string> hugetlbfs_path) {
1517 // we need to make sure cpu_mem is initialized since configure calls cpu_mem.resize
1518 // and we might reach configure without ever allocating, hence without ever calling
1519 // cpu_pages::initialize.
1520 // The correct solution is to add a condition inside cpu_mem.resize, but since all
1521 // other paths to cpu_pages::resize are already verifying initialize was called, we
1522 // verify that here.
1523 init_cpu_mem();
1524 is_reactor_thread = true;
1525 size_t total = 0;
1526 for (auto&& x : m) {
1527 total += x.bytes;
1528 }
1529 allocate_system_memory_fn sys_alloc = allocate_anonymous_memory;
1530 if (hugetlbfs_path) {
1531 // std::function is copyable, but file_desc is not, so we must use
1532 // an lw_shared_ptr to allow sys_alloc to be copied around
1533 auto fdp = make_lw_shared<file_desc>(file_desc::temporary(*hugetlbfs_path));
1534 sys_alloc = [fdp] (void* where, size_t how_much) {
1535 return allocate_hugetlbfs_memory(*fdp, where, how_much);
1536 };
1537 get_cpu_mem().replace_memory_backing(sys_alloc);
1538 }
1539 get_cpu_mem().resize(total, sys_alloc);
1540 size_t pos = 0;
1541 for (auto&& x : m) {
1542 #ifdef SEASTAR_HAVE_NUMA
1543 unsigned long nodemask = 1UL << x.nodeid;
1544 if (mbind) {
1545 auto r = ::mbind(get_cpu_mem().mem() + pos, x.bytes,
1546 MPOL_PREFERRED,
1547 &nodemask, std::numeric_limits<unsigned long>::digits,
1548 MPOL_MF_MOVE);
1549
1550 if (r == -1) {
1551 char err[1000] = {};
1552 strerror_r(errno, err, sizeof(err));
1553 std::cerr << "WARNING: unable to mbind shard memory; performance may suffer: "
1554 << err << std::endl;
1555 }
1556 }
1557 #endif
1558 pos += x.bytes;
1559 }
1560 }
1561
1562 statistics stats() {
1563 return statistics{alloc_stats::get(alloc_stats::types::allocs), alloc_stats::get(alloc_stats::types::frees), alloc_stats::get(alloc_stats::types::cross_cpu_frees),
1564 cpu_mem.nr_pages * page_size, cpu_mem.nr_free_pages * page_size, alloc_stats::get(alloc_stats::types::reclaims), alloc_stats::get(alloc_stats::types::large_allocs),
1565 alloc_stats::get(alloc_stats::types::failed_allocs), alloc_stats::get(alloc_stats::types::foreign_mallocs), alloc_stats::get(alloc_stats::types::foreign_frees),
1566 alloc_stats::get(alloc_stats::types::foreign_cross_frees)};
1567 }
1568
1569 size_t free_memory() {
1570 return get_cpu_mem().nr_free_pages * page_size;
1571 }
1572
1573 bool drain_cross_cpu_freelist() {
1574 return get_cpu_mem().drain_cross_cpu_freelist();
1575 }
1576
1577 memory_layout get_memory_layout() {
1578 return get_cpu_mem().memory_layout();
1579 }
1580
1581 size_t min_free_memory() {
1582 return get_cpu_mem().min_free_pages * page_size;
1583 }
1584
1585 void set_min_free_pages(size_t pages) {
1586 get_cpu_mem().set_min_free_pages(pages);
1587 }
1588
1589 static thread_local int report_on_alloc_failure_suppressed = 0;
1590
1591 class disable_report_on_alloc_failure_temporarily {
1592 public:
1593 disable_report_on_alloc_failure_temporarily() {
1594 ++report_on_alloc_failure_suppressed;
1595 };
1596 ~disable_report_on_alloc_failure_temporarily() noexcept {
1597 --report_on_alloc_failure_suppressed;
1598 }
1599 };
1600
1601 static std::atomic<bool> abort_on_allocation_failure{false};
1602 static std::atomic<alloc_failure_kind> dump_diagnostics_on_alloc_failure_kind{alloc_failure_kind::critical};
1603
1604 void enable_abort_on_allocation_failure() {
1605 abort_on_allocation_failure.store(true, std::memory_order_seq_cst);
1606 }
1607
1608 void set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind kind) {
1609 dump_diagnostics_on_alloc_failure_kind.store(kind, std::memory_order_seq_cst);
1610 }
1611
1612 void set_dump_memory_diagnostics_on_alloc_failure_kind(std::string_view str) {
1613 if (str == "none") {
1614 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::none);
1615 } else if (str == "critical") {
1616 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::critical);
1617 } else if (str == "all") {
1618 set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind::all);
1619 } else {
1620 seastar_logger.error("Ignoring invalid option '{}' for the allocation failure kind to dump seastar memory diagnostics for, valid options are: none, critical and all", str);
1621 }
1622 }
1623
1624 static thread_local noncopyable_function<void(memory_diagnostics_writer)> additional_diagnostics_producer;
1625
1626 void set_additional_diagnostics_producer(noncopyable_function<void(memory_diagnostics_writer)> producer) {
1627 additional_diagnostics_producer = std::move(producer);
1628 }
1629
1630 struct human_readable_value {
1631 uint16_t value; // [0, 1024)
1632 char suffix; // 0 -> no suffix
1633 };
1634
1635 std::ostream& operator<<(std::ostream& os, const human_readable_value& val) {
1636 os << val.value;
1637 if (val.suffix) {
1638 os << val.suffix;
1639 }
1640 return os;
1641 }
1642
1643 static human_readable_value to_human_readable_value(uint64_t value, uint64_t step, uint64_t precision, const std::array<char, 5>& suffixes) {
1644 if (!value) {
1645 return {0, suffixes[0]};
1646 }
1647
1648 uint64_t result = value;
1649 uint64_t remainder = 0;
1650 unsigned i = 0;
1651 // If there is no remainder we go below precision because we don't lose any.
1652 while (((!remainder && result >= step) || result >= precision)) {
1653 remainder = result % step;
1654 result /= step;
1655 if (i == suffixes.size()) {
1656 break;
1657 } else {
1658 ++i;
1659 }
1660 }
1661 return {uint16_t(remainder < (step / 2) ? result : result + 1), suffixes[i]};
1662 }
1663
1664 static human_readable_value to_hr_size(uint64_t size) {
1665 const std::array<char, 5> suffixes = {'B', 'K', 'M', 'G', 'T'};
1666 return to_human_readable_value(size, 1024, 8192, suffixes);
1667 }
1668
1669 static human_readable_value to_hr_number(uint64_t number) {
1670 const std::array<char, 5> suffixes = {'\0', 'k', 'm', 'b', 't'};
1671 return to_human_readable_value(number, 1000, 10000, suffixes);
1672 }
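// Worked examples for the helpers above: to_hr_size(8192) renders as "8K",
// to_hr_size(10485760) as "10M", and to_hr_number(12345) as "12k" (the 345
// remainder is below step / 2, so the result rounds down).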
1673
1674 seastar::internal::log_buf::inserter_iterator do_dump_memory_diagnostics(seastar::internal::log_buf::inserter_iterator it) {
1675 auto free_mem = get_cpu_mem().nr_free_pages * page_size;
1676 auto total_mem = get_cpu_mem().nr_pages * page_size;
1677 it = fmt::format_to(it, "Dumping seastar memory diagnostics\n");
1678
1679 it = fmt::format_to(it, "Used memory: {}\n", to_hr_size(total_mem - free_mem));
1680 it = fmt::format_to(it, "Free memory: {}\n", to_hr_size(free_mem));
1681 it = fmt::format_to(it, "Total memory: {}\n", to_hr_size(total_mem));
1682 it = fmt::format_to(it, "Hard failures: {}\n\n", alloc_stats::get(alloc_stats::types::failed_allocs));
1683
1684 if (additional_diagnostics_producer) {
1685 additional_diagnostics_producer([&it] (std::string_view v) mutable {
1686 #if FMT_VERSION >= 80000
1687 it = fmt::format_to(it, fmt::runtime(v));
1688 #else
1689 it = fmt::format_to(it, v);
1690 #endif
1691 });
1692 }
1693
1694 it = fmt::format_to(it, "Small pools:\n");
1695 it = fmt::format_to(it, "objsz spansz usedobj memory unused wst%\n");
1696 for (unsigned i = 0; i < get_cpu_mem().small_pools.nr_small_pools; i++) {
1697 auto& sp = get_cpu_mem().small_pools[i];
1698 // We don't use pools too small to fit a free_object, so skip them;
1699 // they are always empty.
1700 if (sp.object_size() < sizeof(free_object)) {
1701 continue;
1702 }
1703
1704 // For the small pools, there are two types of free objects:
1705 // Pool freelist objects are pointed to by sp._free and their count is sp._free_count.
1706 // Span freelist objects are those removed from the pool freelist when that list
1707 // becomes too large: they are instead attached to the spans allocated to this
1708 // pool. To count this second category, we iterate over the spans below.
1709 uint32_t span_freelist_objs = 0;
1710 auto front = sp._span_list._front;
1711 while (front) {
1712 auto& span = get_cpu_mem().pages[front];
1713 auto capacity_in_objects = span.span_size * page_size / sp.object_size();
1714 span_freelist_objs += capacity_in_objects - span.nr_small_alloc;
1715 front = span.link._next;
1716 }
1717 const auto free_objs = sp._free_count + span_freelist_objs; // pool + span free objects
1718 const auto use_count = sp._pages_in_use * page_size / sp.object_size() - free_objs;
1719 auto memory = sp._pages_in_use * page_size;
1720 const auto unused = free_objs * sp.object_size();
1721 const auto wasted_percent = memory ? unused * 100 / memory : 0;
1722 it = fmt::format_to(it,
1723 "{:>5} {:>5} {:>5} {:>5} {:>5} {:>4}\n",
1724 sp.object_size(),
1725 to_hr_size(sp._span_sizes.preferred * page_size),
1726 to_hr_number(use_count),
1727 to_hr_size(memory),
1728 to_hr_size(unused),
1729 unsigned(wasted_percent));
1730 }
1731 it = fmt::format_to(it, "\nPage spans:\n");
1732 it = fmt::format_to(it, "index size free used spans\n");
1733
1734 std::array<uint32_t, cpu_pages::nr_span_lists> span_size_histogram;
1735 span_size_histogram.fill(0);
1736
1737 for (unsigned i = 0; i < get_cpu_mem().nr_pages;) {
1738 const auto span_size = get_cpu_mem().pages[i].span_size;
1739 if (!span_size) {
1740 ++i;
1741 continue;
1742 }
1743 ++span_size_histogram[log2ceil(span_size)];
1744 i += span_size;
1745 }
1746
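// Note: the histogram above buckets every span (free or allocated) by log2ceil of its
// size, and the table below treats each span in bucket i as exactly 2^i pages, so the
// "used" column is an estimate when span sizes are not exact powers of two.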
1747 for (unsigned i = 0; i < get_cpu_mem().nr_span_lists; i++) {
1748 auto& span_list = get_cpu_mem().free_spans[i];
1749 auto front = span_list._front;
1750 uint32_t free_pages = 0;
1751 while (front) {
1752 auto& span = get_cpu_mem().pages[front];
1753 free_pages += span.span_size;
1754 front = span.link._next;
1755 }
1756 const auto total_spans = span_size_histogram[i];
1757 const auto total_pages = total_spans * (1 << i);
1758 it = fmt::format_to(it,
1759 "{:>5} {:>5} {:>5} {:>5} {:>5}\n",
1760 i,
1761 to_hr_size((uint64_t(1) << i) * page_size),
1762 to_hr_size(free_pages * page_size),
1763 to_hr_size((total_pages - free_pages) * page_size),
1764 to_hr_number(total_spans));
1765 }
1766
1767 return it;
1768 }
1769
1770 void dump_memory_diagnostics(log_level lvl, logger::rate_limit& rate_limit) {
1771 logger::lambda_log_writer writer([] (seastar::internal::log_buf::inserter_iterator it) {
1772 return do_dump_memory_diagnostics(it);
1773 });
1774 seastar_memory_logger.log(lvl, rate_limit, writer);
1775 }
1776
1777 void internal::log_memory_diagnostics_report(log_level lvl) {
1778 logger::rate_limit rl{std::chrono::seconds(0)}; // never limit for explicit dump requests
1779 dump_memory_diagnostics(lvl, rl);
1780 }
1781
1782 void maybe_dump_memory_diagnostics(size_t size, bool is_aborting) {
1783 if (report_on_alloc_failure_suppressed) {
1784 return;
1785 }
1786
1787 disable_report_on_alloc_failure_temporarily guard;
1788 if (seastar_memory_logger.is_enabled(log_level::debug)) {
1789 seastar_memory_logger.debug("Failed to allocate {} bytes at {}", size, current_backtrace());
1790 }
1791
1792 auto lvl = log_level::debug;
1793 switch (dump_diagnostics_on_alloc_failure_kind.load(std::memory_order_relaxed)) {
1794 case alloc_failure_kind::none:
1795 lvl = log_level::debug;
1796 break;
1797 case alloc_failure_kind::critical:
1798 lvl = is_critical_alloc_section() ? log_level::error : log_level::debug;
1799 break;
1800 case alloc_failure_kind::all:
1801 lvl = log_level::error;
1802 break;
1803 }
1804
1805 if (is_aborting) {
1806 // if we are about to abort, always report the memory diagnostics at error level
1807 lvl = log_level::error;
1808 }
1809
1810 static thread_local logger::rate_limit rate_limit(std::chrono::seconds(10));
1811 dump_memory_diagnostics(lvl, rate_limit);
1814 }
1815
1816 void on_allocation_failure(size_t size) {
1817 alloc_stats::increment(alloc_stats::types::failed_allocs);
1818
1819 bool will_abort = !abort_on_alloc_failure_suppressed
1820 && abort_on_allocation_failure.load(std::memory_order_relaxed);
1821
1822 maybe_dump_memory_diagnostics(size, will_abort);
1823
1824 if (will_abort) {
1825 seastar_logger.error("Failed to allocate {} bytes", size);
1826 abort();
1827 }
1828 }
1829
1830 sstring generate_memory_diagnostics_report() {
1831 seastar::internal::log_buf buf;
1832 auto it = buf.back_insert_begin();
1833 do_dump_memory_diagnostics(it);
1834 return sstring(buf.data(), buf.size());
1835 }
1836
1837 static void trigger_error_injector() {
1838 on_alloc_point();
1839 }
1840
1841 static bool try_trigger_error_injector() {
1842 try {
1843 on_alloc_point();
1844 return false;
1845 } catch (...) {
1846 return true;
1847 }
1848 }
1849
1850 }
1851
1852 }
1853
1854 using namespace seastar::memory;
1855
1856 extern "C"
1857 [[gnu::visibility("default")]]
1858 [[gnu::used]]
1859 void* malloc(size_t n) throw () {
1860 if (try_trigger_error_injector()) {
1861 return nullptr;
1862 }
1863 return allocate(n);
1864 }
1865
1866 extern "C"
1867 [[gnu::alias("malloc")]]
1868 [[gnu::visibility("default")]]
1869 [[gnu::malloc]]
1870 [[gnu::alloc_size(1)]]
1871 #ifndef __clang__
1872 [[gnu::leaf]]
1873 #endif
1874 void* __libc_malloc(size_t n) throw ();
1875
1876 extern "C"
1877 [[gnu::visibility("default")]]
1878 [[gnu::used]]
1879 void free(void* ptr) {
1880 if (ptr) {
1881 seastar::memory::free(ptr);
1882 }
1883 }
1884
1885 extern "C"
1886 [[gnu::alias("free")]]
1887 [[gnu::visibility("default")]]
1888 #ifndef __clang__
1889 [[gnu::leaf]]
1890 #endif
1891 void __libc_free(void* obj) throw ();
1892
1893 extern "C"
1894 [[gnu::visibility("default")]]
1895 void* calloc(size_t nmemb, size_t size) {
1896 if (try_trigger_error_injector()) {
1897 return nullptr;
1898 }
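// Compute nmemb * size in 128-bit arithmetic so that overflow is caught by the
// assert below instead of silently allocating a truncated size.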
1899 auto s1 = __int128(nmemb) * __int128(size);
1900 assert(s1 == size_t(s1));
1901 size_t s = s1;
1902 auto p = malloc(s);
1903 if (p) {
1904 std::memset(p, 0, s);
1905 }
1906 return p;
1907 }
1908
1909 extern "C"
1910 [[gnu::alias("calloc")]]
1911 [[gnu::visibility("default")]]
1912 [[gnu::alloc_size(1, 2)]]
1913 [[gnu::malloc]]
1914 #ifndef __clang__
1915 [[gnu::leaf]]
1916 #endif
1917 void* __libc_calloc(size_t n, size_t m) throw ();
1918
1919 extern "C"
1920 [[gnu::visibility("default")]]
1921 void* realloc(void* ptr, size_t size) {
1922 if (try_trigger_error_injector()) {
1923 return nullptr;
1924 }
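// Dispatch: a null ptr behaves like malloc(); memory not owned by the seastar
// allocator is forwarded to the original realloc (never on a reactor thread);
// seastar-owned memory is shrunk in place or reallocated and copied below.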
1925 if (ptr == nullptr) {
1926 // https://en.cppreference.com/w/cpp/memory/c/realloc
1927 // If ptr is a null pointer, the behavior is the same as calling std::malloc(new_size).
1928 return malloc(size);
1929 } else if (!is_seastar_memory(ptr)) {
1930 // we can't realloc foreign memory on a shard
1931 if (is_reactor_thread) {
1932 abort();
1933 }
1934 // original_realloc_func might still be null if an earlier constructor allocated before it was resolved
1935 if (original_realloc_func) {
1936 return original_realloc_func(ptr, size);
1937 }
1938 }
1939 // if we're here, it's a non-null seastar memory ptr
1940 // or original functions aren't available.
1941 // at any rate, using the seastar allocator is OK now.
1942 auto old_size = ptr ? object_size(ptr) : 0;
1943 if (size == old_size) {
1944 return ptr;
1945 }
1946 if (size == 0) {
1947 ::free(ptr);
1948 return nullptr;
1949 }
1950 if (size < old_size) {
1951 seastar::memory::shrink(ptr, size);
1952 return ptr;
1953 }
1954 auto nptr = malloc(size);
1955 if (!nptr) {
1956 return nptr;
1957 }
1958 if (ptr) {
1959 std::memcpy(nptr, ptr, std::min(size, old_size));
1960 ::free(ptr);
1961 }
1962 return nptr;
1963 }
1964
1965 extern "C"
1966 [[gnu::alias("realloc")]]
1967 [[gnu::visibility("default")]]
1968 [[gnu::alloc_size(2)]]
1969 #ifndef __clang__
1970 [[gnu::leaf]]
1971 #endif
1972 void* __libc_realloc(void* obj, size_t size) throw ();
1973
1974 extern "C"
1975 [[gnu::visibility("default")]]
1976 [[gnu::used]]
1977 #ifndef __clang__
1978 [[gnu::leaf]]
1979 #endif
1980 [[gnu::nonnull(1)]]
1981 int posix_memalign(void** ptr, size_t align, size_t size) throw () {
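// posix_memalign() reports failure through its return value (ENOMEM here), not via an
// exception, so the error injector is mapped to ENOMEM as well.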
1982 if (try_trigger_error_injector()) {
1983 return ENOMEM;
1984 }
1985 *ptr = allocate_aligned(align, size);
1986 if (!*ptr) {
1987 return ENOMEM;
1988 }
1989 return 0;
1990 }
1991
1992 extern "C"
1993 [[gnu::alias("posix_memalign")]]
1994 [[gnu::visibility("default")]]
1995 #ifndef __clang__
1996 [[gnu::leaf]]
1997 #endif
1998 [[gnu::nonnull(1)]]
1999 int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();
2000
2001 extern "C"
2002 [[gnu::visibility("default")]]
2003 [[gnu::malloc]]
2004 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
2005 [[gnu::alloc_size(2)]]
2006 #endif
2007 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 35)
2008 [[gnu::alloc_align(1)]]
2009 #endif
2010 void* memalign(size_t align, size_t size) throw () {
2011 if (try_trigger_error_injector()) {
2012 return nullptr;
2013 }
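// Round the requested size up to a multiple of the alignment before delegating to
// allocate_aligned().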
2014 size = seastar::align_up(size, align);
2015 return allocate_aligned(align, size);
2016 }
2017
2018 extern "C"
2019 [[gnu::visibility("default")]]
2020 void *aligned_alloc(size_t align, size_t size) throw () {
2021 if (try_trigger_error_injector()) {
2022 return nullptr;
2023 }
2024 return allocate_aligned(align, size);
2025 }
2026
2027 extern "C"
2028 [[gnu::alias("memalign")]]
2029 [[gnu::visibility("default")]]
2030 [[gnu::malloc]]
2031 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
2032 [[gnu::alloc_size(2)]]
2033 #endif
2034 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 35)
2035 [[gnu::alloc_align(1)]]
2036 #endif
2037 void* __libc_memalign(size_t align, size_t size) throw ();
2038
2039 extern "C"
2040 [[gnu::visibility("default")]]
2041 void cfree(void* obj) throw () {
2042 return ::free(obj);
2043 }
2044
2045 extern "C"
2046 [[gnu::alias("cfree")]]
2047 [[gnu::visibility("default")]]
2048 void __libc_cfree(void* obj) throw ();
2049
2050 extern "C"
2051 [[gnu::visibility("default")]]
2052 size_t malloc_usable_size(void* obj) {
2053 if (!is_seastar_memory(obj)) {
2054 return original_malloc_usable_size_func(obj);
2055 }
2056 return object_size(obj);
2057 }
2058
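// malloc_trim() is forwarded to libc only on non-reactor threads; on reactor threads
// it is a no-op (returns 0, i.e. no memory released), since trimming the seastar pool
// is not supported here.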
2059 extern "C"
2060 [[gnu::visibility("default")]]
2061 int malloc_trim(size_t pad) {
2062 if (!is_reactor_thread) {
2063 return original_malloc_trim_func(pad);
2064 }
2065 return 0;
2066 }
2067
2068 static inline
2069 void* throw_if_null(void* ptr) {
2070 if (!ptr) {
2071 throw std::bad_alloc();
2072 }
2073 return ptr;
2074 }
2075
2076 [[gnu::visibility("default")]]
2077 void* operator new(size_t size) {
2078 trigger_error_injector();
2079 if (size == 0) {
2080 size = 1;
2081 }
2082 return throw_if_null(allocate(size));
2083 }
2084
2085 [[gnu::visibility("default")]]
2086 void* operator new[](size_t size) {
2087 trigger_error_injector();
2088 if (size == 0) {
2089 size = 1;
2090 }
2091 return throw_if_null(allocate(size));
2092 }
2093
2094 [[gnu::visibility("default")]]
2095 void operator delete(void* ptr) throw () {
2096 if (ptr) {
2097 seastar::memory::free(ptr);
2098 }
2099 }
2100
2101 [[gnu::visibility("default")]]
2102 void operator delete[](void* ptr) throw () {
2103 if (ptr) {
2104 seastar::memory::free(ptr);
2105 }
2106 }
2107
2108 [[gnu::visibility("default")]]
2109 void operator delete(void* ptr, size_t size) throw () {
2110 if (ptr) {
2111 seastar::memory::free(ptr, size);
2112 }
2113 }
2114
2115 [[gnu::visibility("default")]]
2116 void operator delete[](void* ptr, size_t size) throw () {
2117 if (ptr) {
2118 seastar::memory::free(ptr, size);
2119 }
2120 }
2121
2122 [[gnu::visibility("default")]]
2123 void* operator new(size_t size, std::nothrow_t) throw () {
2124 if (try_trigger_error_injector()) {
2125 return nullptr;
2126 }
2127 if (size == 0) {
2128 size = 1;
2129 }
2130 return allocate(size);
2131 }
2132
2133 [[gnu::visibility("default")]]
2134 void* operator new[](size_t size, std::nothrow_t) throw () {
if (try_trigger_error_injector()) {
return nullptr;
}
2135 if (size == 0) {
2136 size = 1;
2137 }
2138 return allocate(size);
2139 }
2140
2141 [[gnu::visibility("default")]]
2142 void operator delete(void* ptr, std::nothrow_t) throw () {
2143 if (ptr) {
2144 seastar::memory::free(ptr);
2145 }
2146 }
2147
2148 [[gnu::visibility("default")]]
2149 void operator delete[](void* ptr, std::nothrow_t) throw () {
2150 if (ptr) {
2151 seastar::memory::free(ptr);
2152 }
2153 }
2154
2155 [[gnu::visibility("default")]]
2156 void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
2157 if (ptr) {
2158 seastar::memory::free(ptr, size);
2159 }
2160 }
2161
2162 [[gnu::visibility("default")]]
2163 void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
2164 if (ptr) {
2165 seastar::memory::free(ptr, size);
2166 }
2167 }
2168
2169 #ifdef __cpp_aligned_new
2170
2171 [[gnu::visibility("default")]]
2172 void* operator new(size_t size, std::align_val_t a) {
2173 trigger_error_injector();
2174 auto ptr = allocate_aligned(size_t(a), size);
2175 return throw_if_null(ptr);
2176 }
2177
2178 [[gnu::visibility("default")]]
2179 void* operator new[](size_t size, std::align_val_t a) {
2180 trigger_error_injector();
2181 auto ptr = allocate_aligned(size_t(a), size);
2182 return throw_if_null(ptr);
2183 }
2184
2185 [[gnu::visibility("default")]]
2186 void* operator new(size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
2187 if (try_trigger_error_injector()) {
2188 return nullptr;
2189 }
2190 return allocate_aligned(size_t(a), size);
2191 }
2192
2193 [[gnu::visibility("default")]]
2194 void* operator new[](size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
2195 if (try_trigger_error_injector()) {
2196 return nullptr;
2197 }
2198 return allocate_aligned(size_t(a), size);
2199 }
2200
2201
2202 [[gnu::visibility("default")]]
2203 void operator delete(void* ptr, std::align_val_t a) noexcept {
2204 if (ptr) {
2205 seastar::memory::free(ptr);
2206 }
2207 }
2208
2209 [[gnu::visibility("default")]]
2210 void operator delete[](void* ptr, std::align_val_t a) noexcept {
2211 if (ptr) {
2212 seastar::memory::free(ptr);
2213 }
2214 }
2215
2216 [[gnu::visibility("default")]]
2217 void operator delete(void* ptr, size_t size, std::align_val_t a) noexcept {
2218 if (ptr) {
2219 seastar::memory::free_aligned(ptr, size_t(a), size);
2220 }
2221 }
2222
2223 [[gnu::visibility("default")]]
2224 void operator delete[](void* ptr, size_t size, std::align_val_t a) noexcept {
2225 if (ptr) {
2226 seastar::memory::free_aligned(ptr, size_t(a), size);
2227 }
2228 }
2229
2230 [[gnu::visibility("default")]]
2231 void operator delete(void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
2232 if (ptr) {
2233 seastar::memory::free(ptr);
2234 }
2235 }
2236
2237 [[gnu::visibility("default")]]
2238 void operator delete[](void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
2239 if (ptr) {
2240 seastar::memory::free(ptr);
2241 }
2242 }
2243
2244 #endif
2245
2246 namespace seastar {
2247
2248 #else
2249
2250 namespace seastar {
2251
2252 namespace memory {
2253
2254 disable_backtrace_temporarily::disable_backtrace_temporarily() {
2255 (void)_old;
2256 }
2257
2258 disable_backtrace_temporarily::~disable_backtrace_temporarily() {
2259 }
2260
2261 void set_heap_profiling_enabled(bool enabled) {
2262 seastar_logger.warn("Seastar compiled with default allocator, heap profiler not supported");
2263 }
2264
2265 scoped_heap_profiling::scoped_heap_profiling() noexcept {
2266 set_heap_profiling_enabled(true); // let it print the warning
2267 }
2268
2269 scoped_heap_profiling::~scoped_heap_profiling() {
2270 }
2271
2272 void enable_abort_on_allocation_failure() {
2273 seastar_logger.warn("Seastar compiled with default allocator, will not abort on bad_alloc");
2274 }
2275
2276 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope) {
2277 }
2278
2279 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope) {
2280 }
2281
2282 reclaimer::~reclaimer() {
2283 }
2284
2285 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
2286 }
2287
2288 void configure(std::vector<resource::memory> m, bool mbind, std::optional<std::string> hugepages_path) {
2289 }
2290
2291 statistics stats() {
2292 return statistics{0, 0, 0, 1 << 30, 1 << 30, 0, 0, 0, 0, 0, 0};
2293 }
2294
2295 size_t free_memory() {
2296 return stats().free_memory();
2297 }
2298
2299 bool drain_cross_cpu_freelist() {
2300 return false;
2301 }
2302
2303 memory_layout get_memory_layout() {
2304 throw std::runtime_error("get_memory_layout() not supported");
2305 }
2306
2307 size_t min_free_memory() {
2308 return 0;
2309 }
2310
2311 void set_min_free_pages(size_t pages) {
2312 // Ignore, reclaiming not supported for default allocator.
2313 }
2314
2315 void set_large_allocation_warning_threshold(size_t) {
2316 // Ignore, not supported for default allocator.
2317 }
2318
2319 size_t get_large_allocation_warning_threshold() {
2320 // Ignore, not supported for default allocator.
2321 return std::numeric_limits<size_t>::max();
2322 }
2323
2324 void disable_large_allocation_warning() {
2325 // Ignore, not supported for default allocator.
2326 }
2327
2328
2329 void set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind) {
2330 // Ignore, not supported for default allocator.
2331 }
2332
2333 void set_dump_memory_diagnostics_on_alloc_failure_kind(std::string_view) {
2334 // Ignore, not supported for default allocator.
2335 }
2336
2337 void set_additional_diagnostics_producer(noncopyable_function<void(memory_diagnostics_writer)>) {
2338 // Ignore, not supported for default allocator.
2339 }
2340
2341 sstring generate_memory_diagnostics_report() {
2342 // Ignore, not supported for default allocator.
2343 return {};
2344 }
2345
2346 }
2347
2348 }
2349
2350 namespace seastar {
2351
2352 #endif
2353
2354 /// \endcond
2355
2356 }