1 /*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18 /*
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
20 */
21
22
23 /// \cond internal
24
25 //
26 // Seastar memory allocator
27 //
28 // This is a share-nothing allocator (memory allocated on one cpu must
29 // be freed on the same cpu).
30 //
31 // Inspired by gperftools' tcmalloc.
32 //
33 // Memory map:
34 //
35 // 0x0000'sccc'vvvv'vvvv
36 //
37 // 0000 - required by architecture (only 48 bits of address space)
38 // s - chosen to satisfy system allocator (1-7)
// ccc  - cpu number (0-12 bits; how many are allocated varies with the system)
// v    - virtual address within cpu (32-44 bits, according to how much ccc
//        leaves us)
42 //
43 // Each page has a page structure that describes it. Within a cpu's
44 // memory pool, the page array starts at offset 0, describing all pages
45 // within that pool. Page 0 does not describe a valid page.
46 //
47 // Each pool can contain at most 2^32 pages (or 44 address bits), so we can
48 // use a 32-bit integer to identify a page.
49 //
50 // Runs of pages are organized into spans. Free spans are organized into lists,
51 // by size. When spans are broken up or coalesced, they may move into new lists.
52 // Spans have a size that is a power-of-two and are naturally aligned (aka buddy
53 // allocator)
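//
// Worked example (illustrative; assumes the cpu_id_shift of 36 and the 8-bit
// cpu field configured below): cpu 3's pool starts at mem_base() + (3 << 36),
// so its addresses have the form 0x0000's030'0000'0000 .. 0x0000's03f'ffff'ffff
// (a 64 GiB window), and object_cpu_id() recovers the cpu number by shifting
// a pointer right by 36 bits and masking with 0xff.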
54
55 #include <seastar/core/cacheline.hh>
56 #include <seastar/core/memory.hh>
57 #include <seastar/core/reactor.hh>
58 #include <seastar/core/print.hh>
59 #include <seastar/util/alloc_failure_injector.hh>
60 #include <seastar/util/std-compat.hh>
61 #include <iostream>
62
63 namespace seastar {
64
65 void* internal::allocate_aligned_buffer_impl(size_t size, size_t align) {
66 void *ret;
67 auto r = posix_memalign(&ret, align, size);
68 if (r == ENOMEM) {
69 throw std::bad_alloc();
70 } else if (r == EINVAL) {
71 throw std::runtime_error(format("Invalid alignment of {:d}; allocating {:d} bytes", align, size));
72 } else {
73 assert(r == 0);
74 return ret;
75 }
76 }
77
78 namespace memory {
79
80 static thread_local int abort_on_alloc_failure_suppressed = 0;
81
82 disable_abort_on_alloc_failure_temporarily::disable_abort_on_alloc_failure_temporarily() {
83 ++abort_on_alloc_failure_suppressed;
84 }
85
86 disable_abort_on_alloc_failure_temporarily::~disable_abort_on_alloc_failure_temporarily() noexcept {
87 --abort_on_alloc_failure_suppressed;
88 }
89
static compat::polymorphic_allocator<char> static_malloc_allocator{compat::pmr_get_default_resource()};
91 compat::polymorphic_allocator<char>* malloc_allocator{&static_malloc_allocator};
92
93 }
94
95 }
96
97 #ifndef SEASTAR_DEFAULT_ALLOCATOR
98
99 #include <seastar/core/bitops.hh>
100 #include <seastar/core/align.hh>
101 #include <seastar/core/posix.hh>
102 #include <seastar/core/shared_ptr.hh>
103 #include <new>
104 #include <cstdint>
105 #include <algorithm>
106 #include <limits>
107 #include <cassert>
108 #include <atomic>
109 #include <mutex>
110 #include <seastar/util/std-compat.hh>
111 #include <functional>
112 #include <cstring>
113 #include <boost/intrusive/list.hpp>
114 #include <sys/mman.h>
115 #include <seastar/util/defer.hh>
116 #include <seastar/util/backtrace.hh>
117
118 #ifdef SEASTAR_HAVE_NUMA
119 #include <numaif.h>
120 #endif
121
122 namespace seastar {
123
124 struct allocation_site {
125 mutable size_t count = 0; // number of live objects allocated at backtrace.
126 mutable size_t size = 0; // amount of bytes in live objects allocated at backtrace.
127 mutable const allocation_site* next = nullptr;
128 saved_backtrace backtrace;
129
130 bool operator==(const allocation_site& o) const {
131 return backtrace == o.backtrace;
132 }
133
134 bool operator!=(const allocation_site& o) const {
135 return !(*this == o);
136 }
137 };
138
139 }
140
141 namespace std {
142
143 template<>
144 struct hash<seastar::allocation_site> {
145 size_t operator()(const seastar::allocation_site& bi) const {
146 return std::hash<seastar::saved_backtrace>()(bi.backtrace);
147 }
148 };
149
150 }
151
152 namespace seastar {
153
154 using allocation_site_ptr = const allocation_site*;
155
156 namespace memory {
157
158 seastar::logger seastar_memory_logger("seastar_memory");
159
160 [[gnu::unused]]
161 static allocation_site_ptr get_allocation_site();
162
163 static void on_allocation_failure(size_t size);
164
165 static constexpr unsigned cpu_id_shift = 36; // FIXME: make dynamic
166 static constexpr unsigned max_cpus = 256;
167
168 using pageidx = uint32_t;
169
170 struct page;
171 class page_list;
172
173 static std::atomic<bool> live_cpus[max_cpus];
174
175 static thread_local uint64_t g_allocs;
176 static thread_local uint64_t g_frees;
177 static thread_local uint64_t g_cross_cpu_frees;
178 static thread_local uint64_t g_reclaims;
179 static thread_local uint64_t g_large_allocs;
180
181 using compat::optional;
182
183 using allocate_system_memory_fn
184 = std::function<mmap_area (optional<void*> where, size_t how_much)>;
185
186 namespace bi = boost::intrusive;
187
188 inline
189 unsigned object_cpu_id(const void* ptr) {
190 return (reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff;
191 }
192
193 class page_list_link {
194 uint32_t _prev;
195 uint32_t _next;
196 friend class page_list;
197 friend void on_allocation_failure(size_t);
198 };
199
// Reserve (PROT_NONE) a 2^44-byte region aligned to 2^44 bytes; each cpu's
// pool is later carved out of it at mem_base() + (cpu_id << cpu_id_shift).
static char* mem_base() {
201 static char* known;
202 static std::once_flag flag;
203 std::call_once(flag, [] {
204 size_t alloc = size_t(1) << 44;
205 auto r = ::mmap(NULL, 2 * alloc,
206 PROT_NONE,
207 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
208 -1, 0);
209 if (r == MAP_FAILED) {
210 abort();
211 }
212 ::madvise(r, 2 * alloc, MADV_DONTDUMP);
213 auto cr = reinterpret_cast<char*>(r);
214 known = align_up(cr, alloc);
215 ::munmap(cr, known - cr);
216 ::munmap(known + alloc, cr + 2 * alloc - (known + alloc));
217 });
218 return known;
219 }
220
221 constexpr bool is_page_aligned(size_t size) {
222 return (size & (page_size - 1)) == 0;
223 }
224
225 constexpr size_t next_page_aligned(size_t size) {
226 return (size + (page_size - 1)) & ~(page_size - 1);
227 }
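
// A few illustrative compile-time checks of the alignment helpers above:
static_assert(next_page_aligned(1) == page_size, "");
static_assert(next_page_aligned(page_size) == page_size, "");
static_assert(is_page_aligned(2 * page_size) && !is_page_aligned(page_size + 1), "");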
228
229 class small_pool;
230
231 struct free_object {
232 free_object* next;
233 };
234
235 struct page {
236 bool free;
237 uint8_t offset_in_span;
238 uint16_t nr_small_alloc;
239 uint32_t span_size; // in pages, if we're the head or the tail
240 page_list_link link;
241 small_pool* pool; // if used in a small_pool
242 free_object* freelist;
243 #ifdef SEASTAR_HEAPPROF
    allocation_site_ptr alloc_site; // for objects whose size is a multiple of page size; valid for the head page only
245 #endif
246 };
247
248 class page_list {
249 uint32_t _front = 0;
250 uint32_t _back = 0;
251 public:
252 page& front(page* ary) { return ary[_front]; }
253 page& back(page* ary) { return ary[_back]; }
254 bool empty() const { return !_front; }
255 void erase(page* ary, page& span) {
256 if (span.link._next) {
257 ary[span.link._next].link._prev = span.link._prev;
258 } else {
259 _back = span.link._prev;
260 }
261 if (span.link._prev) {
262 ary[span.link._prev].link._next = span.link._next;
263 } else {
264 _front = span.link._next;
265 }
266 }
267 void push_front(page* ary, page& span) {
268 auto idx = &span - ary;
269 if (_front) {
270 ary[_front].link._prev = idx;
271 } else {
272 _back = idx;
273 }
274 span.link._next = _front;
275 span.link._prev = 0;
276 _front = idx;
277 }
278 void pop_front(page* ary) {
279 if (ary[_front].link._next) {
280 ary[ary[_front].link._next].link._prev = 0;
281 } else {
282 _back = 0;
283 }
284 _front = ary[_front].link._next;
285 }
286 friend void on_allocation_failure(size_t);
287 };
288
289 class small_pool {
290 struct span_sizes {
291 uint8_t preferred;
292 uint8_t fallback;
293 };
294 unsigned _object_size;
295 span_sizes _span_sizes;
296 free_object* _free = nullptr;
297 size_t _free_count = 0;
298 unsigned _min_free;
299 unsigned _max_free;
300 unsigned _pages_in_use = 0;
301 page_list _span_list;
302 static constexpr unsigned idx_frac_bits = 2;
303 public:
304 explicit small_pool(unsigned object_size) noexcept;
305 ~small_pool();
306 void* allocate();
307 void deallocate(void* object);
308 unsigned object_size() const { return _object_size; }
309 bool objects_page_aligned() const { return is_page_aligned(_object_size); }
310 static constexpr unsigned size_to_idx(unsigned size);
311 static constexpr unsigned idx_to_size(unsigned idx);
312 allocation_site_ptr& alloc_site_holder(void* ptr);
313 private:
314 void add_more_objects();
315 void trim_free_list();
316 friend void on_allocation_failure(size_t);
317 };
318
// index 0b0001'0011 -> size (1 << 4) + (0b11 << (4 - 2))
320
321 constexpr unsigned
322 small_pool::idx_to_size(unsigned idx) {
323 return (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
324 << (idx >> idx_frac_bits))
325 >> idx_frac_bits;
326 }
327
328 constexpr unsigned
329 small_pool::size_to_idx(unsigned size) {
330 return ((log2floor(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
331 + ((size - 1) >> (log2floor(size) - idx_frac_bits));
332 }
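
// Illustrative compile-time spot checks of the encoding above: size
// 28 == (1 << 4) + (0b11 << 2) corresponds to index 0b0001'0011 == 19, and a
// power-of-two size such as 1024 round-trips exactly.
static_assert(small_pool::idx_to_size(19) == 28, "");
static_assert(small_pool::size_to_idx(28) == 19, "");
static_assert(small_pool::idx_to_size(small_pool::size_to_idx(1024)) == 1024, "");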
333
334 class small_pool_array {
335 public:
336 static constexpr unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
337 private:
338 union u {
339 small_pool a[nr_small_pools];
340 u() {
341 for (unsigned i = 0; i < nr_small_pools; ++i) {
342 new (&a[i]) small_pool(small_pool::idx_to_size(i));
343 }
344 }
345 ~u() {
346 // cannot really call destructor, since other
347 // objects may be freed after we are gone.
348 }
349 } _u;
350 public:
351 small_pool& operator[](unsigned idx) { return _u.a[idx]; }
352 };
353
354 static constexpr size_t max_small_allocation
355 = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);
356
357 constexpr size_t object_size_with_alloc_site(size_t size) {
358 #ifdef SEASTAR_HEAPPROF
359 // For page-aligned sizes, allocation_site* lives in page::alloc_site, not with the object.
360 static_assert(is_page_aligned(max_small_allocation), "assuming that max_small_allocation is page aligned so that we"
361 " don't need to add allocation_site_ptr to objects of size close to it");
362 size_t next_page_aligned_size = next_page_aligned(size);
363 if (next_page_aligned_size - size > sizeof(allocation_site_ptr)) {
364 size += sizeof(allocation_site_ptr);
365 } else {
366 return next_page_aligned_size;
367 }
368 #endif
369 return size;
370 }
371
372 #ifdef SEASTAR_HEAPPROF
373 // Ensure that object_size_with_alloc_site() does not exceed max_small_allocation
374 static_assert(object_size_with_alloc_site(max_small_allocation) == max_small_allocation, "");
375 static_assert(object_size_with_alloc_site(max_small_allocation - 1) == max_small_allocation, "");
376 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) + 1) == max_small_allocation, "");
377 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr)) == max_small_allocation, "");
378 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 1) == max_small_allocation - 1, "");
379 static_assert(object_size_with_alloc_site(max_small_allocation - sizeof(allocation_site_ptr) - 2) == max_small_allocation - 2, "");
380 #endif
381
382 struct cross_cpu_free_item {
383 cross_cpu_free_item* next;
384 };
385
386 struct cpu_pages {
387 uint32_t min_free_pages = 20000000 / page_size;
388 char* memory;
389 page* pages;
390 uint32_t nr_pages;
391 uint32_t nr_free_pages;
392 uint32_t current_min_free_pages = 0;
393 size_t large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
394 unsigned cpu_id = -1U;
395 std::function<void (std::function<void ()>)> reclaim_hook;
396 std::vector<reclaimer*> reclaimers;
397 static constexpr unsigned nr_span_lists = 32;
398 page_list free_spans[nr_span_lists]; // contains aligned spans with span_size == 2^idx
399 small_pool_array small_pools;
400 alignas(seastar::cache_line_size) std::atomic<cross_cpu_free_item*> xcpu_freelist;
401 static std::atomic<unsigned> cpu_id_gen;
402 static cpu_pages* all_cpus[max_cpus];
403 union asu {
404 using alloc_sites_type = std::unordered_set<allocation_site>;
405 asu() : alloc_sites{} {
406 }
407 ~asu() {} // alloc_sites live forever
408 alloc_sites_type alloc_sites;
409 } asu;
410 allocation_site_ptr alloc_site_list_head = nullptr; // For easy traversal of asu.alloc_sites from scylla-gdb.py
411 bool collect_backtrace = false;
412 char* mem() { return memory; }
413
414 void link(page_list& list, page* span);
415 void unlink(page_list& list, page* span);
416 struct trim {
417 unsigned offset;
418 unsigned nr_pages;
419 };
420 void maybe_reclaim();
421 void* allocate_large_and_trim(unsigned nr_pages);
422 void* allocate_large(unsigned nr_pages);
423 void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
424 page* find_and_unlink_span(unsigned nr_pages);
425 page* find_and_unlink_span_reclaiming(unsigned n_pages);
426 void free_large(void* ptr);
427 bool grow_span(pageidx& start, uint32_t& nr_pages, unsigned idx);
428 void free_span(pageidx start, uint32_t nr_pages);
429 void free_span_no_merge(pageidx start, uint32_t nr_pages);
430 void free_span_unaligned(pageidx start, uint32_t nr_pages);
431 void* allocate_small(unsigned size);
432 void free(void* ptr);
433 void free(void* ptr, size_t size);
434 bool try_cross_cpu_free(void* ptr);
435 void shrink(void* ptr, size_t new_size);
436 void free_cross_cpu(unsigned cpu_id, void* ptr);
437 bool drain_cross_cpu_freelist();
438 size_t object_size(void* ptr);
439 page* to_page(void* p) {
440 return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
441 }
442
443 bool is_initialized() const;
444 bool initialize();
445 reclaiming_result run_reclaimers(reclaimer_scope, size_t pages_to_reclaim);
446 void schedule_reclaim();
447 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
448 void set_min_free_pages(size_t pages);
449 void resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
450 void do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
451 void replace_memory_backing(allocate_system_memory_fn alloc_sys_mem);
452 void check_large_allocation(size_t size);
453 void warn_large_allocation(size_t size);
454 memory::memory_layout memory_layout();
455 ~cpu_pages();
456 };
457
458 static thread_local cpu_pages cpu_mem;
459 std::atomic<unsigned> cpu_pages::cpu_id_gen;
460 cpu_pages* cpu_pages::all_cpus[max_cpus];
461
462 #ifdef SEASTAR_HEAPPROF
463
464 void set_heap_profiling_enabled(bool enable) {
465 bool is_enabled = cpu_mem.collect_backtrace;
466 if (enable) {
467 if (!is_enabled) {
468 seastar_logger.info("Enabling heap profiler");
469 }
470 } else {
471 if (is_enabled) {
472 seastar_logger.info("Disabling heap profiler");
473 }
474 }
475 cpu_mem.collect_backtrace = enable;
476 }
477
478 static thread_local int64_t scoped_heap_profiling_embed_count = 0;
479
480 scoped_heap_profiling::scoped_heap_profiling() noexcept {
481 ++scoped_heap_profiling_embed_count;
482 set_heap_profiling_enabled(true);
483 }
484
485 scoped_heap_profiling::~scoped_heap_profiling() {
486 if (!--scoped_heap_profiling_embed_count) {
487 set_heap_profiling_enabled(false);
488 }
489 }
490
491 #else
492
493 void set_heap_profiling_enabled(bool enable) {
494 seastar_logger.warn("Seastar compiled without heap profiling support, heap profiler not supported;"
495 " compile with the Seastar_HEAP_PROFILING=ON CMake option to add heap profiling support");
496 }
497
498 scoped_heap_profiling::scoped_heap_profiling() noexcept {
499 set_heap_profiling_enabled(true); // let it print the warning
500 }
501
502 scoped_heap_profiling::~scoped_heap_profiling() {
503 }
504
505 #endif
506
507 // Smallest index i such that all spans stored in the index are >= pages.
508 static inline
509 unsigned index_of(unsigned pages) {
510 if (pages == 1) {
511 return 0;
512 }
513 return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages - 1);
514 }
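
// For example: index_of(1) == 0, index_of(4) == 2 and index_of(5) == 3;
// free_spans[i] (below) only ever holds spans of exactly 1 << i pages, so
// searching from index_of(n) upwards always yields a span large enough.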
515
516 void
517 cpu_pages::unlink(page_list& list, page* span) {
518 list.erase(pages, *span);
519 }
520
521 void
522 cpu_pages::link(page_list& list, page* span) {
523 list.push_front(pages, *span);
524 }
525
526 void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
527 assert(nr_pages);
528 nr_free_pages += nr_pages;
529 auto span = &pages[span_start];
530 auto span_end = &pages[span_start + nr_pages - 1];
531 span->free = span_end->free = true;
532 span->span_size = span_end->span_size = nr_pages;
533 auto idx = index_of(nr_pages);
534 link(free_spans[idx], span);
535 }
536
537 bool cpu_pages::grow_span(uint32_t& span_start, uint32_t& nr_pages, unsigned idx) {
538 auto which = (span_start >> idx) & 1; // 0=lower, 1=upper
539 // locate first page of upper buddy or last page of lower buddy
540 // examples: span_start = 0x10 nr_pages = 0x08 -> buddy = 0x18 (which = 0)
541 // span_start = 0x18 nr_pages = 0x08 -> buddy = 0x17 (which = 1)
542 // delta = which ? -1u : nr_pages
543 auto delta = ((which ^ 1) << idx) | -which;
544 auto buddy = span_start + delta;
545 if (pages[buddy].free && pages[buddy].span_size == nr_pages) {
546 unlink(free_spans[idx], &pages[span_start ^ nr_pages]);
547 nr_free_pages -= nr_pages; // free_span_no_merge() will restore
548 span_start &= ~nr_pages;
549 nr_pages *= 2;
550 return true;
551 }
552 return false;
553 }
554
555 void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
556 auto idx = index_of(nr_pages);
557 while (grow_span(span_start, nr_pages, idx)) {
558 ++idx;
559 }
560 free_span_no_merge(span_start, nr_pages);
561 }
562
563 // Internal, used during startup. Span is not aligned so needs to be broken up
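// A worked example of the splitting below: span_start = 6, nr_pages = 10 is
// released as the naturally-aligned power-of-two spans [6, 8) and [8, 16).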
564 void cpu_pages::free_span_unaligned(uint32_t span_start, uint32_t nr_pages) {
565 while (nr_pages) {
566 auto start_nr_bits = span_start ? count_trailing_zeros(span_start) : 32;
567 auto size_nr_bits = count_trailing_zeros(nr_pages);
568 auto now = 1u << std::min(start_nr_bits, size_nr_bits);
569 free_span(span_start, now);
570 span_start += now;
571 nr_pages -= now;
572 }
573 }
574
575 page*
576 cpu_pages::find_and_unlink_span(unsigned n_pages) {
577 auto idx = index_of(n_pages);
578 if (n_pages >= (2u << idx)) {
579 return nullptr;
580 }
581 while (idx < nr_span_lists && free_spans[idx].empty()) {
582 ++idx;
583 }
584 if (idx == nr_span_lists) {
585 if (initialize()) {
586 return find_and_unlink_span(n_pages);
587 }
588 return nullptr;
589 }
590 auto& list = free_spans[idx];
591 page* span = &list.front(pages);
592 unlink(list, span);
593 return span;
594 }
595
596 page*
597 cpu_pages::find_and_unlink_span_reclaiming(unsigned n_pages) {
598 while (true) {
599 auto span = find_and_unlink_span(n_pages);
600 if (span) {
601 return span;
602 }
603 if (run_reclaimers(reclaimer_scope::sync, n_pages) == reclaiming_result::reclaimed_nothing) {
604 return nullptr;
605 }
606 }
607 }
608
609 void cpu_pages::maybe_reclaim() {
610 if (nr_free_pages < current_min_free_pages) {
611 drain_cross_cpu_freelist();
612 if (nr_free_pages < current_min_free_pages) {
613 run_reclaimers(reclaimer_scope::sync, current_min_free_pages - nr_free_pages);
614 }
615 if (nr_free_pages < current_min_free_pages) {
616 schedule_reclaim();
617 }
618 }
619 }
620
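// Allocate a span of at least n_pages and give back the unused buddies.
// For example, a 3-page request served from a free 16-page span splits off
// and re-frees the upper 8-page and 4-page buddies, returning a 4-page span.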
621 void*
622 cpu_pages::allocate_large_and_trim(unsigned n_pages) {
623 // Avoid exercising the reclaimers for requests we'll not be able to satisfy
624 // nr_pages might be zero during startup, so check for that too
625 if (nr_pages && n_pages >= nr_pages) {
626 return nullptr;
627 }
628 page* span = find_and_unlink_span_reclaiming(n_pages);
629 if (!span) {
630 return nullptr;
631 }
632 auto span_size = span->span_size;
633 auto span_idx = span - pages;
634 nr_free_pages -= span->span_size;
635 while (span_size >= n_pages * 2) {
636 span_size /= 2;
637 auto other_span_idx = span_idx + span_size;
638 free_span_no_merge(other_span_idx, span_size);
639 }
640 auto span_end = &pages[span_idx + span_size - 1];
641 span->free = span_end->free = false;
642 span->span_size = span_end->span_size = span_size;
643 span->pool = nullptr;
644 #ifdef SEASTAR_HEAPPROF
645 auto alloc_site = get_allocation_site();
646 span->alloc_site = alloc_site;
647 if (alloc_site) {
648 ++alloc_site->count;
649 alloc_site->size += span->span_size * page_size;
650 }
651 #endif
652 maybe_reclaim();
653 return mem() + span_idx * page_size;
654 }
655
656 void
657 cpu_pages::warn_large_allocation(size_t size) {
658 ++g_large_allocs;
659 seastar_memory_logger.warn("oversized allocation: {} bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at {}", size, current_backtrace());
660 large_allocation_warning_threshold *= 1.618; // prevent spam
661 }
662
663 void
664 inline
665 cpu_pages::check_large_allocation(size_t size) {
666 if (size > large_allocation_warning_threshold) {
667 warn_large_allocation(size);
668 }
669 }
670
671 void*
672 cpu_pages::allocate_large(unsigned n_pages) {
673 check_large_allocation(n_pages * page_size);
674 return allocate_large_and_trim(n_pages);
675 }
676
677 void*
678 cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
679 check_large_allocation(n_pages * page_size);
680 // buddy allocation is always aligned
681 return allocate_large_and_trim(n_pages);
682 }
683
684 #ifdef SEASTAR_HEAPPROF
685
686 class disable_backtrace_temporarily {
687 bool _old;
688 public:
689 disable_backtrace_temporarily() {
690 _old = cpu_mem.collect_backtrace;
691 cpu_mem.collect_backtrace = false;
692 }
693 ~disable_backtrace_temporarily() {
694 cpu_mem.collect_backtrace = _old;
695 }
696 };
697
698 #else
699
700 struct disable_backtrace_temporarily {
701 ~disable_backtrace_temporarily() {}
702 };
703
704 #endif
705
706 static
707 saved_backtrace get_backtrace() noexcept {
708 disable_backtrace_temporarily dbt;
709 return current_backtrace();
710 }
711
712 static
713 allocation_site_ptr get_allocation_site() {
714 if (!cpu_mem.is_initialized() || !cpu_mem.collect_backtrace) {
715 return nullptr;
716 }
717 disable_backtrace_temporarily dbt;
718 allocation_site new_alloc_site;
719 new_alloc_site.backtrace = get_backtrace();
720 auto insert_result = cpu_mem.asu.alloc_sites.insert(std::move(new_alloc_site));
721 allocation_site_ptr alloc_site = &*insert_result.first;
722 if (insert_result.second) {
723 alloc_site->next = cpu_mem.alloc_site_list_head;
724 cpu_mem.alloc_site_list_head = alloc_site;
725 }
726 return alloc_site;
727 }
728
729 #ifdef SEASTAR_HEAPPROF
730
731 allocation_site_ptr&
732 small_pool::alloc_site_holder(void* ptr) {
733 if (objects_page_aligned()) {
734 return cpu_mem.to_page(ptr)->alloc_site;
735 } else {
736 return *reinterpret_cast<allocation_site_ptr*>(reinterpret_cast<char*>(ptr) + _object_size - sizeof(allocation_site_ptr));
737 }
738 }
739
740 #endif
741
742 void*
743 cpu_pages::allocate_small(unsigned size) {
744 auto idx = small_pool::size_to_idx(size);
745 auto& pool = small_pools[idx];
746 assert(size <= pool.object_size());
747 auto ptr = pool.allocate();
748 #ifdef SEASTAR_HEAPPROF
749 if (!ptr) {
750 return nullptr;
751 }
752 allocation_site_ptr alloc_site = get_allocation_site();
753 if (alloc_site) {
754 ++alloc_site->count;
755 alloc_site->size += pool.object_size();
756 }
757 new (&pool.alloc_site_holder(ptr)) allocation_site_ptr{alloc_site};
758 #endif
759 return ptr;
760 }
761
762 void cpu_pages::free_large(void* ptr) {
763 pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
764 page* span = &pages[idx];
765 #ifdef SEASTAR_HEAPPROF
766 auto alloc_site = span->alloc_site;
767 if (alloc_site) {
768 --alloc_site->count;
769 alloc_site->size -= span->span_size * page_size;
770 }
771 #endif
772 free_span(idx, span->span_size);
773 }
774
775 size_t cpu_pages::object_size(void* ptr) {
776 page* span = to_page(ptr);
777 if (span->pool) {
778 auto s = span->pool->object_size();
779 #ifdef SEASTAR_HEAPPROF
780 // We must not allow the object to be extended onto the allocation_site_ptr field.
781 if (!span->pool->objects_page_aligned()) {
782 s -= sizeof(allocation_site_ptr);
783 }
784 #endif
785 return s;
786 } else {
787 return size_t(span->span_size) * page_size;
788 }
789 }
790
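// Cross-cpu frees: the freeing cpu pushes the object onto the owning cpu's
// lock-free xcpu_freelist with a CAS on the list head (a Treiber stack); the
// owner later drains the whole list at once via exchange(nullptr) in
// drain_cross_cpu_freelist().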
791 void cpu_pages::free_cross_cpu(unsigned cpu_id, void* ptr) {
792 if (!live_cpus[cpu_id].load(std::memory_order_relaxed)) {
793 // Thread was destroyed; leak object
794 // should only happen for boost unit-tests.
795 return;
796 }
797 auto p = reinterpret_cast<cross_cpu_free_item*>(ptr);
798 auto& list = all_cpus[cpu_id]->xcpu_freelist;
799 auto old = list.load(std::memory_order_relaxed);
800 do {
801 p->next = old;
802 } while (!list.compare_exchange_weak(old, p, std::memory_order_release, std::memory_order_relaxed));
803 ++g_cross_cpu_frees;
804 }
805
806 bool cpu_pages::drain_cross_cpu_freelist() {
807 if (!xcpu_freelist.load(std::memory_order_relaxed)) {
808 return false;
809 }
810 auto p = xcpu_freelist.exchange(nullptr, std::memory_order_acquire);
811 while (p) {
812 auto n = p->next;
813 ++g_frees;
814 free(p);
815 p = n;
816 }
817 return true;
818 }
819
820 void cpu_pages::free(void* ptr) {
821 page* span = to_page(ptr);
822 if (span->pool) {
823 small_pool& pool = *span->pool;
824 #ifdef SEASTAR_HEAPPROF
825 allocation_site_ptr alloc_site = pool.alloc_site_holder(ptr);
826 if (alloc_site) {
827 --alloc_site->count;
828 alloc_site->size -= pool.object_size();
829 }
830 #endif
831 pool.deallocate(ptr);
832 } else {
833 free_large(ptr);
834 }
835 }
836
837 void cpu_pages::free(void* ptr, size_t size) {
    // match the size adjustments made in allocate() so we hit the right pool
839 if (size <= sizeof(free_object)) {
840 size = sizeof(free_object);
841 }
842 if (size <= max_small_allocation) {
843 size = object_size_with_alloc_site(size);
844 auto pool = &small_pools[small_pool::size_to_idx(size)];
845 #ifdef SEASTAR_HEAPPROF
846 allocation_site_ptr alloc_site = pool->alloc_site_holder(ptr);
847 if (alloc_site) {
848 --alloc_site->count;
849 alloc_site->size -= pool->object_size();
850 }
851 #endif
852 pool->deallocate(ptr);
853 } else {
854 free_large(ptr);
855 }
856 }
857
858 bool
859 cpu_pages::try_cross_cpu_free(void* ptr) {
860 auto obj_cpu = object_cpu_id(ptr);
861 if (obj_cpu != cpu_id) {
862 free_cross_cpu(obj_cpu, ptr);
863 return true;
864 }
865 return false;
866 }
867
868 void cpu_pages::shrink(void* ptr, size_t new_size) {
869 auto obj_cpu = object_cpu_id(ptr);
870 assert(obj_cpu == cpu_id);
871 page* span = to_page(ptr);
872 if (span->pool) {
873 return;
874 }
875 auto old_size_pages = span->span_size;
876 size_t new_size_pages = old_size_pages;
877 while (new_size_pages / 2 * page_size >= new_size) {
878 new_size_pages /= 2;
879 }
880 if (new_size_pages == old_size_pages) {
881 return;
882 }
883 #ifdef SEASTAR_HEAPPROF
884 auto alloc_site = span->alloc_site;
885 if (alloc_site) {
886 alloc_site->size -= span->span_size * page_size;
887 alloc_site->size += new_size_pages * page_size;
888 }
889 #endif
890 span->span_size = new_size_pages;
891 span[new_size_pages - 1].free = false;
892 span[new_size_pages - 1].span_size = new_size_pages;
893 pageidx idx = span - pages;
894 free_span_unaligned(idx + new_size_pages, old_size_pages - new_size_pages);
895 }
896
897 cpu_pages::~cpu_pages() {
898 live_cpus[cpu_id].store(false, std::memory_order_relaxed);
899 }
900
901 bool cpu_pages::is_initialized() const {
902 return bool(nr_pages);
903 }
904
905 bool cpu_pages::initialize() {
906 if (is_initialized()) {
907 return false;
908 }
909 cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
910 assert(cpu_id < max_cpus);
911 all_cpus[cpu_id] = this;
912 auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
913 auto size = 32 << 20; // Small size for bootstrap
914 auto r = ::mmap(base, size,
915 PROT_READ | PROT_WRITE,
916 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
917 -1, 0);
918 if (r == MAP_FAILED) {
919 abort();
920 }
921 ::madvise(base, size, MADV_HUGEPAGE);
922 pages = reinterpret_cast<page*>(base);
923 memory = base;
924 nr_pages = size / page_size;
925 // we reserve the end page so we don't have to special case
926 // the last span.
927 auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
928 reserved = 1u << log2ceil(reserved);
929 for (pageidx i = 0; i < reserved; ++i) {
930 pages[i].free = false;
931 }
932 pages[nr_pages].free = false;
933 free_span_unaligned(reserved, nr_pages - reserved);
934 live_cpus[cpu_id].store(true, std::memory_order_relaxed);
935 return true;
936 }
937
938 mmap_area
939 allocate_anonymous_memory(compat::optional<void*> where, size_t how_much) {
940 return mmap_anonymous(where.value_or(nullptr),
941 how_much,
942 PROT_READ | PROT_WRITE,
943 MAP_PRIVATE | (where ? MAP_FIXED : 0));
944 }
945
946 mmap_area
947 allocate_hugetlbfs_memory(file_desc& fd, compat::optional<void*> where, size_t how_much) {
948 auto pos = fd.size();
949 fd.truncate(pos + how_much);
950 auto ret = fd.map(
951 how_much,
952 PROT_READ | PROT_WRITE,
953 MAP_SHARED | MAP_POPULATE | (where ? MAP_FIXED : 0),
954 pos,
955 where.value_or(nullptr));
956 return ret;
957 }
958
959 void cpu_pages::replace_memory_backing(allocate_system_memory_fn alloc_sys_mem) {
960 // We would like to use ::mremap() to atomically replace the old anonymous
961 // memory with hugetlbfs backed memory, but mremap() does not support hugetlbfs
962 // (for no reason at all). So we must copy the anonymous memory to some other
963 // place, map hugetlbfs in place, and copy it back, without modifying it during
964 // the operation.
965 auto bytes = nr_pages * page_size;
966 auto old_mem = mem();
967 auto relocated_old_mem = mmap_anonymous(nullptr, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE);
968 std::memcpy(relocated_old_mem.get(), old_mem, bytes);
969 alloc_sys_mem({old_mem}, bytes).release();
970 std::memcpy(old_mem, relocated_old_mem.get(), bytes);
971 }
972
973 void cpu_pages::do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem) {
974 auto new_pages = new_size / page_size;
975 if (new_pages <= nr_pages) {
976 return;
977 }
978 auto old_size = nr_pages * page_size;
979 auto mmap_start = memory + old_size;
980 auto mmap_size = new_size - old_size;
981 auto mem = alloc_sys_mem({mmap_start}, mmap_size);
982 mem.release();
983 ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
984 // one past last page structure is a sentinel
985 auto new_page_array_pages = align_up(sizeof(page[new_pages + 1]), page_size) / page_size;
986 auto new_page_array
987 = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
988 if (!new_page_array) {
989 throw std::bad_alloc();
990 }
991 std::copy(pages, pages + nr_pages, new_page_array);
992 // mark new one-past-last page as taken to avoid boundary conditions
993 new_page_array[new_pages].free = false;
994 auto old_pages = reinterpret_cast<char*>(pages);
995 auto old_nr_pages = nr_pages;
996 auto old_pages_size = align_up(sizeof(page[nr_pages + 1]), page_size);
997 old_pages_size = size_t(1) << log2ceil(old_pages_size);
998 pages = new_page_array;
999 nr_pages = new_pages;
1000 auto old_pages_start = (old_pages - memory) / page_size;
1001 if (old_pages_start == 0) {
1002 // keep page 0 allocated
1003 old_pages_start = 1;
1004 old_pages_size -= page_size;
1005 }
1006 if (old_pages_size != 0) {
1007 free_span_unaligned(old_pages_start, old_pages_size / page_size);
1008 }
1009 free_span_unaligned(old_nr_pages, new_pages - old_nr_pages);
1010 }
1011
1012 void cpu_pages::resize(size_t new_size, allocate_system_memory_fn alloc_memory) {
1013 new_size = align_down(new_size, huge_page_size);
1014 while (nr_pages * page_size < new_size) {
1015 // don't reallocate all at once, since there might not
1016 // be enough free memory available to relocate the pages array
1017 auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
1018 do_resize(tmp_size, alloc_memory);
1019 }
1020 }
1021
1022 reclaiming_result cpu_pages::run_reclaimers(reclaimer_scope scope, size_t n_pages) {
1023 auto target = std::max<size_t>(nr_free_pages + n_pages, min_free_pages);
1024 reclaiming_result result = reclaiming_result::reclaimed_nothing;
1025 while (nr_free_pages < target) {
1026 bool made_progress = false;
1027 ++g_reclaims;
1028 for (auto&& r : reclaimers) {
1029 if (r->scope() >= scope) {
1030 made_progress |= r->do_reclaim((target - nr_free_pages) * page_size) == reclaiming_result::reclaimed_something;
1031 }
1032 }
1033 if (!made_progress) {
1034 return result;
1035 }
1036 result = reclaiming_result::reclaimed_something;
1037 }
1038 return result;
1039 }
1040
1041 void cpu_pages::schedule_reclaim() {
1042 current_min_free_pages = 0;
1043 reclaim_hook([this] {
1044 if (nr_free_pages < min_free_pages) {
1045 try {
1046 run_reclaimers(reclaimer_scope::async, min_free_pages - nr_free_pages);
1047 } catch (...) {
1048 current_min_free_pages = min_free_pages;
1049 throw;
1050 }
1051 }
1052 current_min_free_pages = min_free_pages;
1053 });
1054 }
1055
1056 memory::memory_layout cpu_pages::memory_layout() {
1057 assert(is_initialized());
1058 return {
1059 reinterpret_cast<uintptr_t>(memory),
1060 reinterpret_cast<uintptr_t>(memory) + nr_pages * page_size
1061 };
1062 }
1063
1064 void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1065 reclaim_hook = hook;
1066 current_min_free_pages = min_free_pages;
1067 }
1068
1069 void cpu_pages::set_min_free_pages(size_t pages) {
1070 if (pages > std::numeric_limits<decltype(min_free_pages)>::max()) {
1071 throw std::runtime_error("Number of pages too large");
1072 }
1073 min_free_pages = pages;
1074 maybe_reclaim();
1075 }
1076
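// Choose the span sizes for this object size: "fallback" is the smallest span
// that fits one object; "preferred" additionally keeps per-span waste under
// ~5% (up to a 32-page cap) and holds at least 4 objects. For example,
// assuming 4 KiB pages, 1792-byte objects get fallback = 1 page and
// preferred = 4 pages (16384 % 1792 == 256, ~1.6% waste, 9 objects per span).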
1077 small_pool::small_pool(unsigned object_size) noexcept
1078 : _object_size(object_size) {
1079 unsigned span_size = 1;
1080 auto span_bytes = [&] { return span_size * page_size; };
1081 auto waste = [&] { return (span_bytes() % _object_size) / (1.0 * span_bytes()); };
1082 while (object_size > span_bytes()) {
1083 ++span_size;
1084 }
1085 _span_sizes.fallback = span_size;
1086 span_size = 1;
1087 while (_object_size > span_bytes()
1088 || (span_size < 32 && waste() > 0.05)
1089 || (span_bytes() / object_size < 4)) {
1090 ++span_size;
1091 }
1092 _span_sizes.preferred = span_size;
1093 _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
1094 _min_free = _max_free / 2;
1095 }
1096
1097 small_pool::~small_pool() {
1098 _min_free = _max_free = 0;
1099 trim_free_list();
1100 }
1101
1102 // Should not throw in case of running out of memory to avoid infinite recursion,
// because throwing std::bad_alloc requires allocation. __cxa_allocate_exception
1104 // falls back to the emergency pool in case malloc() returns nullptr.
1105 void*
1106 small_pool::allocate() {
1107 if (!_free) {
1108 add_more_objects();
1109 }
1110 if (!_free) {
1111 return nullptr;
1112 }
1113 auto* obj = _free;
1114 _free = _free->next;
1115 --_free_count;
1116 return obj;
1117 }
1118
1119 void
1120 small_pool::deallocate(void* object) {
1121 auto o = reinterpret_cast<free_object*>(object);
1122 o->next = _free;
1123 _free = o;
1124 ++_free_count;
1125 if (_free_count >= _max_free) {
1126 trim_free_list();
1127 }
1128 }
1129
1130 void
1131 small_pool::add_more_objects() {
1132 auto goal = (_min_free + _max_free) / 2;
1133 while (!_span_list.empty() && _free_count < goal) {
1134 page& span = _span_list.front(cpu_mem.pages);
1135 _span_list.pop_front(cpu_mem.pages);
1136 while (span.freelist) {
1137 auto obj = span.freelist;
1138 span.freelist = span.freelist->next;
1139 obj->next = _free;
1140 _free = obj;
1141 ++_free_count;
1142 ++span.nr_small_alloc;
1143 }
1144 }
1145 while (_free_count < goal) {
1146 disable_backtrace_temporarily dbt;
1147 auto span_size = _span_sizes.preferred;
1148 auto data = reinterpret_cast<char*>(cpu_mem.allocate_large(span_size));
1149 if (!data) {
1150 span_size = _span_sizes.fallback;
1151 data = reinterpret_cast<char*>(cpu_mem.allocate_large(span_size));
1152 if (!data) {
1153 return;
1154 }
1155 }
1156 auto span = cpu_mem.to_page(data);
1157 span_size = span->span_size;
1158 _pages_in_use += span_size;
1159 for (unsigned i = 0; i < span_size; ++i) {
1160 span[i].offset_in_span = i;
1161 span[i].pool = this;
1162 }
1163 span->nr_small_alloc = 0;
1164 span->freelist = nullptr;
1165 for (unsigned offset = 0; offset <= span_size * page_size - _object_size; offset += _object_size) {
1166 auto h = reinterpret_cast<free_object*>(data + offset);
1167 h->next = _free;
1168 _free = h;
1169 ++_free_count;
1170 ++span->nr_small_alloc;
1171 }
1172 }
1173 }
1174
1175 void
1176 small_pool::trim_free_list() {
1177 auto goal = (_min_free + _max_free) / 2;
1178 while (_free && _free_count > goal) {
1179 auto obj = _free;
1180 _free = _free->next;
1181 --_free_count;
1182 page* span = cpu_mem.to_page(obj);
1183 span -= span->offset_in_span;
1184 if (!span->freelist) {
1185 new (&span->link) page_list_link();
1186 _span_list.push_front(cpu_mem.pages, *span);
1187 }
1188 obj->next = span->freelist;
1189 span->freelist = obj;
1190 if (--span->nr_small_alloc == 0) {
1191 _pages_in_use -= span->span_size;
1192 _span_list.erase(cpu_mem.pages, *span);
1193 cpu_mem.free_span(span - cpu_mem.pages, span->span_size);
1194 }
1195 }
1196 }
1197
1198 void
1199 abort_on_underflow(size_t size) {
1200 if (std::make_signed_t<size_t>(size) < 0) {
1201 // probably a logic error, stop hard
1202 abort();
1203 }
1204 }
1205
1206 void* allocate_large(size_t size) {
1207 abort_on_underflow(size);
1208 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1209 if ((size_t(size_in_pages) << page_bits) < size) {
1210 return nullptr; // (size + page_size - 1) caused an overflow
1211 }
1212 return cpu_mem.allocate_large(size_in_pages);
1213
1214 }
1215
1216 void* allocate_large_aligned(size_t align, size_t size) {
1217 abort_on_underflow(size);
1218 unsigned size_in_pages = (size + page_size - 1) >> page_bits;
1219 unsigned align_in_pages = std::max(align, page_size) >> page_bits;
1220 return cpu_mem.allocate_large_aligned(align_in_pages, size_in_pages);
1221 }
1222
1223 void free_large(void* ptr) {
1224 return cpu_mem.free_large(ptr);
1225 }
1226
1227 size_t object_size(void* ptr) {
1228 return cpu_pages::all_cpus[object_cpu_id(ptr)]->object_size(ptr);
1229 }
1230
1231 // Mark as cold so that GCC8+ can move to .text.unlikely.
1232 [[gnu::cold]]
1233 static void init_cpu_mem_ptr(cpu_pages*& cpu_mem_ptr) {
1234 cpu_mem_ptr = &cpu_mem;
}
1236
1237 [[gnu::always_inline]]
1238 static inline cpu_pages& get_cpu_mem()
1239 {
1240 // cpu_pages has a non-trivial constructor which means that the compiler
1241 // must make sure the instance local to the current thread has been
1242 // constructed before each access.
1243 // Unfortunately, this means that GCC will emit an unconditional call
// to __tls_init(), which may incur a noticeable overhead in applications
1245 // that are heavy on memory allocations.
1246 // This can be solved by adding an easily predictable branch checking
1247 // whether the object has already been constructed.
1248 static thread_local cpu_pages* cpu_mem_ptr;
1249 if (__builtin_expect(!bool(cpu_mem_ptr), false)) {
1250 init_cpu_mem_ptr(cpu_mem_ptr);
1251 }
1252 return *cpu_mem_ptr;
1253 }
1254
1255 void* allocate(size_t size) {
1256 if (size <= sizeof(free_object)) {
1257 size = sizeof(free_object);
1258 }
1259 void* ptr;
1260 if (size <= max_small_allocation) {
1261 size = object_size_with_alloc_site(size);
1262 ptr = get_cpu_mem().allocate_small(size);
1263 } else {
1264 ptr = allocate_large(size);
1265 }
1266 if (!ptr) {
1267 on_allocation_failure(size);
1268 }
1269 ++g_allocs;
1270 return ptr;
1271 }
1272
1273 void* allocate_aligned(size_t align, size_t size) {
1274 if (size <= sizeof(free_object)) {
1275 size = std::max(sizeof(free_object), align);
1276 }
1277 void* ptr;
1278 if (size <= max_small_allocation && align <= page_size) {
1279 // Our small allocator only guarantees alignment for power-of-two
1280 // allocations which are not larger than a page.
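        // For example (without heap profiling), a 48-byte allocation with
        // 32-byte alignment is rounded up to the 64-byte pool; objects in that
        // pool sit at multiples of 64 bytes within a page-aligned span, so the
        // requested alignment holds.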
1281 size = 1 << log2ceil(object_size_with_alloc_site(size));
1282 ptr = get_cpu_mem().allocate_small(size);
1283 } else {
1284 ptr = allocate_large_aligned(align, size);
1285 }
1286 if (!ptr) {
1287 on_allocation_failure(size);
1288 }
1289 ++g_allocs;
1290 return ptr;
1291 }
1292
1293 void free(void* obj) {
1294 if (get_cpu_mem().try_cross_cpu_free(obj)) {
1295 return;
1296 }
1297 ++g_frees;
1298 get_cpu_mem().free(obj);
1299 }
1300
1301 void free(void* obj, size_t size) {
1302 if (get_cpu_mem().try_cross_cpu_free(obj)) {
1303 return;
1304 }
1305 ++g_frees;
1306 get_cpu_mem().free(obj, size);
1307 }
1308
1309 void free_aligned(void* obj, size_t align, size_t size) {
1310 if (size <= sizeof(free_object)) {
1311 size = sizeof(free_object);
1312 }
1313 free(obj, size);
1314 }
1315
1316 void shrink(void* obj, size_t new_size) {
1317 ++g_frees;
1318 ++g_allocs; // keep them balanced
1319 cpu_mem.shrink(obj, new_size);
1320 }
1321
1322 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1323 cpu_mem.set_reclaim_hook(hook);
1324 }
1325
1326 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope scope)
1327 : reclaimer([reclaim = std::move(reclaim)] (request) {
1328 return reclaim();
1329 }, scope) {
1330 }
1331
1332 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope scope)
1333 : _reclaim(std::move(reclaim))
1334 , _scope(scope) {
1335 cpu_mem.reclaimers.push_back(this);
1336 }
1337
1338 reclaimer::~reclaimer() {
1339 auto& r = cpu_mem.reclaimers;
1340 r.erase(std::find(r.begin(), r.end(), this));
1341 }
1342
1343 void set_large_allocation_warning_threshold(size_t threshold) {
1344 cpu_mem.large_allocation_warning_threshold = threshold;
1345 }
1346
1347 size_t get_large_allocation_warning_threshold() {
1348 return cpu_mem.large_allocation_warning_threshold;
1349 }
1350
1351 void disable_large_allocation_warning() {
1352 cpu_mem.large_allocation_warning_threshold = std::numeric_limits<size_t>::max();
1353 }
1354
1355 void configure(std::vector<resource::memory> m, bool mbind,
1356 optional<std::string> hugetlbfs_path) {
1357 size_t total = 0;
1358 for (auto&& x : m) {
1359 total += x.bytes;
1360 }
1361 allocate_system_memory_fn sys_alloc = allocate_anonymous_memory;
1362 if (hugetlbfs_path) {
1363 // std::function is copyable, but file_desc is not, so we must use
1364 // a shared_ptr to allow sys_alloc to be copied around
1365 auto fdp = make_lw_shared<file_desc>(file_desc::temporary(*hugetlbfs_path));
1366 sys_alloc = [fdp] (optional<void*> where, size_t how_much) {
1367 return allocate_hugetlbfs_memory(*fdp, where, how_much);
1368 };
1369 cpu_mem.replace_memory_backing(sys_alloc);
1370 }
1371 cpu_mem.resize(total, sys_alloc);
1372 size_t pos = 0;
1373 for (auto&& x : m) {
1374 #ifdef SEASTAR_HAVE_NUMA
1375 unsigned long nodemask = 1UL << x.nodeid;
1376 if (mbind) {
1377 auto r = ::mbind(cpu_mem.mem() + pos, x.bytes,
1378 MPOL_PREFERRED,
1379 &nodemask, std::numeric_limits<unsigned long>::digits,
1380 MPOL_MF_MOVE);
1381
1382 if (r == -1) {
1383 char err[1000] = {};
1384 strerror_r(errno, err, sizeof(err));
1385 std::cerr << "WARNING: unable to mbind shard memory; performance may suffer: "
1386 << err << std::endl;
1387 }
1388 }
1389 #endif
1390 pos += x.bytes;
1391 }
1392 }
1393
1394 statistics stats() {
1395 return statistics{g_allocs, g_frees, g_cross_cpu_frees,
1396 cpu_mem.nr_pages * page_size, cpu_mem.nr_free_pages * page_size, g_reclaims, g_large_allocs};
1397 }
1398
1399 bool drain_cross_cpu_freelist() {
1400 return cpu_mem.drain_cross_cpu_freelist();
1401 }
1402
1403 memory_layout get_memory_layout() {
1404 return cpu_mem.memory_layout();
1405 }
1406
1407 size_t min_free_memory() {
1408 return cpu_mem.min_free_pages * page_size;
1409 }
1410
1411 void set_min_free_pages(size_t pages) {
1412 cpu_mem.set_min_free_pages(pages);
1413 }
1414
1415 static thread_local int report_on_alloc_failure_suppressed = 0;
1416
1417 class disable_report_on_alloc_failure_temporarily {
1418 public:
1419 disable_report_on_alloc_failure_temporarily() {
1420 ++report_on_alloc_failure_suppressed;
    }
1422 ~disable_report_on_alloc_failure_temporarily() noexcept {
1423 --report_on_alloc_failure_suppressed;
1424 }
1425 };
1426
1427 static std::atomic<bool> abort_on_allocation_failure{false};
1428
1429 void enable_abort_on_allocation_failure() {
1430 abort_on_allocation_failure.store(true, std::memory_order_seq_cst);
1431 }
1432
1433 void on_allocation_failure(size_t size) {
1434 if (!report_on_alloc_failure_suppressed &&
1435 // report even suppressed failures if trace level is enabled
1436 (seastar_memory_logger.is_enabled(seastar::log_level::trace) ||
1437 (seastar_memory_logger.is_enabled(seastar::log_level::debug) && !abort_on_alloc_failure_suppressed))) {
1438 disable_report_on_alloc_failure_temporarily guard;
1439 seastar_memory_logger.debug("Failed to allocate {} bytes at {}", size, current_backtrace());
1440 auto free_mem = cpu_mem.nr_free_pages * page_size;
1441 auto total_mem = cpu_mem.nr_pages * page_size;
1442 seastar_memory_logger.debug("Used memory: {} Free memory: {} Total memory: {}", total_mem - free_mem, free_mem, total_mem);
1443 seastar_memory_logger.debug("Small pools:");
1444 seastar_memory_logger.debug("objsz spansz usedobj memory wst%");
1445 for (unsigned i = 0; i < cpu_mem.small_pools.nr_small_pools; i++) {
1446 auto& sp = cpu_mem.small_pools[i];
1447 auto use_count = sp._pages_in_use * page_size / sp.object_size() - sp._free_count;
1448 auto memory = sp._pages_in_use * page_size;
1449 auto wasted_percent = memory ? sp._free_count * sp.object_size() * 100.0 / memory : 0;
1450 seastar_memory_logger.debug("{} {} {} {} {}", sp.object_size(), sp._span_sizes.preferred * page_size, use_count, memory, wasted_percent);
1451 }
1452 seastar_memory_logger.debug("Page spans:");
1453 seastar_memory_logger.debug("index size [B] free [B]");
1454 for (unsigned i = 0; i< cpu_mem.nr_span_lists; i++) {
1455 auto& span_list = cpu_mem.free_spans[i];
1456 auto front = span_list._front;
1457 uint32_t total = 0;
1458 while(front) {
1459 auto& span = cpu_mem.pages[front];
1460 total += span.span_size;
1461 front = span.link._next;
1462 }
1463 seastar_memory_logger.debug("{} {} {}", i, (1<<i) * page_size, total * page_size);
1464 }
1465 }
1466
1467 if (!abort_on_alloc_failure_suppressed
1468 && abort_on_allocation_failure.load(std::memory_order_relaxed)) {
1469 seastar_logger.error("Failed to allocate {} bytes", size);
1470 abort();
1471 }
1472 }
1473
1474 static void trigger_error_injector() {
1475 on_alloc_point();
1476 }
1477
1478 static bool try_trigger_error_injector() {
1479 try {
1480 on_alloc_point();
1481 return false;
1482 } catch (...) {
1483 return true;
1484 }
1485 }
1486
1487 }
1488
1489 }
1490
1491 using namespace seastar::memory;
1492
1493 extern "C"
1494 [[gnu::visibility("default")]]
1495 [[gnu::used]]
1496 void* malloc(size_t n) throw () {
1497 if (try_trigger_error_injector()) {
1498 return nullptr;
1499 }
1500 return allocate(n);
1501 }
1502
1503 extern "C"
1504 [[gnu::alias("malloc")]]
1505 [[gnu::visibility("default")]]
1506 [[gnu::malloc]]
1507 [[gnu::alloc_size(1)]]
1508 #ifndef __clang__
1509 [[gnu::leaf]]
1510 #endif
1511 void* __libc_malloc(size_t n) throw ();
1512
1513 extern "C"
1514 [[gnu::visibility("default")]]
1515 [[gnu::used]]
1516 void free(void* ptr) {
1517 if (ptr) {
1518 seastar::memory::free(ptr);
1519 }
1520 }
1521
1522 extern "C"
1523 [[gnu::alias("free")]]
1524 [[gnu::visibility("default")]]
1525 #ifndef __clang__
1526 [[gnu::leaf]]
1527 #endif
1528 void __libc_free(void* obj) throw ();
1529
1530 extern "C"
1531 [[gnu::visibility("default")]]
1532 void* calloc(size_t nmemb, size_t size) {
1533 if (try_trigger_error_injector()) {
1534 return nullptr;
1535 }
1536 auto s1 = __int128(nmemb) * __int128(size);
1537 assert(s1 == size_t(s1));
1538 size_t s = s1;
1539 auto p = malloc(s);
1540 if (p) {
1541 std::memset(p, 0, s);
1542 }
1543 return p;
1544 }
1545
1546 extern "C"
1547 [[gnu::alias("calloc")]]
1548 [[gnu::visibility("default")]]
1549 [[gnu::alloc_size(1, 2)]]
1550 [[gnu::malloc]]
1551 #ifndef __clang__
1552 [[gnu::leaf]]
1553 #endif
1554 void* __libc_calloc(size_t n, size_t m) throw ();
1555
1556 extern "C"
1557 [[gnu::visibility("default")]]
1558 void* realloc(void* ptr, size_t size) {
1559 if (try_trigger_error_injector()) {
1560 return nullptr;
1561 }
1562 auto old_size = ptr ? object_size(ptr) : 0;
1563 if (size == old_size) {
1564 return ptr;
1565 }
1566 if (size == 0) {
1567 ::free(ptr);
1568 return nullptr;
1569 }
1570 if (size < old_size) {
1571 seastar::memory::shrink(ptr, size);
1572 return ptr;
1573 }
1574 auto nptr = malloc(size);
1575 if (!nptr) {
1576 return nptr;
1577 }
1578 if (ptr) {
1579 std::memcpy(nptr, ptr, std::min(size, old_size));
1580 ::free(ptr);
1581 }
1582 return nptr;
1583 }
1584
1585 extern "C"
1586 [[gnu::alias("realloc")]]
1587 [[gnu::visibility("default")]]
1588 [[gnu::alloc_size(2)]]
1589 #ifndef __clang__
1590 [[gnu::leaf]]
1591 #endif
1592 void* __libc_realloc(void* obj, size_t size) throw ();
1593
1594 extern "C"
1595 [[gnu::visibility("default")]]
1596 [[gnu::used]]
1597 #ifndef __clang__
1598 [[gnu::leaf]]
1599 #endif
1600 [[gnu::nonnull(1)]]
1601 int posix_memalign(void** ptr, size_t align, size_t size) throw () {
1602 if (try_trigger_error_injector()) {
1603 return ENOMEM;
1604 }
1605 *ptr = allocate_aligned(align, size);
1606 if (!*ptr) {
1607 return ENOMEM;
1608 }
1609 return 0;
1610 }
1611
1612 extern "C"
1613 [[gnu::alias("posix_memalign")]]
1614 [[gnu::visibility("default")]]
1615 #ifndef __clang__
1616 [[gnu::leaf]]
1617 #endif
1618 [[gnu::nonnull(1)]]
1619 int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();
1620
1621 extern "C"
1622 [[gnu::visibility("default")]]
1623 [[gnu::malloc]]
1624 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
1625 [[gnu::alloc_size(2)]]
1626 #endif
1627 void* memalign(size_t align, size_t size) throw () {
1628 if (try_trigger_error_injector()) {
1629 return nullptr;
1630 }
1631 size = seastar::align_up(size, align);
1632 return allocate_aligned(align, size);
1633 }
1634
1635 extern "C"
1636 [[gnu::visibility("default")]]
1637 void *aligned_alloc(size_t align, size_t size) throw () {
1638 if (try_trigger_error_injector()) {
1639 return nullptr;
1640 }
1641 return allocate_aligned(align, size);
1642 }
1643
1644 extern "C"
1645 [[gnu::alias("memalign")]]
1646 [[gnu::visibility("default")]]
1647 [[gnu::malloc]]
1648 #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 30)
1649 [[gnu::alloc_size(2)]]
1650 #endif
1651 void* __libc_memalign(size_t align, size_t size) throw ();
1652
1653 extern "C"
1654 [[gnu::visibility("default")]]
1655 void cfree(void* obj) throw () {
1656 return ::free(obj);
1657 }
1658
1659 extern "C"
1660 [[gnu::alias("cfree")]]
1661 [[gnu::visibility("default")]]
1662 void __libc_cfree(void* obj) throw ();
1663
1664 extern "C"
1665 [[gnu::visibility("default")]]
1666 size_t malloc_usable_size(void* obj) {
1667 return object_size(obj);
1668 }
1669
1670 extern "C"
1671 [[gnu::visibility("default")]]
1672 int malloc_trim(size_t pad) {
1673 return 0;
1674 }
1675
1676 static inline
1677 void* throw_if_null(void* ptr) {
1678 if (!ptr) {
1679 throw std::bad_alloc();
1680 }
1681 return ptr;
1682 }
1683
1684 [[gnu::visibility("default")]]
1685 void* operator new(size_t size) {
1686 trigger_error_injector();
1687 if (size == 0) {
1688 size = 1;
1689 }
1690 return throw_if_null(allocate(size));
1691 }
1692
1693 [[gnu::visibility("default")]]
1694 void* operator new[](size_t size) {
1695 trigger_error_injector();
1696 if (size == 0) {
1697 size = 1;
1698 }
1699 return throw_if_null(allocate(size));
1700 }
1701
1702 [[gnu::visibility("default")]]
1703 void operator delete(void* ptr) throw () {
1704 if (ptr) {
1705 seastar::memory::free(ptr);
1706 }
1707 }
1708
1709 [[gnu::visibility("default")]]
1710 void operator delete[](void* ptr) throw () {
1711 if (ptr) {
1712 seastar::memory::free(ptr);
1713 }
1714 }
1715
1716 [[gnu::visibility("default")]]
1717 void operator delete(void* ptr, size_t size) throw () {
1718 if (ptr) {
1719 seastar::memory::free(ptr, size);
1720 }
1721 }
1722
1723 [[gnu::visibility("default")]]
1724 void operator delete[](void* ptr, size_t size) throw () {
1725 if (ptr) {
1726 seastar::memory::free(ptr, size);
1727 }
1728 }
1729
1730 [[gnu::visibility("default")]]
1731 void* operator new(size_t size, std::nothrow_t) throw () {
1732 if (try_trigger_error_injector()) {
1733 return nullptr;
1734 }
1735 if (size == 0) {
1736 size = 1;
1737 }
1738 return allocate(size);
1739 }
1740
1741 [[gnu::visibility("default")]]
1742 void* operator new[](size_t size, std::nothrow_t) throw () {
1743 if (size == 0) {
1744 size = 1;
1745 }
1746 return allocate(size);
1747 }
1748
1749 [[gnu::visibility("default")]]
1750 void operator delete(void* ptr, std::nothrow_t) throw () {
1751 if (ptr) {
1752 seastar::memory::free(ptr);
1753 }
1754 }
1755
1756 [[gnu::visibility("default")]]
1757 void operator delete[](void* ptr, std::nothrow_t) throw () {
1758 if (ptr) {
1759 seastar::memory::free(ptr);
1760 }
1761 }
1762
1763 [[gnu::visibility("default")]]
1764 void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
1765 if (ptr) {
1766 seastar::memory::free(ptr, size);
1767 }
1768 }
1769
1770 [[gnu::visibility("default")]]
1771 void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
1772 if (ptr) {
1773 seastar::memory::free(ptr, size);
1774 }
1775 }
1776
1777 #ifdef __cpp_aligned_new
1778
1779 [[gnu::visibility("default")]]
1780 void* operator new(size_t size, std::align_val_t a) {
1781 trigger_error_injector();
1782 auto ptr = allocate_aligned(size_t(a), size);
1783 return throw_if_null(ptr);
1784 }
1785
1786 [[gnu::visibility("default")]]
1787 void* operator new[](size_t size, std::align_val_t a) {
1788 trigger_error_injector();
1789 auto ptr = allocate_aligned(size_t(a), size);
1790 return throw_if_null(ptr);
1791 }
1792
1793 [[gnu::visibility("default")]]
1794 void* operator new(size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
1795 if (try_trigger_error_injector()) {
1796 return nullptr;
1797 }
1798 return allocate_aligned(size_t(a), size);
1799 }
1800
1801 [[gnu::visibility("default")]]
1802 void* operator new[](size_t size, std::align_val_t a, const std::nothrow_t&) noexcept {
1803 if (try_trigger_error_injector()) {
1804 return nullptr;
1805 }
1806 return allocate_aligned(size_t(a), size);
1807 }
1808
1809
1810 [[gnu::visibility("default")]]
1811 void operator delete(void* ptr, std::align_val_t a) noexcept {
1812 if (ptr) {
1813 seastar::memory::free(ptr);
1814 }
1815 }
1816
1817 [[gnu::visibility("default")]]
1818 void operator delete[](void* ptr, std::align_val_t a) noexcept {
1819 if (ptr) {
1820 seastar::memory::free(ptr);
1821 }
1822 }
1823
1824 [[gnu::visibility("default")]]
1825 void operator delete(void* ptr, size_t size, std::align_val_t a) noexcept {
1826 if (ptr) {
1827 seastar::memory::free_aligned(ptr, size_t(a), size);
1828 }
1829 }
1830
1831 [[gnu::visibility("default")]]
1832 void operator delete[](void* ptr, size_t size, std::align_val_t a) noexcept {
1833 if (ptr) {
1834 seastar::memory::free_aligned(ptr, size_t(a), size);
1835 }
1836 }
1837
1838 [[gnu::visibility("default")]]
1839 void operator delete(void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
1840 if (ptr) {
1841 seastar::memory::free(ptr);
1842 }
1843 }
1844
1845 [[gnu::visibility("default")]]
1846 void operator delete[](void* ptr, std::align_val_t a, const std::nothrow_t&) noexcept {
1847 if (ptr) {
1848 seastar::memory::free(ptr);
1849 }
1850 }
1851
1852 #endif
1853
1854 namespace seastar {
1855
1856 #else
1857
1858 namespace seastar {
1859
1860 namespace memory {
1861
1862 void set_heap_profiling_enabled(bool enabled) {
1863 seastar_logger.warn("Seastar compiled with default allocator, heap profiler not supported");
1864 }
1865
1866 scoped_heap_profiling::scoped_heap_profiling() noexcept {
1867 set_heap_profiling_enabled(true); // let it print the warning
1868 }
1869
1870 scoped_heap_profiling::~scoped_heap_profiling() {
1871 }
1872
1873 void enable_abort_on_allocation_failure() {
1874 seastar_logger.warn("Seastar compiled with default allocator, will not abort on bad_alloc");
1875 }
1876
1877 reclaimer::reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope) {
1878 }
1879
1880 reclaimer::reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope) {
1881 }
1882
1883 reclaimer::~reclaimer() {
1884 }
1885
1886 void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
1887 }
1888
1889 void configure(std::vector<resource::memory> m, bool mbind, compat::optional<std::string> hugepages_path) {
1890 }
1891
1892 statistics stats() {
1893 return statistics{0, 0, 0, 1 << 30, 1 << 30, 0, 0};
1894 }
1895
1896 bool drain_cross_cpu_freelist() {
1897 return false;
1898 }
1899
1900 memory_layout get_memory_layout() {
1901 throw std::runtime_error("get_memory_layout() not supported");
1902 }
1903
1904 size_t min_free_memory() {
1905 return 0;
1906 }
1907
1908 void set_min_free_pages(size_t pages) {
1909 // Ignore, reclaiming not supported for default allocator.
1910 }
1911
1912 void set_large_allocation_warning_threshold(size_t) {
1913 // Ignore, not supported for default allocator.
1914 }
1915
1916 size_t get_large_allocation_warning_threshold() {
1917 // Ignore, not supported for default allocator.
1918 return std::numeric_limits<size_t>::max();
1919 }
1920
1921 void disable_large_allocation_warning() {
1922 // Ignore, not supported for default allocator.
1923 }
1924
1925 }
1926
1927 }
1928
1929 namespace seastar {
1930
1931 #endif
1932
1933 /// \endcond
1934
1935 }