// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
#define __FAST_BITMAP_ALLOCATOR_IMPL_H
#include "include/intarith.h"

#include <bit>
#include <vector>
#include <algorithm>
#include <mutex>

typedef uint64_t slot_t;

#ifdef NON_CEPH_BUILD
#include <assert.h>
struct interval_t
{
  uint64_t offset = 0;
  uint64_t length = 0;

  interval_t() {}
  interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
  interval_t(const interval_t &ext) :
    offset(ext.offset), length(ext.length) {}
};
typedef std::vector<interval_t> interval_vector_t;
typedef std::vector<slot_t> slot_vector_t;
#else
#include "include/ceph_assert.h"
#include "common/likely.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
#include "common/ceph_mutex.h"

typedef bluestore_interval_t<uint64_t, uint64_t> interval_t;
typedef PExtentVector interval_vector_t;

typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;

#endif

// fitting into cache line on x86_64
static const size_t slots_per_slotset = 8; // 8 slots per set
static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset;
static const size_t bits_per_slot = sizeof(slot_t) * 8;
static const size_t bits_per_slotset = slotset_bytes * 8;
static const slot_t all_slot_set = 0xffffffffffffffff;
static const slot_t all_slot_clear = 0;

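// Returns the position of the first set bit in slot_val at or after
// start_pos, or bits_per_slot when no such bit exists. Illustrative example:
//   find_next_set_bit(0b1000, 1) == 3, find_next_set_bit(0, 0) == 64.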
inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
{
#ifdef __GNUC__
  if (start_pos == 0) {
    start_pos = __builtin_ffsll(slot_val);
    return start_pos ? start_pos - 1 : bits_per_slot;
  }
#endif
  slot_t mask = slot_t(1) << start_pos;
  while (start_pos < bits_per_slot && !(slot_val & mask)) {
    mask <<= 1;
    ++start_pos;
  }
  return start_pos;
}

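// Hierarchical bitmap: an L0 bitmap tracks individual allocation units
// (set bit == free unit), L1 summarizes each 512-bit L0 slotset with a
// 2-bit state, and AllocatorLevel02 keeps a per-L1-slotset L2 summary bit
// on top to speed up scanning for free space.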
class AllocatorLevel
{
protected:

  virtual uint64_t _children_per_slot() const = 0;
  virtual uint64_t _level_granularity() const = 0;

public:
  static uint64_t l0_dives;
  static uint64_t l0_iterations;
  static uint64_t l0_inner_iterations;
  static uint64_t alloc_fragments;
  static uint64_t alloc_fragments_fast;
  static uint64_t l2_allocs;

  virtual ~AllocatorLevel()
  {}

  virtual void collect_stats(
    std::map<size_t, size_t>& bins_overall) = 0;

};

class AllocatorLevel01 : public AllocatorLevel
{
protected:
  slot_vector_t l0; // set bit means free entry
  slot_vector_t l1;
  uint64_t l0_granularity = 0; // space per entry
  uint64_t l1_granularity = 0; // space per entry

  size_t partial_l1_count = 0;
  size_t unalloc_l1_count = 0;

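  // Fragmentation is reported as the share of L1 entries that are only
  // partially free among all entries that still have some free space,
  // i.e. partial / (partial + fully_free); 0.0 when nothing is free.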
  double get_fragmentation() const {
    double res = 0.0;
    auto total = unalloc_l1_count + partial_l1_count;
    if (total) {
      res = double(partial_l1_count) / double(total);
    }
    return res;
  }

  uint64_t _level_granularity() const override
  {
    return l1_granularity;
  }

  inline bool _is_slot_fully_allocated(uint64_t idx) const {
    return l1[idx] == all_slot_clear;
  }
public:
  inline uint64_t get_min_alloc_size() const
  {
    return l0_granularity;
  }

};

template <class T>
class AllocatorLevel02;

class AllocatorLevel01Loose : public AllocatorLevel01
{
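  // Each L1 entry is 2 bits wide and summarizes one 512-bit L0 slotset:
  // 00 = fully allocated, 01 = partially free, 11 = fully free
  // (0x02 is reserved/not used).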
  enum {
    L1_ENTRY_WIDTH = 2,
    L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
    L1_ENTRY_FULL = 0x00,
    L1_ENTRY_PARTIAL = 0x01,
    L1_ENTRY_NOT_USED = 0x02,
    L1_ENTRY_FREE = 0x03,
    L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, // 32
    L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };
  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }

  interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
    uint64_t min_length, interval_t* tail) const;

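  // Appends [offset, offset + len) to res, splitting it into pieces of at
  // most max_length bytes (0 = unlimited) and merging with the last interval
  // in res when the two are contiguous.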
  inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
    uint64_t len,
    interval_vector_t* res)
  {
    auto it = res->rbegin();
    if (max_length) {
      if (it != res->rend() && it->offset + it->length == offset) {
        auto l = max_length - it->length;
        if (l >= len) {
          it->length += len;
          return;
        } else {
          offset += l;
          len -= l;
          it->length += l;
        }
      }

      while (len > max_length) {
        res->emplace_back(offset, max_length);
        offset += max_length;
        len -= max_length;
      }
      res->emplace_back(offset, len);
      return;
    }

    if (it != res->rend() && it->offset + it->length == offset) {
      it->length += len;
    } else {
      res->emplace_back(offset, len);
    }
  }

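  // Scans L0 slots in [l0_pos0, l0_pos1) and allocates up to
  // (length - *allocated) bytes, clearing the corresponding free bits and
  // appending the resulting extents to res via _fragment_and_emplace().
  // Returns true when the scanned range ends up fully allocated.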
  bool _allocate_l0(uint64_t length,
    uint64_t max_length,
    uint64_t l0_pos0, uint64_t l0_pos1,
    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t d0 = L0_ENTRIES_PER_SLOT;

    ++l0_dives;

    ceph_assert(l0_pos0 < l0_pos1);
    ceph_assert(length > *allocated);
    ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
    ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
    ceph_assert(((length - *allocated) % l0_granularity) == 0);

    uint64_t need_entries = (length - *allocated) / l0_granularity;

    for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
      ++idx) {
      ++l0_iterations;
      slot_t& slot_val = l0[idx];
      auto base = idx * d0;
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(need_entries, d0);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;

        _fragment_and_emplace(max_length, base * l0_granularity,
          to_alloc * l0_granularity, res);

        if (to_alloc == d0) {
          slot_val = all_slot_clear;
        } else {
          _mark_alloc_l0(base, base + to_alloc);
        }
        continue;
      }

      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      auto next_pos = free_pos + 1;
      while (next_pos < bits_per_slot &&
        (next_pos - free_pos) < need_entries) {
        ++l0_inner_iterations;

        if (0 == (slot_val & (slot_t(1) << next_pos))) {
          auto to_alloc = (next_pos - free_pos);
          *allocated += to_alloc * l0_granularity;
          ++alloc_fragments;
          need_entries -= to_alloc;
          _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
            to_alloc * l0_granularity, res);
          _mark_alloc_l0(base + free_pos, base + next_pos);
          free_pos = find_next_set_bit(slot_val, next_pos + 1);
          next_pos = free_pos + 1;
        } else {
          ++next_pos;
        }
      }
      if (need_entries && free_pos < bits_per_slot) {
        auto to_alloc = std::min(need_entries, d0 - free_pos);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;
        _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
          to_alloc * l0_granularity, res);
        _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
      }
    }
    return _is_empty_l0(l0_pos0, l0_pos1);
  }

protected:

  friend class AllocatorLevel02<AllocatorLevel01Loose>;

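  // Sizes both bitmaps for the given capacity and allocation unit.
  // Illustrative example: capacity = 1 TiB with _alloc_unit = 4 KiB gives
  // l0_granularity = 4 KiB, l1_granularity = 4 KiB * 512 = 2 MiB, an L0
  // bitmap of 2^28 bits (32 MiB) and an L1 map of 2^19 2-bit entries
  // (128 KiB). Any tail beyond 'capacity' within the aligned size is
  // immediately marked as allocated so it can never be handed out.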
  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    l0_granularity = _alloc_unit;
    // 512 bits at L0 mapped to L1 entry
    l1_granularity = l0_granularity * bits_per_slotset;

    // capacity to have slot alignment at l1
    auto aligned_capacity =
      p2roundup((int64_t)capacity,
        int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
    size_t slot_count =
      aligned_capacity / l1_granularity / _children_per_slot();
    // we use set bit(s) as a marker for (partially) free entry
    l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);

    // l0 slot count
    size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
    // we use set bit(s) as a marker for (partially) free entry
    l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);

    partial_l1_count = unalloc_l1_count = 0;
    if (mark_as_free) {
      unalloc_l1_count = slot_count * _children_per_slot();
      auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity;
      _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity);
    }
  }

  struct search_ctx_t
  {
    size_t partial_count = 0;
    size_t free_count = 0;
    uint64_t free_l1_pos = 0;

    uint64_t min_affordable_len = 0;
    uint64_t min_affordable_offs = 0;
    uint64_t affordable_len = 0;
    uint64_t affordable_offs = 0;

    bool fully_processed = false;

    void reset()
    {
      *this = search_ctx_t();
    }
  };
  enum {
    NO_STOP,
    STOP_ON_EMPTY,
    STOP_ON_PARTIAL,
  };
  void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
    uint64_t length, uint64_t min_length, int mode,
    search_ctx_t* ctx);

  void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
  void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);
  uint64_t _claim_free_to_left_l0(int64_t l0_pos_start);
  uint64_t _claim_free_to_right_l0(int64_t l0_pos_start);


  void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_alloc_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

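  // Sets (frees) L0 bits in [l0_pos_start, l0_pos_end) in three phases:
  // the partial head slot bit by bit, then whole slots at once, then the
  // partial tail slot bit by bit.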
  void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    auto d0 = L0_ENTRIES_PER_SLOT;

    auto pos = l0_pos_start;
    slot_t bits = (slot_t)1 << (l0_pos_start % d0);
    slot_t* val_s = &l0[pos / d0];
    int64_t pos_e = std::min(l0_pos_end,
      p2roundup<int64_t>(l0_pos_start + 1, d0));
    while (pos < pos_e) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
    pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
    while (pos < pos_e) {
      *(++val_s) = all_slot_set;
      pos += d0;
    }
    bits = 1;
    ++val_s;
    while (pos < l0_pos_end) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
  }

  void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_free_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

  bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT;
    ceph_assert(0 == (l0_pos % d));
    ceph_assert(0 == (l0_pos_end % d));

    auto idx = l0_pos / L0_ENTRIES_PER_SLOT;
    auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = l0[idx] == all_slot_clear;
      ++idx;
    }
    return no_free;
  }
  bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * _children_per_slot();
    ceph_assert(0 == (l1_pos % d));
    ceph_assert(0 == (l1_pos_end % d));

    auto idx = l1_pos / L1_ENTRIES_PER_SLOT;
    auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = _is_slot_fully_allocated(idx);
      ++idx;
    }
    return no_free;
  }

  interval_t _allocate_l1_contiguous(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t pos_start, uint64_t pos_end);

  bool _allocate_l1(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t l1_pos_start, uint64_t l1_pos_end,
    uint64_t* allocated,
    interval_vector_t* res);

  uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
  {
    uint64_t l0_pos_start = offset / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity;
    _mark_alloc_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

  uint64_t _free_l1(uint64_t offs, uint64_t len)
  {
    uint64_t l0_pos_start = offs / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity;
    _mark_free_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

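  // claim_free_to_left_l1/claim_free_to_right_l1 grab the contiguous free
  // space immediately before/after the given offset, mark it allocated and
  // return the number of bytes claimed (0 when the neighbor is not free).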
  uint64_t claim_free_to_left_l1(uint64_t offs)
  {
    uint64_t l0_pos_end = offs / l0_granularity;
    uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end);
    if (l0_pos_start < l0_pos_end) {
      _mark_l1_on_l0(
        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
      return l0_granularity * (l0_pos_end - l0_pos_start);
    }
    return 0;
  }

  uint64_t claim_free_to_right_l1(uint64_t offs)
  {
    uint64_t l0_pos_start = offs / l0_granularity;
    uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start);

    if (l0_pos_start < l0_pos_end) {
      _mark_l1_on_l0(
        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
      return l0_granularity * (l0_pos_end - l0_pos_start);
    }
    return 0;
  }


public:
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    if (pos1 == 0) {
      pos1 = l1.size() * L1_ENTRIES_PER_SLOT;
    }
    auto avail = debug_get_free(pos0, pos1);
    return (pos1 - pos0) * l1_granularity - avail;
  }

  uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
  {
    ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
    ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));

    auto idx0 = l1_pos0 * slots_per_slotset;
    auto idx1 = l1_pos1 * slots_per_slotset;

    if (idx1 == 0) {
      idx1 = l0.size();
    }

    uint64_t res = 0;
    for (uint64_t i = idx0; i < idx1; ++i) {
      auto v = l0[i];
      if (v == all_slot_set) {
        res += L0_ENTRIES_PER_SLOT;
      } else if (v != all_slot_clear) {
        size_t cnt = 0;
#ifdef __GNUC__
        cnt = __builtin_popcountll(v);
#else
        // Kernighan's Alg to count set bits
        while (v) {
          v &= (v - 1);
          cnt++;
        }
#endif
        res += cnt;
      }
    }
    return res * l0_granularity;
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override;

  static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
  static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
  void foreach_internal(std::function<void(uint64_t offset, uint64_t length)> notify);
};

class AllocatorLevel01Compact : public AllocatorLevel01
{
  uint64_t _children_per_slot() const override
  {
    return 8;
  }
public:
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override
  {
    // not implemented
  }
};

template <class L1>
class AllocatorLevel02 : public AllocatorLevel
{
public:
  uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }

  uint64_t get_available()
  {
    std::lock_guard l(lock);
    return available;
  }
  inline uint64_t get_min_alloc_size() const
  {
    return l1.get_min_alloc_size();
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override {

    std::lock_guard l(lock);
    l1.collect_stats(bins_overall);
  }
  uint64_t claim_free_to_left(uint64_t offset) {
    std::lock_guard l(lock);
    auto allocated = l1.claim_free_to_left_l1(offset);
    ceph_assert(available >= allocated);
    available -= allocated;

    uint64_t l2_pos = (offset - allocated) / l2_granularity;
    uint64_t l2_pos_end =
      p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
    return allocated;
  }

  uint64_t claim_free_to_right(uint64_t offset) {
    std::lock_guard l(lock);
    auto allocated = l1.claim_free_to_right_l1(offset);
    ceph_assert(available >= allocated);
    available -= allocated;

    uint64_t l2_pos = (offset) / l2_granularity;
    int64_t end = offset + allocated;
    uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
    return allocated;
  }

  void foreach_internal(
    std::function<void(uint64_t offset, uint64_t length)> notify)
  {
    size_t alloc_size = get_min_alloc_size();
    auto multiply_by_alloc_size = [alloc_size, notify](size_t off, size_t len) {
      notify(off * alloc_size, len * alloc_size);
    };
    std::lock_guard l(lock);
    l1.foreach_internal(multiply_by_alloc_size);
  }
  double get_fragmentation_internal() {
    std::lock_guard l(lock);
    return l1.get_fragmentation();
  }

protected:
  ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock");
  L1 l1;
  slot_vector_t l2;
  uint64_t l2_granularity = 0; // space per entry
  uint64_t available = 0;
  uint64_t last_pos = 0;

  enum {
    L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };

  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }
  uint64_t _level_granularity() const override
  {
    return l2_granularity;
  }

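  // Builds the L2 summary on top of L1. One L2 bit covers a whole L1
  // slotset, i.e. l2_granularity = l1_granularity * 32 * 8 for the Loose L1
  // level; with a 4 KiB allocation unit that is 2 MiB * 256 = 512 MiB per
  // L2 bit (illustrative numbers, they scale with _alloc_unit).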
  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    ceph_assert(std::has_single_bit(_alloc_unit));
    l1._init(capacity, _alloc_unit, mark_as_free);

    l2_granularity =
      l1._level_granularity() * l1._children_per_slot() * slots_per_slotset;

    // capacity to have slot alignment at l2
    auto aligned_capacity =
      p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
    size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
    // we use set bit(s) as a marker for (partially) free entry
    l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);

    if (mark_as_free) {
      // capacity to have slotset alignment at l1
      auto l2_pos_no_use =
        p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
      _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity);
      available = p2align(capacity, _alloc_unit);
    } else {
      available = 0;
    }
  }

  void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

  void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

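  // Recomputes the L2 bits covering [l2_pos, l2_pos_end): an L2 bit is
  // cleared when every L1 slot in its slotset is fully allocated and set
  // otherwise.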
  void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    auto idx = l2_pos * slots_per_slotset;
    auto idx_end = l2_pos_end * slots_per_slotset;
    bool all_allocated = true;
    while (idx < idx_end) {
      if (!l1._is_slot_fully_allocated(idx)) {
        all_allocated = false;
        idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
      }
      else {
        ++idx;
      }
      if ((idx % slots_per_slotset) == 0) {
        if (all_allocated) {
          l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
        }
        else {
          l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
        }
        all_allocated = true;
        ++l2_pos;
      }
    }
  }

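  // Top-level allocation entry point: scans L2 starting from the position
  // implied by 'hint' (or from last_pos) and dives into L1/L0 for every L2
  // entry that still has free space; it wraps around to cover the range
  // before the starting position, so the whole L2 map is scanned at most
  // once per call. max_length is capped at 2 GiB per extent.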
  void _allocate_l2(uint64_t length,
    uint64_t min_length,
    uint64_t max_length,
    uint64_t hint,

    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t prev_allocated = *allocated;
    uint64_t d = L1_ENTRIES_PER_SLOT;
    ceph_assert(min_length <= l2_granularity);
    ceph_assert(max_length == 0 || max_length >= min_length);
    ceph_assert(max_length == 0 || (max_length % min_length) == 0);
    ceph_assert(length >= min_length);
    ceph_assert((length % min_length) == 0);

    uint64_t cap = 1ull << 31;
    if (max_length == 0 || max_length >= cap) {
      max_length = cap;
    }

    uint64_t l1_w = slots_per_slotset * l1._children_per_slot();

    std::lock_guard l(lock);

    if (available < min_length) {
      return;
    }
    if (hint != 0) {
      last_pos = (hint / (d * l2_granularity)) < l2.size() ? p2align(hint / l2_granularity, d) : 0;
    }
    auto l2_pos = last_pos;
    auto last_pos0 = last_pos;
    auto pos = last_pos / d;
    auto pos_end = l2.size();
    // outer loop below is intended to optimize the performance by
    // avoiding 'modulo' operations inside the internal loop.
    // Looks like they have negative impact on the performance
    for (auto i = 0; i < 2; ++i) {
      for (; length > *allocated && pos < pos_end; ++pos) {
        slot_t& slot_val = l2[pos];
        size_t free_pos = 0;
        bool all_set = false;
        if (slot_val == all_slot_clear) {
          l2_pos += d;
          last_pos = l2_pos;
          continue;
        } else if (slot_val == all_slot_set) {
          free_pos = 0;
          all_set = true;
        } else {
          free_pos = find_next_set_bit(slot_val, 0);
          ceph_assert(free_pos < bits_per_slot);
        }
        do {
          ceph_assert(length > *allocated);
          bool empty = l1._allocate_l1(length,
            min_length,
            max_length,
            (l2_pos + free_pos) * l1_w,
            (l2_pos + free_pos + 1) * l1_w,
            allocated,
            res);
          if (empty) {
            slot_val &= ~(slot_t(1) << free_pos);
          }
          if (length <= *allocated || slot_val == all_slot_clear) {
            break;
          }
          ++free_pos;
          if (!all_set) {
            free_pos = find_next_set_bit(slot_val, free_pos);
          }
        } while (free_pos < bits_per_slot);
        last_pos = l2_pos;
        l2_pos += d;
      }
      l2_pos = 0;
      pos = 0;
      pos_end = last_pos0 / d;
    }

    ++l2_allocs;
    auto allocated_here = *allocated - prev_allocated;
    ceph_assert(available >= allocated_here);
    available -= allocated_here;
  }

#ifndef NON_CEPH_BUILD
  // to provide compatibility with BlueStore's allocator interface
  void _free_l2(const interval_set<uint64_t> & rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.first, r.second);
      uint64_t l2_pos = r.first / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }
#endif

  template <typename T>
  void _free_l2(const T& rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.offset, r.length);
      uint64_t l2_pos = r.offset / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }

  void _mark_allocated(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    auto allocated = l1._mark_alloc_l1(o, len);
    ceph_assert(available >= allocated);
    available -= allocated;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
  }

  void _mark_free(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    available += l1._free_l1(o, len);
    _mark_l2_free(l2_pos, l2_pos_end);
  }
  void _shutdown()
  {
    last_pos = 0;
  }
};

#endif