// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
#define __FAST_BITMAP_ALLOCATOR_IMPL_H
#include "include/intarith.h"

#include <vector>
#include <algorithm>
#include <mutex>

typedef uint64_t slot_t;

#ifdef NON_CEPH_BUILD
#include <assert.h>
struct interval_t
{
  uint64_t offset = 0;
  uint64_t length = 0;

  interval_t() {}
  interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
  interval_t(const interval_t &ext) :
    offset(ext.offset), length(ext.length) {}
};
typedef std::vector<interval_t> interval_vector_t;
typedef std::vector<slot_t> slot_vector_t;
#else
#include "include/ceph_assert.h"
#include "common/likely.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
#include "common/ceph_mutex.h"

typedef bluestore_interval_t<uint64_t, uint64_t> interval_t;
typedef PExtentVector interval_vector_t;

typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;

#endif

// fitting into cache line on x86_64
static const size_t slots_per_slotset = 8; // 8 slots per set
static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset;
static const size_t bits_per_slot = sizeof(slot_t) * 8;
static const size_t bits_per_slotset = slotset_bytes * 8;
static const slot_t all_slot_set = 0xffffffffffffffff;
static const slot_t all_slot_clear = 0;

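// Returns the index of the lowest set bit in slot_val at or after start_pos,
// or bits_per_slot if no set bit remains.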
inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
{
#ifdef __GNUC__
  if (start_pos == 0) {
    start_pos = __builtin_ffsll(slot_val);
    return start_pos ? start_pos - 1 : bits_per_slot;
  }
#endif
  slot_t mask = slot_t(1) << start_pos;
  while (start_pos < bits_per_slot && !(slot_val & mask)) {
    mask <<= 1;
    ++start_pos;
  }
  return start_pos;
}

class AllocatorLevel
{
protected:

  virtual uint64_t _children_per_slot() const = 0;
  virtual uint64_t _level_granularity() const = 0;

public:
  static uint64_t l0_dives;
  static uint64_t l0_iterations;
  static uint64_t l0_inner_iterations;
  static uint64_t alloc_fragments;
  static uint64_t alloc_fragments_fast;
  static uint64_t l2_allocs;

  virtual ~AllocatorLevel()
  {}

  virtual void collect_stats(
    std::map<size_t, size_t>& bins_overall) = 0;

};

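// Common state for the two lowest levels: L0 tracks individual allocation
// units (a set bit marks a free unit) and L1 summarizes L0 slotsets so
// searches can skip fully allocated regions.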
class AllocatorLevel01 : public AllocatorLevel
{
protected:
  slot_vector_t l0; // set bit means free entry
  slot_vector_t l1;
  uint64_t l0_granularity = 0; // space per entry
  uint64_t l1_granularity = 0; // space per entry

  size_t partial_l1_count = 0;
  size_t unalloc_l1_count = 0;

  double get_fragmentation() const {
    double res = 0.0;
    auto total = unalloc_l1_count + partial_l1_count;
    if (total) {
      res = double(partial_l1_count) / double(total);
    }
    return res;
  }

  uint64_t _level_granularity() const override
  {
    return l1_granularity;
  }

  inline bool _is_slot_fully_allocated(uint64_t idx) const {
    return l1[idx] == all_slot_clear;
  }
public:
  inline uint64_t get_min_alloc_size() const
  {
    return l0_granularity;
  }

};

template <class T>
class AllocatorLevel02;

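// L1/L0 implementation where each L1 entry is 2 bits wide and encodes whether
// the corresponding L0 slotset is fully allocated, partially free or free.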
class AllocatorLevel01Loose : public AllocatorLevel01
{
  enum {
    L1_ENTRY_WIDTH = 2,
    L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
    L1_ENTRY_FULL = 0x00,
    L1_ENTRY_PARTIAL = 0x01,
    L1_ENTRY_NOT_USED = 0x02,
    L1_ENTRY_FREE = 0x03,
    L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32
    L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };
  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }

  interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
    uint64_t min_length, interval_t* tail) const;

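  // Appends [offset, offset + len) to *res, merging with the preceding
  // interval when contiguous and splitting the range into pieces no longer
  // than max_length (0 = unlimited).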
  inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
    uint64_t len,
    interval_vector_t* res)
  {
    auto it = res->rbegin();
    if (max_length) {
      if (it != res->rend() && it->offset + it->length == offset) {
        auto l = max_length - it->length;
        if (l >= len) {
          it->length += len;
          return;
        } else {
          offset += l;
          len -= l;
          it->length += l;
        }
      }

      while (len > max_length) {
        res->emplace_back(offset, max_length);
        offset += max_length;
        len -= max_length;
      }
      res->emplace_back(offset, len);
      return;
    }

    if (it != res->rend() && it->offset + it->length == offset) {
      it->length += len;
    } else {
      res->emplace_back(offset, len);
    }
  }

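  // Walks L0 slots in [l0_pos0, l0_pos1), claims free bits until 'length'
  // bytes have been allocated (or the range is exhausted), appends the
  // resulting extents to *res and returns true if the range is now fully
  // allocated.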
  bool _allocate_l0(uint64_t length,
    uint64_t max_length,
    uint64_t l0_pos0, uint64_t l0_pos1,
    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t d0 = L0_ENTRIES_PER_SLOT;

    ++l0_dives;

    ceph_assert(l0_pos0 < l0_pos1);
    ceph_assert(length > *allocated);
    ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
    ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
    ceph_assert(((length - *allocated) % l0_granularity) == 0);

    uint64_t need_entries = (length - *allocated) / l0_granularity;

    for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
      ++idx) {
      ++l0_iterations;
      slot_t& slot_val = l0[idx];
      auto base = idx * d0;
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(need_entries, d0);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;

        _fragment_and_emplace(max_length, base * l0_granularity,
          to_alloc * l0_granularity, res);

        if (to_alloc == d0) {
          slot_val = all_slot_clear;
        } else {
          _mark_alloc_l0(base, base + to_alloc);
        }
        continue;
      }

      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      auto next_pos = free_pos + 1;
      while (next_pos < bits_per_slot &&
        (next_pos - free_pos) < need_entries) {
        ++l0_inner_iterations;

        if (0 == (slot_val & (slot_t(1) << next_pos))) {
          auto to_alloc = (next_pos - free_pos);
          *allocated += to_alloc * l0_granularity;
          ++alloc_fragments;
          need_entries -= to_alloc;
          _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
            to_alloc * l0_granularity, res);
          _mark_alloc_l0(base + free_pos, base + next_pos);
          free_pos = find_next_set_bit(slot_val, next_pos + 1);
          next_pos = free_pos + 1;
        } else {
          ++next_pos;
        }
      }
      if (need_entries && free_pos < bits_per_slot) {
        auto to_alloc = std::min(need_entries, d0 - free_pos);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;
        _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
          to_alloc * l0_granularity, res);
        _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
      }
    }
    return _is_empty_l0(l0_pos0, l0_pos1);
  }

protected:

  friend class AllocatorLevel02<AllocatorLevel01Loose>;

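  // Sizes L0/L1 for 'capacity' bytes at an allocation unit of '_alloc_unit'.
  // With mark_as_free set, space up to 'capacity' starts out free and any
  // alignment padding beyond it is marked allocated.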
  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    l0_granularity = _alloc_unit;
    // 512 bits at L0 mapped to L1 entry
    l1_granularity = l0_granularity * bits_per_slotset;

    // capacity to have slot alignment at l1
    auto aligned_capacity =
      p2roundup((int64_t)capacity,
        int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
    size_t slot_count =
      aligned_capacity / l1_granularity / _children_per_slot();
    // we use set bit(s) as a marker for (partially) free entry
    l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);

    // l0 slot count
    size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
    // we use set bit(s) as a marker for (partially) free entry
    l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);

    partial_l1_count = unalloc_l1_count = 0;
    if (mark_as_free) {
      unalloc_l1_count = slot_count * _children_per_slot();
      auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity;
      _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity);
    }
  }

  struct search_ctx_t
  {
    size_t partial_count = 0;
    size_t free_count = 0;
    uint64_t free_l1_pos = 0;

    uint64_t min_affordable_len = 0;
    uint64_t min_affordable_offs = 0;
    uint64_t affordable_len = 0;
    uint64_t affordable_offs = 0;

    bool fully_processed = false;

    void reset()
    {
      *this = search_ctx_t();
    }
  };
  enum {
    NO_STOP,
    STOP_ON_EMPTY,
    STOP_ON_PARTIAL,
  };
  void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
    uint64_t length, uint64_t min_length, int mode,
    search_ctx_t* ctx);

  void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
  void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);

  void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_alloc_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

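  // Marks L0 bits in [l0_pos_start, l0_pos_end) as free in three phases:
  // the leading partial slot bit by bit, fully covered slots as a whole,
  // then the trailing partial slot bit by bit.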
  void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    auto d0 = L0_ENTRIES_PER_SLOT;

    auto pos = l0_pos_start;
    slot_t bits = (slot_t)1 << (l0_pos_start % d0);
    slot_t* val_s = &l0[pos / d0];
    int64_t pos_e = std::min(l0_pos_end,
      p2roundup<int64_t>(l0_pos_start + 1, d0));
    while (pos < pos_e) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
    pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
    while (pos < pos_e) {
      *(++val_s) = all_slot_set;
      pos += d0;
    }
    bits = 1;
    ++val_s;
    while (pos < l0_pos_end) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
  }

  void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_free_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

  bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT;
    ceph_assert(0 == (l0_pos % d));
    ceph_assert(0 == (l0_pos_end % d));

    auto idx = l0_pos / L0_ENTRIES_PER_SLOT;
    auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = l0[idx] == all_slot_clear;
      ++idx;
    }
    return no_free;
  }
  bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * _children_per_slot();
    ceph_assert(0 == (l1_pos % d));
    ceph_assert(0 == (l1_pos_end % d));

    auto idx = l1_pos / L1_ENTRIES_PER_SLOT;
    auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = _is_slot_fully_allocated(idx);
      ++idx;
    }
    return no_free;
  }

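  // Out-of-line allocation entry points over an L1 position range:
  // _allocate_l1_contiguous returns a single extent, _allocate_l1 may emit
  // several extents into *res.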
  interval_t _allocate_l1_contiguous(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t pos_start, uint64_t pos_end);

  bool _allocate_l1(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t l1_pos_start, uint64_t l1_pos_end,
    uint64_t* allocated,
    interval_vector_t* res);

  uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
  {
    uint64_t l0_pos_start = offset / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity;
    _mark_alloc_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

  uint64_t _free_l1(uint64_t offs, uint64_t len)
  {
    uint64_t l0_pos_start = offs / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity;
    _mark_free_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

public:
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    if (pos1 == 0) {
      pos1 = l1.size() * L1_ENTRIES_PER_SLOT;
    }
    auto avail = debug_get_free(pos0, pos1);
    return (pos1 - pos0) * l1_granularity - avail;
  }

  uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
  {
    ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
    ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));

    auto idx0 = l1_pos0 * slots_per_slotset;
    auto idx1 = l1_pos1 * slots_per_slotset;

    if (idx1 == 0) {
      idx1 = l0.size();
    }

    uint64_t res = 0;
    for (uint64_t i = idx0; i < idx1; ++i) {
      auto v = l0[i];
      if (v == all_slot_set) {
        res += L0_ENTRIES_PER_SLOT;
      } else if (v != all_slot_clear) {
        size_t cnt = 0;
#ifdef __GNUC__
        cnt = __builtin_popcountll(v);
#else
        // Kernighan's Alg to count set bits
        while (v) {
          v &= (v - 1);
          cnt++;
        }
#endif
        res += cnt;
      }
    }
    return res * l0_granularity;
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override;

  static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
  static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
  void dump(std::function<void(uint64_t offset, uint64_t length)> notify);
};


class AllocatorLevel01Compact : public AllocatorLevel01
{
  uint64_t _children_per_slot() const override
  {
    return 8;
  }
public:
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override
  {
    // not implemented
  }
};

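// Top-level (L2) allocator: adds a third bitmap layer on top of an embedded
// L1/L0 instance, serializes all operations via 'lock' and tracks the
// remaining free space in 'available'.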
template <class L1>
class AllocatorLevel02 : public AllocatorLevel
{
public:
  uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }

  uint64_t get_available()
  {
    std::lock_guard l(lock);
    return available;
  }
  inline uint64_t get_min_alloc_size() const
  {
    return l1.get_min_alloc_size();
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override {

    std::lock_guard l(lock);
    l1.collect_stats(bins_overall);
  }

protected:
  ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock");
  L1 l1;
  slot_vector_t l2;
  uint64_t l2_granularity = 0; // space per entry
  uint64_t available = 0;
  uint64_t last_pos = 0;

  enum {
    L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };

  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }
  uint64_t _level_granularity() const override
  {
    return l2_granularity;
  }

  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    ceph_assert(isp2(_alloc_unit));
    l1._init(capacity, _alloc_unit, mark_as_free);

    l2_granularity =
      l1._level_granularity() * l1._children_per_slot() * slots_per_slotset;

    // capacity to have slot alignment at l2
    auto aligned_capacity =
      p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
    size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
    // we use set bit(s) as a marker for (partially) free entry
    l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);

    if (mark_as_free) {
      // capacity to have slotset alignment at l1
      auto l2_pos_no_use =
        p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
      _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity);
      available = p2align(capacity, _alloc_unit);
    } else {
      available = 0;
    }
  }

  void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

  void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

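  // Recomputes L2 bits for [l2_pos, l2_pos_end) from the underlying L1 state:
  // an L2 bit is cleared only when every L1 slot in its slotset is fully
  // allocated, otherwise it is set.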
  void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    auto idx = l2_pos * slots_per_slotset;
    auto idx_end = l2_pos_end * slots_per_slotset;
    bool all_allocated = true;
    while (idx < idx_end) {
      if (!l1._is_slot_fully_allocated(idx)) {
        all_allocated = false;
        idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
      }
      else {
        ++idx;
      }
      if ((idx % slots_per_slotset) == 0) {
        if (all_allocated) {
          l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
        }
        else {
          l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
        }
        all_allocated = true;
        ++l2_pos;
      }
    }
  }

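  // Top-level allocation: scans the L2 bitmap starting near last_pos (or
  // 'hint'), wrapping around once, and dives into L1/L0 for every L2 entry
  // with free space until 'length' bytes have been gathered.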
  void _allocate_l2(uint64_t length,
    uint64_t min_length,
    uint64_t max_length,
    uint64_t hint,

    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t prev_allocated = *allocated;
    uint64_t d = L1_ENTRIES_PER_SLOT;
    ceph_assert(isp2(min_length));
    ceph_assert(min_length <= l2_granularity);
    ceph_assert(max_length == 0 || max_length >= min_length);
    ceph_assert(max_length == 0 || (max_length % min_length) == 0);
    ceph_assert(length >= min_length);
    ceph_assert((length % min_length) == 0);

    uint64_t cap = 1ull << 31;
    if (max_length == 0 || max_length >= cap) {
      max_length = cap;
    }

    uint64_t l1_w = slots_per_slotset * l1._children_per_slot();

    std::lock_guard l(lock);

    if (available < min_length) {
      return;
    }
    if (hint != 0) {
      last_pos = (hint / d) < l2.size() ? p2align(hint, d) : 0;
    }
    auto l2_pos = last_pos;
    auto last_pos0 = last_pos;
    auto pos = last_pos / d;
    auto pos_end = l2.size();
    // outer loop below is intended to optimize the performance by
    // avoiding 'modulo' operations inside the internal loop.
    // Looks like they have negative impact on the performance
    for (auto i = 0; i < 2; ++i) {
      for(; length > *allocated && pos < pos_end; ++pos) {
        slot_t& slot_val = l2[pos];
        size_t free_pos = 0;
        bool all_set = false;
        if (slot_val == all_slot_clear) {
          l2_pos += d;
          last_pos = l2_pos;
          continue;
        } else if (slot_val == all_slot_set) {
          free_pos = 0;
          all_set = true;
        } else {
          free_pos = find_next_set_bit(slot_val, 0);
          ceph_assert(free_pos < bits_per_slot);
        }
        do {
          ceph_assert(length > *allocated);
          bool empty = l1._allocate_l1(length,
            min_length,
            max_length,
            (l2_pos + free_pos) * l1_w,
            (l2_pos + free_pos + 1) * l1_w,
            allocated,
            res);
          if (empty) {
            slot_val &= ~(slot_t(1) << free_pos);
          }
          if (length <= *allocated || slot_val == all_slot_clear) {
            break;
          }
          ++free_pos;
          if (!all_set) {
            free_pos = find_next_set_bit(slot_val, free_pos);
          }
        } while (free_pos < bits_per_slot);
        last_pos = l2_pos;
        l2_pos += d;
      }
      l2_pos = 0;
      pos = 0;
      pos_end = last_pos0 / d;
    }

    ++l2_allocs;
    auto allocated_here = *allocated - prev_allocated;
    ceph_assert(available >= allocated_here);
    available -= allocated_here;
  }

#ifndef NON_CEPH_BUILD
  // to provide compatibility with BlueStore's allocator interface
  void _free_l2(const interval_set<uint64_t> & rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.first, r.second);
      uint64_t l2_pos = r.first / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }
#endif

  template <typename T>
  void _free_l2(const T& rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.offset, r.length);
      uint64_t l2_pos = r.offset / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }

  void _mark_allocated(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    auto allocated = l1._mark_alloc_l1(o, len);
    ceph_assert(available >= allocated);
    available -= allocated;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
  }

  void _mark_free(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    available += l1._free_l1(o, len);
    _mark_l2_free(l2_pos, l2_pos_end);
  }
  void _shutdown()
  {
    last_pos = 0;
  }
  double _get_fragmentation() {
    std::lock_guard l(lock);
    return l1.get_fragmentation();
  }
};

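// A minimal usage sketch (illustrative only; the type and method names below
// are hypothetical). A concrete allocator is expected to derive from
// AllocatorLevel02<AllocatorLevel01Loose> and forward its public interface to
// the protected _init/_allocate_l2/_mark_allocated/_mark_free/_free_l2
// helpers, roughly as BlueStore's BitmapAllocator does:
//
//   struct ExampleAllocator : public AllocatorLevel02<AllocatorLevel01Loose> {
//     ExampleAllocator(uint64_t capacity, uint64_t alloc_unit) {
//       _init(capacity, alloc_unit);  // everything up to 'capacity' starts free
//     }
//     void allocate(uint64_t want, uint64_t unit, interval_vector_t* out) {
//       uint64_t allocated = 0;
//       // 'want' must be a multiple of 'unit'; 0 = no max_length, 0 = no hint
//       _allocate_l2(want, unit, 0, 0, &allocated, out);
//     }
//     void release(uint64_t offset, uint64_t length) {
//       _mark_free(offset, length);
//     }
//   };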
#endif