// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
#define __FAST_BITMAP_ALLOCATOR_IMPL_H
#include "include/intarith.h"

#include <vector>
#include <algorithm>
#include <mutex>

typedef uint64_t slot_t;

#ifdef NON_CEPH_BUILD
#include <assert.h>
struct interval_t
{
  uint64_t offset = 0;
  uint64_t length = 0;

  interval_t() {}
  interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
  interval_t(const interval_t &ext) :
    offset(ext.offset), length(ext.length) {}
};
typedef std::vector<interval_t> interval_vector_t;
typedef std::vector<slot_t> slot_vector_t;
#else
#include "include/ceph_assert.h"
#include "common/likely.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
#include "common/ceph_mutex.h"

typedef bluestore_interval_t<uint64_t, uint64_t> interval_t;
typedef PExtentVector interval_vector_t;

typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;

#endif

// fitting into cache line on x86_64
static const size_t slots_per_slotset = 8; // 8 slots per set
static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset;
static const size_t bits_per_slot = sizeof(slot_t) * 8;
static const size_t bits_per_slotset = slotset_bytes * 8;
static const slot_t all_slot_set = 0xffffffffffffffff;
static const slot_t all_slot_clear = 0;

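// Derived sizes for reference: slot_t is 8 bytes, so a slotset of 8 slots
// occupies 64 bytes (one x86_64 cache line) and spans 8 * 64 = 512 bits,
// i.e. 512 L0 entries per slotset.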
inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
{
#ifdef __GNUC__
  if (start_pos == 0) {
    start_pos = __builtin_ffsll(slot_val);
    return start_pos ? start_pos - 1 : bits_per_slot;
  }
#endif
  slot_t mask = slot_t(1) << start_pos;
  while (start_pos < bits_per_slot && !(slot_val & mask)) {
    mask <<= 1;
    ++start_pos;
  }
  return start_pos;
}
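// For example, find_next_set_bit(0b0100'1000, 0) returns 3 and
// find_next_set_bit(0b0100'1000, 4) returns 6; when there is no set bit at or
// after start_pos the function returns bits_per_slot (64).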


class AllocatorLevel
{
protected:

  virtual uint64_t _children_per_slot() const = 0;
  virtual uint64_t _level_granularity() const = 0;

public:
  static uint64_t l0_dives;
  static uint64_t l0_iterations;
  static uint64_t l0_inner_iterations;
  static uint64_t alloc_fragments;
  static uint64_t alloc_fragments_fast;
  static uint64_t l2_allocs;

  virtual ~AllocatorLevel()
  {}

  virtual void collect_stats(
    std::map<size_t, size_t>& bins_overall) = 0;

};

class AllocatorLevel01 : public AllocatorLevel
{
protected:
  slot_vector_t l0; // set bit means free entry
  slot_vector_t l1;
  uint64_t l0_granularity = 0; // space per entry
  uint64_t l1_granularity = 0; // space per entry

  size_t partial_l1_count = 0;
  size_t unalloc_l1_count = 0;

  double get_fragmentation() const {
    double res = 0.0;
    auto total = unalloc_l1_count + partial_l1_count;
    if (total) {
      res = double(partial_l1_count) / double(total);
    }
    return res;
  }

  uint64_t _level_granularity() const override
  {
    return l1_granularity;
  }

  inline bool _is_slot_fully_allocated(uint64_t idx) const {
    return l1[idx] == all_slot_clear;
  }
public:
  inline uint64_t get_min_alloc_size() const
  {
    return l0_granularity;
  }

};

template <class T>
class AllocatorLevel02;

class AllocatorLevel01Loose : public AllocatorLevel01
{
  enum {
    L1_ENTRY_WIDTH = 2,
    L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
    L1_ENTRY_FULL = 0x00,
    L1_ENTRY_PARTIAL = 0x01,
    L1_ENTRY_NOT_USED = 0x02,
    L1_ENTRY_FREE = 0x03,
    L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, // 32
    L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };
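  // Each 2-bit L1 entry summarizes one 512-bit L0 slotset (l1_granularity
  // bytes of space): 0b00 = fully allocated, 0b01 = partially free,
  // 0b11 = fully free (0b10 is unused), so a single 64-bit L1 slot tracks
  // 32 slotsets.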
  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }

  interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
    uint64_t min_length, interval_t* tail) const;

  inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
    uint64_t len,
    interval_vector_t* res)
  {
    auto it = res->rbegin();
    if (max_length) {
      if (it != res->rend() && it->offset + it->length == offset) {
        auto l = max_length - it->length;
        if (l >= len) {
          it->length += len;
          return;
        } else {
          offset += l;
          len -= l;
          it->length += l;
        }
      }

      while (len > max_length) {
        res->emplace_back(offset, max_length);
        offset += max_length;
        len -= max_length;
      }
      res->emplace_back(offset, len);
      return;
    }

    if (it != res->rend() && it->offset + it->length == offset) {
      it->length += len;
    } else {
      res->emplace_back(offset, len);
    }
  }
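  // For example, with max_length = 0x10000 and res ending in {0x0, 0xc000},
  // _fragment_and_emplace(0x10000, 0xc000, 0x18000, &res) first grows the
  // tail interval to 0x10000 bytes and then appends {0x10000, 0x10000} and
  // {0x20000, 0x4000}; no emitted interval ever exceeds max_length.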

  bool _allocate_l0(uint64_t length,
    uint64_t max_length,
    uint64_t l0_pos0, uint64_t l0_pos1,
    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t d0 = L0_ENTRIES_PER_SLOT;

    ++l0_dives;

    ceph_assert(l0_pos0 < l0_pos1);
    ceph_assert(length > *allocated);
    ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
    ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
    ceph_assert(((length - *allocated) % l0_granularity) == 0);

    uint64_t need_entries = (length - *allocated) / l0_granularity;

    for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
      ++idx) {
      ++l0_iterations;
      slot_t& slot_val = l0[idx];
      auto base = idx * d0;
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(need_entries, d0);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;

        _fragment_and_emplace(max_length, base * l0_granularity,
          to_alloc * l0_granularity, res);

        if (to_alloc == d0) {
          slot_val = all_slot_clear;
        } else {
          _mark_alloc_l0(base, base + to_alloc);
        }
        continue;
      }

      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      auto next_pos = free_pos + 1;
      while (next_pos < bits_per_slot &&
        (next_pos - free_pos) < need_entries) {
        ++l0_inner_iterations;

        if (0 == (slot_val & (slot_t(1) << next_pos))) {
          auto to_alloc = (next_pos - free_pos);
          *allocated += to_alloc * l0_granularity;
          ++alloc_fragments;
          need_entries -= to_alloc;
          _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
            to_alloc * l0_granularity, res);
          _mark_alloc_l0(base + free_pos, base + next_pos);
          free_pos = find_next_set_bit(slot_val, next_pos + 1);
          next_pos = free_pos + 1;
        } else {
          ++next_pos;
        }
      }
      if (need_entries && free_pos < bits_per_slot) {
        auto to_alloc = std::min(need_entries, d0 - free_pos);
        *allocated += to_alloc * l0_granularity;
        ++alloc_fragments;
        need_entries -= to_alloc;
        _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
          to_alloc * l0_granularity, res);
        _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
      }
    }
    return _is_empty_l0(l0_pos0, l0_pos1);
  }
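  // Returns true when the scanned L0 range [l0_pos0, l0_pos1) has no free
  // entries left, i.e. it is now fully allocated (see _is_empty_l0 below).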

protected:

  friend class AllocatorLevel02<AllocatorLevel01Loose>;

  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    l0_granularity = _alloc_unit;
    // 512 bits at L0 are mapped to a single L1 entry
    l1_granularity = l0_granularity * bits_per_slotset;

    // round capacity up so it has slot alignment at l1
    auto aligned_capacity =
      p2roundup((int64_t)capacity,
        int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
    size_t slot_count =
      aligned_capacity / l1_granularity / _children_per_slot();
    // we use set bit(s) as a marker for a (partially) free entry
    l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);

    // l0 slot count
    size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
    // we use set bit(s) as a marker for a (partially) free entry
    l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);

    partial_l1_count = unalloc_l1_count = 0;
    if (mark_as_free) {
      unalloc_l1_count = slot_count * _children_per_slot();
      auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity;
      _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity);
    }
  }
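  // For example, with _alloc_unit = 4 KiB this gives l0_granularity = 4 KiB
  // per L0 bit and l1_granularity = 4 KiB * 512 = 2 MiB per L1 entry; when
  // mark_as_free is set, any space between capacity and the slot-aligned
  // capacity is immediately marked allocated so it can never be handed out.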

  struct search_ctx_t
  {
    size_t partial_count = 0;
    size_t free_count = 0;
    uint64_t free_l1_pos = 0;

    uint64_t min_affordable_len = 0;
    uint64_t min_affordable_offs = 0;
    uint64_t affordable_len = 0;
    uint64_t affordable_offs = 0;

    bool fully_processed = false;

    void reset()
    {
      *this = search_ctx_t();
    }
  };
  enum {
    NO_STOP,
    STOP_ON_EMPTY,
    STOP_ON_PARTIAL,
  };
  void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
    uint64_t length, uint64_t min_length, int mode,
    search_ctx_t* ctx);

  void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
  void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);
  uint64_t _claim_free_to_left_l0(int64_t l0_pos_start);
  uint64_t _claim_free_to_right_l0(int64_t l0_pos_start);


  void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_alloc_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

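  // _mark_free_l0 below sets the free bits in three phases: the partially
  // covered head slot bit by bit, any fully covered slots wholesale via
  // all_slot_set, and finally the partially covered tail slot bit by bit.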
  void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    auto d0 = L0_ENTRIES_PER_SLOT;

    auto pos = l0_pos_start;
    slot_t bits = (slot_t)1 << (l0_pos_start % d0);
    slot_t* val_s = &l0[pos / d0];
    int64_t pos_e = std::min(l0_pos_end,
      p2roundup<int64_t>(l0_pos_start + 1, d0));
    while (pos < pos_e) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
    pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
    while (pos < pos_e) {
      *(++val_s) = all_slot_set;
      pos += d0;
    }
    bits = 1;
    ++val_s;
    while (pos < l0_pos_end) {
      *val_s |= bits;
      bits <<= 1;
      pos++;
    }
  }

  void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
  {
    _mark_free_l0(l0_pos_start, l0_pos_end);
    l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
    l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
    _mark_l1_on_l0(l0_pos_start, l0_pos_end);
  }

  bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT;
    ceph_assert(0 == (l0_pos % d));
    ceph_assert(0 == (l0_pos_end % d));

    auto idx = l0_pos / L0_ENTRIES_PER_SLOT;
    auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = l0[idx] == all_slot_clear;
      ++idx;
    }
    return no_free;
  }
  bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
  {
    bool no_free = true;
    uint64_t d = slots_per_slotset * _children_per_slot();
    ceph_assert(0 == (l1_pos % d));
    ceph_assert(0 == (l1_pos_end % d));

    auto idx = l1_pos / L1_ENTRIES_PER_SLOT;
    auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT;
    while (idx < idx_end && no_free) {
      no_free = _is_slot_fully_allocated(idx);
      ++idx;
    }
    return no_free;
  }

  interval_t _allocate_l1_contiguous(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t pos_start, uint64_t pos_end);

  bool _allocate_l1(uint64_t length,
    uint64_t min_length, uint64_t max_length,
    uint64_t l1_pos_start, uint64_t l1_pos_end,
    uint64_t* allocated,
    interval_vector_t* res);

  uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
  {
    uint64_t l0_pos_start = offset / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity;
    _mark_alloc_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

  uint64_t _free_l1(uint64_t offs, uint64_t len)
  {
    uint64_t l0_pos_start = offs / l0_granularity;
    uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity;
    _mark_free_l1_l0(l0_pos_start, l0_pos_end);
    return l0_granularity * (l0_pos_end - l0_pos_start);
  }

  uint64_t claim_free_to_left_l1(uint64_t offs)
  {
    uint64_t l0_pos_end = offs / l0_granularity;
    uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end);
    if (l0_pos_start < l0_pos_end) {
      _mark_l1_on_l0(
        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
      return l0_granularity * (l0_pos_end - l0_pos_start);
    }
    return 0;
  }

  uint64_t claim_free_to_right_l1(uint64_t offs)
  {
    uint64_t l0_pos_start = offs / l0_granularity;
    uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start);

    if (l0_pos_start < l0_pos_end) {
      _mark_l1_on_l0(
        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
      return l0_granularity * (l0_pos_end - l0_pos_start);
    }
    return 0;
  }
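  // claim_free_to_left_l1/claim_free_to_right_l1 claim the contiguous run of
  // free L0 entries immediately adjacent to offs (via the _claim_free_to_*_l0
  // helpers defined elsewhere), refresh the covering L1 entries, and return
  // the number of bytes claimed, or 0 if nothing free borders offs.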


public:
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    if (pos1 == 0) {
      pos1 = l1.size() * L1_ENTRIES_PER_SLOT;
    }
    auto avail = debug_get_free(pos0, pos1);
    return (pos1 - pos0) * l1_granularity - avail;
  }

  uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
  {
    ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
    ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));

    auto idx0 = l1_pos0 * slots_per_slotset;
    auto idx1 = l1_pos1 * slots_per_slotset;

    if (idx1 == 0) {
      idx1 = l0.size();
    }

    uint64_t res = 0;
    for (uint64_t i = idx0; i < idx1; ++i) {
      auto v = l0[i];
      if (v == all_slot_set) {
        res += L0_ENTRIES_PER_SLOT;
      } else if (v != all_slot_clear) {
        size_t cnt = 0;
#ifdef __GNUC__
        cnt = __builtin_popcountll(v);
#else
        // Kernighan's algorithm to count set bits
        while (v) {
          v &= (v - 1);
          cnt++;
        }
#endif
        res += cnt;
      }
    }
    return res * l0_granularity;
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override;

  static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
  static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
  void dump(std::function<void(uint64_t offset, uint64_t length)> notify);
};


class AllocatorLevel01Compact : public AllocatorLevel01
{
  uint64_t _children_per_slot() const override
  {
    return 8;
  }
public:
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override
  {
    // not implemented
  }
};

template <class L1>
class AllocatorLevel02 : public AllocatorLevel
{
public:
  uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }
  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
  {
    std::lock_guard l(lock);
    return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot,
      pos1 * l1._children_per_slot() * bits_per_slot);
  }

  uint64_t get_available()
  {
    std::lock_guard l(lock);
    return available;
  }
  inline uint64_t get_min_alloc_size() const
  {
    return l1.get_min_alloc_size();
  }
  void collect_stats(
    std::map<size_t, size_t>& bins_overall) override {

    std::lock_guard l(lock);
    l1.collect_stats(bins_overall);
  }
  uint64_t claim_free_to_left(uint64_t offset) {
    std::lock_guard l(lock);
    auto allocated = l1.claim_free_to_left_l1(offset);
    ceph_assert(available >= allocated);
    available -= allocated;

    uint64_t l2_pos = (offset - allocated) / l2_granularity;
    uint64_t l2_pos_end =
      p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
    return allocated;
  }

  uint64_t claim_free_to_right(uint64_t offset) {
    std::lock_guard l(lock);
    auto allocated = l1.claim_free_to_right_l1(offset);
    ceph_assert(available >= allocated);
    available -= allocated;

    uint64_t l2_pos = (offset) / l2_granularity;
    int64_t end = offset + allocated;
    uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
    return allocated;
  }
protected:
  ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock");
  L1 l1;
  slot_vector_t l2;
  uint64_t l2_granularity = 0; // space per entry
  uint64_t available = 0;
  uint64_t last_pos = 0;

  enum {
    L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
  };

  uint64_t _children_per_slot() const override
  {
    return L1_ENTRIES_PER_SLOT;
  }
  uint64_t _level_granularity() const override
  {
    return l2_granularity;
  }

  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
  {
    ceph_assert(isp2(_alloc_unit));
    l1._init(capacity, _alloc_unit, mark_as_free);

    l2_granularity =
      l1._level_granularity() * l1._children_per_slot() * slots_per_slotset;

    // round capacity up so it has slot alignment at l2
    auto aligned_capacity =
      p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
    size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
    // we use set bit(s) as a marker for a (partially) free entry
    l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);

    if (mark_as_free) {
      // mark the tail beyond the real capacity as allocated at l2
      auto l2_pos_no_use =
        p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
      _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity);
      available = p2align(capacity, _alloc_unit);
    } else {
      available = 0;
    }
  }
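  // For example, with AllocatorLevel01Loose as L1 (32 L1 entries per slot)
  // and a 4 KiB alloc unit, l2_granularity = 2 MiB * 32 * 8 = 512 MiB,
  // i.e. a single L2 bit tracks 512 MiB of space.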

  void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

  void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    while (l2_pos < l2_pos_end) {
      l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
      ++l2_pos;
    }
  }

  void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
  {
    auto d = L1_ENTRIES_PER_SLOT;
    ceph_assert(0 <= l2_pos_end);
    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));

    auto idx = l2_pos * slots_per_slotset;
    auto idx_end = l2_pos_end * slots_per_slotset;
    bool all_allocated = true;
    while (idx < idx_end) {
      if (!l1._is_slot_fully_allocated(idx)) {
        all_allocated = false;
        idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
      }
      else {
        ++idx;
      }
      if ((idx % slots_per_slotset) == 0) {
        if (all_allocated) {
          l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
        }
        else {
          l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
        }
        all_allocated = true;
        ++l2_pos;
      }
    }
  }
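  // _mark_l2_on_l1 above recomputes the L2 bits for [l2_pos, l2_pos_end) from
  // the underlying L1 slots: a bit is cleared only when every one of the
  // slots_per_slotset L1 slots it covers is fully allocated, otherwise it is
  // set to indicate remaining free space.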

  void _allocate_l2(uint64_t length,
    uint64_t min_length,
    uint64_t max_length,
    uint64_t hint,

    uint64_t* allocated,
    interval_vector_t* res)
  {
    uint64_t prev_allocated = *allocated;
    uint64_t d = L1_ENTRIES_PER_SLOT;
    ceph_assert(min_length <= l2_granularity);
    ceph_assert(max_length == 0 || max_length >= min_length);
    ceph_assert(max_length == 0 || (max_length % min_length) == 0);
    ceph_assert(length >= min_length);
    ceph_assert((length % min_length) == 0);

    uint64_t cap = 1ull << 31;
    if (max_length == 0 || max_length >= cap) {
      max_length = cap;
    }

    uint64_t l1_w = slots_per_slotset * l1._children_per_slot();

    std::lock_guard l(lock);

    if (available < min_length) {
      return;
    }
    if (hint != 0) {
      last_pos = (hint / d) < l2.size() ? p2align(hint, d) : 0;
    }
    auto l2_pos = last_pos;
    auto last_pos0 = last_pos;
    auto pos = last_pos / d;
    auto pos_end = l2.size();
    // The outer loop below exists to avoid 'modulo' operations inside the
    // inner loop, since they appear to have a noticeable performance cost.
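    // The two passes also implement wrap-around: the first pass scans from
    // last_pos to the end of the L2 bitmap, the second restarts from 0 and
    // stops where the search originally began (last_pos0).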
    for (auto i = 0; i < 2; ++i) {
      for (; length > *allocated && pos < pos_end; ++pos) {
        slot_t& slot_val = l2[pos];
        size_t free_pos = 0;
        bool all_set = false;
        if (slot_val == all_slot_clear) {
          l2_pos += d;
          last_pos = l2_pos;
          continue;
        } else if (slot_val == all_slot_set) {
          free_pos = 0;
          all_set = true;
        } else {
          free_pos = find_next_set_bit(slot_val, 0);
          ceph_assert(free_pos < bits_per_slot);
        }
        do {
          ceph_assert(length > *allocated);
          bool empty = l1._allocate_l1(length,
            min_length,
            max_length,
            (l2_pos + free_pos) * l1_w,
            (l2_pos + free_pos + 1) * l1_w,
            allocated,
            res);
          if (empty) {
            slot_val &= ~(slot_t(1) << free_pos);
          }
          if (length <= *allocated || slot_val == all_slot_clear) {
            break;
          }
          ++free_pos;
          if (!all_set) {
            free_pos = find_next_set_bit(slot_val, free_pos);
          }
        } while (free_pos < bits_per_slot);
        last_pos = l2_pos;
        l2_pos += d;
      }
      l2_pos = 0;
      pos = 0;
      pos_end = last_pos0 / d;
    }

    ++l2_allocs;
    auto allocated_here = *allocated - prev_allocated;
    ceph_assert(available >= allocated_here);
    available -= allocated_here;
  }

#ifndef NON_CEPH_BUILD
  // to provide compatibility with BlueStore's allocator interface
  void _free_l2(const interval_set<uint64_t> & rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.first, r.second);
      uint64_t l2_pos = r.first / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }
#endif

  template <typename T>
  void _free_l2(const T& rr)
  {
    uint64_t released = 0;
    std::lock_guard l(lock);
    for (auto r : rr) {
      released += l1._free_l1(r.offset, r.length);
      uint64_t l2_pos = r.offset / l2_granularity;
      uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity;

      _mark_l2_free(l2_pos, l2_pos_end);
    }
    available += released;
  }

  void _mark_allocated(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    auto allocated = l1._mark_alloc_l1(o, len);
    ceph_assert(available >= allocated);
    available -= allocated;
    _mark_l2_on_l1(l2_pos, l2_pos_end);
  }

  void _mark_free(uint64_t o, uint64_t len)
  {
    uint64_t l2_pos = o / l2_granularity;
    uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;

    std::lock_guard l(lock);
    available += l1._free_l1(o, len);
    _mark_l2_free(l2_pos, l2_pos_end);
  }
  void _shutdown()
  {
    last_pos = 0;
  }
  double _get_fragmentation() {
    std::lock_guard l(lock);
    return l1.get_fragmentation();
  }
};

#endif