// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#include "fastbmap_allocator_impl.h"
uint64_t AllocatorLevel::l0_dives = 0;
uint64_t AllocatorLevel::l0_iterations = 0;
uint64_t AllocatorLevel::l0_inner_iterations = 0;
uint64_t AllocatorLevel::alloc_fragments = 0;
uint64_t AllocatorLevel::alloc_fragments_fast = 0;
uint64_t AllocatorLevel::l2_allocs = 0;
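
// Aligns a free extent to min_length boundaries: the offset is rounded up and
// the remaining length rounded down to multiples of min_length; an empty
// interval is returned if nothing usable remains.
// Illustrative example (not from the original source), with min_length = 0x1000:
//   _align2units(0x1800, 0x3000, 0x1000) -> { offset = 0x2000, length = 0x2000 }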
inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
{
  interval_t res;
  if (len >= min_length) {
    res.offset = p2roundup(offset, min_length);
    auto delta_off = res.offset - offset;
    if (len > delta_off) {
      res.length = len - delta_off;
      res.length = p2align<uint64_t>(res.length, min_length);
      if (res.length) {
        return res;
      }
    }
  }
  return interval_t();
}
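
// Scans L0 entries in [pos0, pos1) and returns the longest free extent that is
// still usable after alignment to min_length. A free run that reaches pos1 is
// also reported via *tail so the caller can merge it with the following range.
// The scan operates on l0_granularity units; the returned interval and *tail
// are converted back to bytes before returning.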
interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
  uint64_t pos1, uint64_t min_length, interval_t* tail) const
{
  interval_t res;
  if (pos0 >= pos1) {
    return res;
  }
  auto pos = pos0;

  interval_t res_candidate;
  if (tail->length != 0) {
    ceph_assert((tail->offset % l0_granularity) == 0);
    ceph_assert((tail->length % l0_granularity) == 0);
    res_candidate.offset = tail->offset / l0_granularity;
    res_candidate.length = tail->length / l0_granularity;
  }
  *tail = interval_t();

  auto d = bits_per_slot;
  slot_t bits = l0[pos / d];
  bits >>= pos % d;
  bool end_loop = false;
  auto min_granules = min_length / l0_granularity;

  do {
    if ((pos % d) == 0) {
      bits = l0[pos / d];
      if (pos1 - pos >= d) {
        switch(bits) {
          case all_slot_set:
            // slot is totally free
            if (!res_candidate.length) {
              res_candidate.offset = pos;
            }
            res_candidate.length += d;
            pos += d;
            end_loop = pos >= pos1;
            if (end_loop) {
              *tail = res_candidate;
              res_candidate = _align2units(res_candidate.offset,
                res_candidate.length, min_granules);
              if (res.length < res_candidate.length) {
                res = res_candidate;
              }
            }
            continue;
          case all_slot_clear:
            // slot is totally allocated
            res_candidate = _align2units(res_candidate.offset,
              res_candidate.length, min_granules);
            if (res.length < res_candidate.length) {
              res = res_candidate;
            }
            res_candidate = interval_t();
            pos += d;
            end_loop = pos >= pos1;
            continue;
        }
      }
    } //if ((pos % d) == 0)

    end_loop = ++pos >= pos1;
    if (bits & 1) {
      // item is free
      if (!res_candidate.length) {
        res_candidate.offset = pos - 1;
      }
      ++res_candidate.length;
      if (end_loop) {
        *tail = res_candidate;
        res_candidate = _align2units(res_candidate.offset,
          res_candidate.length, min_granules);
        if (res.length < res_candidate.length) {
          res = res_candidate;
        }
      }
    } else {
      res_candidate = _align2units(res_candidate.offset,
        res_candidate.length, min_granules);
      if (res.length < res_candidate.length) {
        res = res_candidate;
      }
      res_candidate = interval_t();
    }
    bits >>= 1;
  } while (!end_loop);
  res.offset *= l0_granularity;
  res.length *= l0_granularity;
  tail->offset *= l0_granularity;
  tail->length *= l0_granularity;
  return res;
}
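
// Walks L1 entries in [pos_start, pos_end) and fills *ctx with allocation
// candidates: the position and size of a run of fully free L1 entries, the
// best partially free slot set that can hold 'length', and the best one that
// can hold at least 'min_length'. 'mode' allows an early return once an empty
// (STOP_ON_EMPTY) or partial (STOP_ON_PARTIAL) entry has been found;
// ctx->fully_processed is set only when the whole range was examined.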
void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
  uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
  search_ctx_t* ctx)
{
  auto d = L1_ENTRIES_PER_SLOT;
  ceph_assert((pos_start % d) == 0);
  ceph_assert((pos_end % d) == 0);

  uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;

  uint64_t l1_pos = pos_start;
  const interval_t empty_tail;
  interval_t prev_tail;

  uint64_t next_free_l1_pos = 0;
  for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
    slot_t slot_val = l1[pos];
    // FIXME minor: code below can be optimized to check slot_val against
    // all_slot_set(_clear) value

    for (auto c = 0; c < d; c++) {
      switch (slot_val & L1_ENTRY_MASK) {
      case L1_ENTRY_FREE:
        prev_tail = empty_tail;
        if (!ctx->free_count) {
          ctx->free_l1_pos = l1_pos;
        } else if (l1_pos != next_free_l1_pos) {
          auto o = ctx->free_l1_pos * l1_granularity;
          auto l = ctx->free_count * l1_granularity;
          // check if already found extent fits min_length after alignment
          if (_align2units(o, l, min_length).length >= min_length) {
            break;
          }
          // if not - proceed with the next one
          ctx->free_l1_pos = l1_pos;
          ctx->free_count = 0;
        }
        next_free_l1_pos = l1_pos + 1;
        ++ctx->free_count;
        if (mode == STOP_ON_EMPTY) {
          return;
        }
        break;
      case L1_ENTRY_FULL:
        prev_tail = empty_tail;
        break;
      case L1_ENTRY_PARTIAL:
        interval_t longest;
        ++ctx->partial_count;

        longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);

        if (longest.length >= length) {
          if ((ctx->affordable_len == 0) ||
              ((ctx->affordable_len != 0) &&
                (longest.length < ctx->affordable_len))) {
            ctx->affordable_len = longest.length;
            ctx->affordable_offs = longest.offset;
          }
        }
        if (longest.length >= min_length &&
            (ctx->min_affordable_len == 0 ||
              (longest.length < ctx->min_affordable_len))) {

          ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
          ctx->min_affordable_offs = longest.offset;
        }
        if (mode == STOP_ON_PARTIAL) {
          return;
        }
        break;
      }
      slot_val >>= L1_ENTRY_WIDTH;
      ++l1_pos;
    }
  }
  ctx->fully_processed = true;
}
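
// Propagates the state of L0 bits in [l0_pos, l0_pos_end) up to the L1 map:
// each slot set is classified as FREE, FULL or PARTIAL and the corresponding
// L1 entry is rewritten, keeping unalloc_l1_count/partial_l1_count in sync.
// The range is expected to be aligned to slot set boundaries.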
void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
{
  if (l0_pos == l0_pos_end) {
    return;
  }
  auto d0 = bits_per_slotset;
  uint64_t l1_w = L1_ENTRIES_PER_SLOT;
  // this should be aligned with slotset boundaries
  ceph_assert(0 == (l0_pos % d0));
  ceph_assert(0 == (l0_pos_end % d0));

  int64_t idx = l0_pos / bits_per_slot;
  int64_t idx_end = l0_pos_end / bits_per_slot;
  slot_t mask_to_apply = L1_ENTRY_NOT_USED;

  auto l1_pos = l0_pos / d0;

  while (idx < idx_end) {
    if (l0[idx] == all_slot_clear) {
      // if not all prev slots are allocated then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FULL;
      } else if (mask_to_apply != L1_ENTRY_FULL) {
        idx = p2roundup(idx, int64_t(slots_per_slotset));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else if (l0[idx] == all_slot_set) {
      // if not all prev slots are free then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FREE;
      } else if (mask_to_apply != L1_ENTRY_FREE) {
        idx = p2roundup(idx, int64_t(slots_per_slotset));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else {
      // no need to check the current slot set, it's partial
      mask_to_apply = L1_ENTRY_PARTIAL;
      ++idx;
      idx = p2roundup(idx, int64_t(slots_per_slotset));
    }
    if ((idx % slots_per_slotset) == 0) {
      ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
      uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
      slot_t& slot_val = l1[l1_pos / l1_w];
      auto mask = slot_t(L1_ENTRY_MASK) << shift;

      slot_t old_mask = (slot_val & mask) >> shift;
      switch(old_mask) {
      case L1_ENTRY_FREE:
        unalloc_l1_count--;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count--;
        break;
      }
      slot_val &= ~mask;
      slot_val |= slot_t(mask_to_apply) << shift;
      switch(mask_to_apply) {
      case L1_ENTRY_FREE:
        unalloc_l1_count++;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count++;
        break;
      }
      mask_to_apply = L1_ENTRY_NOT_USED;
      ++l1_pos;
    }
  }
}
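
// Clears (marks as allocated) L0 bits in [l0_pos_start, l0_pos_end): the
// unaligned head and tail are handled bit by bit, while fully covered slots
// are zeroed with a single store each.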
void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
  int64_t l0_pos_end)
{
  auto d0 = L0_ENTRIES_PER_SLOT;

  int64_t pos = l0_pos_start;
  slot_t bits = (slot_t)1 << (l0_pos_start % d0);
  slot_t* val_s = l0.data() + (pos / d0);
  int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0));
  while (pos < pos_e) {
    (*val_s) &= ~bits;
    bits <<= 1;
    pos++;
  }
  pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
  while (pos < pos_e) {
    *(++val_s) = all_slot_clear;
    pos += d0;
  }
  bits = 1;
  ++val_s;
  while (pos < l0_pos_end) {
    (*val_s) &= ~bits;
    bits <<= 1;
    pos++;
  }
}
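
// Attempts to carve a single contiguous extent of up to 'length' bytes out of
// the L1 range [pos_start, pos_end). The strategy depends on the requested
// size: minimal (<= l0_granularity) requests prefer partially used slot sets,
// single-L1-entry requests prefer fully free entries, and larger requests fall
// back to the best candidates collected by _analyze_partials. Returns an empty
// interval when nothing suitable is found.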
interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t pos_start, uint64_t pos_end)
{
  interval_t res = { 0, 0 };
  uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;

  if (unlikely(length <= l0_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
      STOP_ON_PARTIAL, &ctx);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      // allocate as specified
      ceph_assert(ctx.affordable_len >= length);
      auto pos = ctx.affordable_offs / l0_granularity;
      _mark_alloc_l1_l0(pos, pos + 1);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }

    // allocate from free slot sets
    if (ctx.free_count) {
      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
      return res;
    }
  } else if (unlikely(length == l1_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);

    // allocate using contiguous extent found at l1 if any
    if (ctx.free_count) {

      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
      return res;
    }

    // we can terminate earlier on free entry only
    ceph_assert(ctx.fully_processed);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  } else {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
    ceph_assert(ctx.fully_processed);
    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    // allocate using contiguous extent found at l1 if affordable
    // align allocated extent with min_length
    if (ctx.free_count) {
      auto o = ctx.free_l1_pos * l1_granularity;
      auto l = ctx.free_count * l1_granularity;
      interval_t aligned_extent = _align2units(o, l, min_length);
      if (aligned_extent.length > 0) {
        aligned_extent.length = std::min(length,
          uint64_t(aligned_extent.length));
        ceph_assert((aligned_extent.offset % l0_granularity) == 0);
        ceph_assert((aligned_extent.length % l0_granularity) == 0);

        auto pos_start = aligned_extent.offset / l0_granularity;
        auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;

        _mark_alloc_l1_l0(pos_start, pos_end);
        return aligned_extent;
      }
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  }
  return res;
}
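
// Allocates up to 'length' bytes within the L1 range [l1_pos_start,
// l1_pos_end), appending the resulting fragments to *res and advancing
// *allocated. For min_length > l0_granularity it repeatedly takes contiguous
// extents via _allocate_l1_contiguous; otherwise it walks L1 slots directly,
// taking fully free slots wholesale and descending into partial ones via
// _allocate_l0. The return value is whatever _is_empty_l1 reports for the
// range afterwards, i.e. whether it ended up with no free entries left.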
bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t l1_pos_start, uint64_t l1_pos_end,
  uint64_t* allocated,
  interval_vector_t* res)
{
  uint64_t d0 = L0_ENTRIES_PER_SLOT;
  uint64_t d1 = L1_ENTRIES_PER_SLOT;

  ceph_assert(0 == (l1_pos_start % (slots_per_slotset * d1)));
  ceph_assert(0 == (l1_pos_end % (slots_per_slotset * d1)));
  if (min_length != l0_granularity) {
    // probably not the most efficient way but
    // don't care much about that at the moment
    bool has_space = true;
    while (length > *allocated && has_space) {
      interval_t i =
        _allocate_l1_contiguous(length - *allocated, min_length, max_length,
          l1_pos_start, l1_pos_end);
      if (i.length == 0) {
        has_space = false;
      } else {
        _fragment_and_emplace(max_length, i.offset, i.length, res);
        *allocated += i.length;
      }
    }
  } else {
    uint64_t l0_w = slots_per_slotset * d0;

    for (auto idx = l1_pos_start / d1;
      idx < l1_pos_end / d1 && length > *allocated;
      ++idx) {
      slot_t& slot_val = l1[idx];
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(length - *allocated,
          l1_granularity * d1);
        *allocated += to_alloc;
        ++alloc_fragments_fast;
        _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
          res);
        _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
          idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
        continue;
      }
      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      do {
        ceph_assert(length > *allocated);

        bool empty;
        empty = _allocate_l0(length, max_length,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
          allocated,
          res);

        auto mask = slot_t(L1_ENTRY_MASK) << free_pos;

        slot_t old_mask = (slot_val & mask) >> free_pos;
        switch(old_mask) {
        case L1_ENTRY_FREE:
          unalloc_l1_count--;
          break;
        case L1_ENTRY_PARTIAL:
          partial_l1_count--;
          break;
        }
        slot_val &= ~mask;
        if (empty) {
          // the next line is no op with the current L1_ENTRY_FULL but left
          // as-is for the sake of uniformity and to avoid potential errors
          slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
        } else {
          slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
          partial_l1_count++;
        }
        if (length <= *allocated || slot_val == all_slot_clear) {
          break;
        }
        free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH);
      } while (free_pos < bits_per_slot);
    }
  }
  return _is_empty_l1(l1_pos_start, l1_pos_end);
}
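
// Histograms the free space visible at L0: every maximal run of free bits is
// accounted in bins_overall, keyed by cbits(run_length) - 1, i.e. roughly the
// log2 of the run length expressed in L0 units.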
void AllocatorLevel01Loose::collect_stats(
  std::map<size_t, size_t>& bins_overall)
{
  size_t free_seq_cnt = 0;
  for (auto slot : l0) {
    if (slot == all_slot_set) {
      free_seq_cnt += L0_ENTRIES_PER_SLOT;
    } else if(slot != all_slot_clear) {
      size_t pos = 0;
      do {
        auto pos1 = find_next_set_bit(slot, pos);
        if (pos1 == pos) {
          free_seq_cnt++;
          pos = pos1 + 1;
        } else {
          if (free_seq_cnt) {
            bins_overall[cbits(free_seq_cnt) - 1]++;
            free_seq_cnt = 0;
          }
          if (pos1 < bits_per_slot) {
            free_seq_cnt = 1;
          }
          pos = pos1 + 1;
        }
      } while (pos < bits_per_slot);
    } else if (free_seq_cnt) {
      bins_overall[cbits(free_seq_cnt) - 1]++;
      free_seq_cnt = 0;
    }
  }
  if (free_seq_cnt) {
    bins_overall[cbits(free_seq_cnt) - 1]++;
  }
}
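
// count_0s/count_1s return the length of the run of clear (respectively set)
// bits in slot_val starting at start_pos. Illustrative example (not from the
// original source): count_0s(0b1100, 0) == 2 and count_1s(0b0011, 0) == 2.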
inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos)
{
#ifdef __GNUC__
  size_t pos = __builtin_ffsll(slot_val >> start_pos);
  if (pos == 0)
    return sizeof(slot_t)*8 - start_pos;
  return pos - 1;
#else
  size_t pos = start_pos;
  slot_t mask = slot_t(1) << pos;
  while (pos < bits_per_slot && (slot_val & mask) == 0) {
    mask <<= 1;
    pos++;
  }
  return pos - start_pos;
#endif
}
inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos)
{
  return count_0s(~slot_val, start_pos);
}
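
// Reports every free extent to the supplied callback; offsets and lengths are
// expressed in L0 entry units as computed below, and runs spanning slot and
// slot-set boundaries are merged before notification.
// Hypothetical usage sketch (not part of this file):
//   allocator.dump([](uint64_t off, uint64_t len) {
//     std::cout << "free " << off << "~" << len << "\n";
//   });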
void AllocatorLevel01Loose::dump(
  std::function<void(uint64_t offset, uint64_t length)> notify)
{
  size_t len = 0;
  size_t off = 0;
  for (size_t i = 0; i < l1.size(); i++)
  {
    for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH)
    {
      size_t w = (l1[i] >> j) & L1_ENTRY_MASK;
      switch (w) {
        case L1_ENTRY_FULL:
          if (len > 0) {
            notify(off, len);
            len = 0;
          }
          break;
        case L1_ENTRY_FREE:
          if (len == 0)
            off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot;
          len += bits_per_slotset;
          break;
        case L1_ENTRY_PARTIAL:
          size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset;
          for (size_t t = 0; t < slots_per_slotset; t++) {
            size_t p = 0;
            slot_t allocation_pattern = l0[pos + t];
            while (p < bits_per_slot) {
              if (len == 0) {
                //continue to skip allocated space, meaning bits set to 0
                ssize_t alloc_count = count_0s(allocation_pattern, p);
                p += alloc_count;
                //now we are switched to expecting free space
                if (p < bits_per_slot) {
                  ssize_t free_count = count_1s(allocation_pattern, p);
                  assert(free_count > 0);
                  len = free_count;
                  off = (pos + t) * bits_per_slot + p;
                  p += free_count;
                }
              } else {
                //continue free region
                ssize_t free_count = count_1s(allocation_pattern, p);
                if (free_count == 0) {
                  notify(off, len);
                  len = 0;
                } else {
                  p += free_count;
                  len += free_count;
                }
              }
            }
          }
          break;
      }
    }
  }
  if (len > 0)
    notify(off, len);
}
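
// _claim_free_to_left_l0/_claim_free_to_right_l0 greedily claim (mark as
// allocated) every free L0 entry adjacent to l0_pos_start, moving left or
// right respectively, and return the boundary position of the claimed run.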
uint64_t AllocatorLevel01Loose::_claim_free_to_left_l0(int64_t l0_pos_start)
{
  int64_t d0 = L0_ENTRIES_PER_SLOT;

  int64_t pos = l0_pos_start - 1;
  slot_t bits = (slot_t)1 << (pos % d0);
  int64_t idx = pos / d0;
  slot_t* val_s = l0.data() + idx;

  int64_t pos_e = p2align<int64_t>(pos, d0);

  while (pos >= pos_e) {
    if (0 == ((*val_s) & bits))
      return pos + 1;
    (*val_s) &= ~bits;
    bits >>= 1;
    --pos;
  }
  --idx;
  val_s = l0.data() + idx;
  while (idx >= 0 && (*val_s) == all_slot_set) {
    *val_s = all_slot_clear;
    pos -= d0;
    --idx;
    val_s = l0.data() + idx;
  }

  if (idx >= 0 &&
      (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
    int64_t pos_e = p2align<int64_t>(pos, d0);
    slot_t bits = (slot_t)1 << (pos % d0);
    while (pos >= pos_e) {
      if (0 == ((*val_s) & bits))
        return pos + 1;
      (*val_s) &= ~bits;
      bits >>= 1;
      --pos;
    }
  }
  return pos + 1;
}
uint64_t AllocatorLevel01Loose::_claim_free_to_right_l0(int64_t l0_pos_start)
{
  auto d0 = L0_ENTRIES_PER_SLOT;

  int64_t pos = l0_pos_start;
  slot_t bits = (slot_t)1 << (pos % d0);
  size_t idx = pos / d0;
  slot_t* val_s = l0.data() + idx;

  int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);

  while (pos < pos_e) {
    if (0 == ((*val_s) & bits))
      return pos;
    (*val_s) &= ~bits;
    bits <<= 1;
    ++pos;
  }
  ++idx;
  val_s = l0.data() + idx;
  while (idx < l0.size() && (*val_s) == all_slot_set) {
    *val_s = all_slot_clear;
    pos += d0;
    ++idx;
    val_s = l0.data() + idx;
  }

  if (idx < l0.size() &&
      (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
    int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
    slot_t bits = (slot_t)1 << (pos % d0);
    while (pos < pos_e) {
      if (0 == ((*val_s) & bits))
        return pos;