// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#include "fastbmap_allocator_impl.h"

uint64_t AllocatorLevel::l0_dives = 0;
uint64_t AllocatorLevel::l0_iterations = 0;
uint64_t AllocatorLevel::l0_inner_iterations = 0;
uint64_t AllocatorLevel::alloc_fragments = 0;
uint64_t AllocatorLevel::alloc_fragments_fast = 0;
uint64_t AllocatorLevel::l2_allocs = 0;

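// Trim [offset, offset + len) to the largest sub-interval whose start is
// rounded up to min_length and whose length is a multiple of min_length;
// returns an empty interval_t if nothing aligned fits.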
inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
{
  interval_t res;
  if (len >= min_length) {
    res.offset = p2roundup(offset, min_length);
    auto delta_off = res.offset - offset;
    if (len > delta_off) {
      res.length = len - delta_off;
      res.length = p2align<uint64_t>(res.length, min_length);
      if (res.length) {
        return res;
      }
    }
  }
  return interval_t();
}

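// Scan l0 bits in [pos0, pos1) and return (in bytes) the longest free extent
// that still satisfies min_length after alignment. *tail chains free runs
// across adjacent ranges: on entry it seeds the current candidate, on exit it
// holds the free run (if any) that touches the end of the scanned range.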
interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
  uint64_t pos1, uint64_t min_length, interval_t* tail) const
{
  interval_t res;
  if (pos0 >= pos1) {
    return res;
  }
  auto pos = pos0;

  interval_t res_candidate;
  if (tail->length != 0) {
    ceph_assert((tail->offset % l0_granularity) == 0);
    ceph_assert((tail->length % l0_granularity) == 0);
    res_candidate.offset = tail->offset / l0_granularity;
    res_candidate.length = tail->length / l0_granularity;
  }
  *tail = interval_t();

  auto d = bits_per_slot;
  slot_t bits = l0[pos / d];
  bits >>= pos % d;
  bool end_loop = false;
  auto min_granules = min_length / l0_granularity;

  do {
    if ((pos % d) == 0) {
      bits = l0[pos / d];
      if (pos1 - pos >= d) {
        switch(bits) {
        case all_slot_set:
          // slot is totally free
          if (!res_candidate.length) {
            res_candidate.offset = pos;
          }
          res_candidate.length += d;
          pos += d;
          end_loop = pos >= pos1;
          if (end_loop) {
            *tail = res_candidate;
            res_candidate = _align2units(res_candidate.offset,
              res_candidate.length, min_granules);
            if (res.length < res_candidate.length) {
              res = res_candidate;
            }
          }
          continue;
        case all_slot_clear:
          // slot is totally allocated
          res_candidate = _align2units(res_candidate.offset,
            res_candidate.length, min_granules);
          if (res.length < res_candidate.length) {
            res = res_candidate;
          }
          res_candidate = interval_t();
          pos += d;
          end_loop = pos >= pos1;
          continue;
        }
      }
    } //if ((pos % d) == 0)

    end_loop = ++pos >= pos1;
    if (bits & 1) {
      // item is free
      if (!res_candidate.length) {
        res_candidate.offset = pos - 1;
      }
      ++res_candidate.length;
      if (end_loop) {
        *tail = res_candidate;
        res_candidate = _align2units(res_candidate.offset,
          res_candidate.length, min_granules);
        if (res.length < res_candidate.length) {
          res = res_candidate;
        }
      }
    } else {
      res_candidate = _align2units(res_candidate.offset,
        res_candidate.length, min_granules);
      if (res.length < res_candidate.length) {
        res = res_candidate;
      }
      res_candidate = interval_t();
    }
    bits >>= 1;
  } while (!end_loop);
  res.offset *= l0_granularity;
  res.length *= l0_granularity;
  tail->offset *= l0_granularity;
  tail->length *= l0_granularity;
  return res;
}

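// Walk l1 entries in [pos_start, pos_end) and fill *ctx with allocation
// candidates: a run of fully free l1 entries (free_l1_pos / free_count), the
// smallest partial extent able to satisfy 'length' (affordable_*), and the
// smallest extent of at least min_length (min_affordable_*). 'mode' allows an
// early return on the first free entry (STOP_ON_EMPTY) or the first partial
// entry (STOP_ON_PARTIAL).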
void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
  uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
  search_ctx_t* ctx)
{
  auto d = CHILD_PER_SLOT;
  ceph_assert((pos_start % d) == 0);
  ceph_assert((pos_end % d) == 0);

  uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0;

  uint64_t l1_pos = pos_start;
  const interval_t empty_tail;
  interval_t prev_tail;

  uint64_t next_free_l1_pos = 0;
  for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
    slot_t slot_val = l1[pos];
    // FIXME minor: code below can be optimized to check slot_val against
    // all_slot_set(_clear) value

    for (auto c = 0; c < d; c++) {
      switch (slot_val & L1_ENTRY_MASK) {
      case L1_ENTRY_FREE:
        prev_tail = empty_tail;
        if (!ctx->free_count) {
          ctx->free_l1_pos = l1_pos;
        } else if (l1_pos != next_free_l1_pos) {
          auto o = ctx->free_l1_pos * l1_granularity;
          auto l = ctx->free_count * l1_granularity;
          // check if already found extent fits min_length after alignment
          if (_align2units(o, l, min_length).length >= min_length) {
            break;
          }
          // if not - proceed with the next one
          ctx->free_l1_pos = l1_pos;
          ctx->free_count = 0;
        }
        next_free_l1_pos = l1_pos + 1;
        ++ctx->free_count;
        if (mode == STOP_ON_EMPTY) {
          return;
        }
        break;
      case L1_ENTRY_FULL:
        prev_tail = empty_tail;
        break;
      case L1_ENTRY_PARTIAL:
        interval_t longest;
        ++ctx->partial_count;

        longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);

        if (longest.length >= length) {
          if ((ctx->affordable_len == 0) ||
              ((ctx->affordable_len != 0) &&
                (longest.length < ctx->affordable_len))) {
            ctx->affordable_len = longest.length;
            ctx->affordable_offs = longest.offset;
          }
        }
        if (longest.length >= min_length &&
            (ctx->min_affordable_len == 0 ||
              (longest.length < ctx->min_affordable_len))) {

          ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
          ctx->min_affordable_offs = longest.offset;
        }
        if (mode == STOP_ON_PARTIAL) {
          return;
        }
        break;
      }
      slot_val >>= L1_ENTRY_WIDTH;
      ++l1_pos;
    }
  }
  ctx->fully_processed = true;
}

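// Recalculate the l1 entries covering l0 range [l0_pos, l0_pos_end) after the
// underlying l0 bits have changed: each slotset is reclassified as FREE, FULL
// or PARTIAL, and unalloc_l1_count / partial_l1_count are kept in sync.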
void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
{
  if (l0_pos == l0_pos_end) {
    return;
  }
  auto d0 = bits_per_slotset;
  uint64_t l1_w = CHILD_PER_SLOT;
  // this should be aligned with slotset boundaries
  ceph_assert(0 == (l0_pos % d0));
  ceph_assert(0 == (l0_pos_end % d0));

  int64_t idx = l0_pos / bits_per_slot;
  int64_t idx_end = l0_pos_end / bits_per_slot;
  slot_t mask_to_apply = L1_ENTRY_NOT_USED;

  auto l1_pos = l0_pos / d0;

  while (idx < idx_end) {
    if (l0[idx] == all_slot_clear) {
      // if not all prev slots are allocated then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FULL;
      } else if (mask_to_apply != L1_ENTRY_FULL) {
        idx = p2roundup(idx, int64_t(slotset_width));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else if (l0[idx] == all_slot_set) {
      // if not all prev slots are free then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FREE;
      } else if (mask_to_apply != L1_ENTRY_FREE) {
        idx = p2roundup(idx, int64_t(slotset_width));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else {
      // no need to check the current slot set, it's partial
      mask_to_apply = L1_ENTRY_PARTIAL;
      ++idx;
      idx = p2roundup(idx, int64_t(slotset_width));
    }
    if ((idx % slotset_width) == 0) {
      ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
      uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
      slot_t& slot_val = l1[l1_pos / l1_w];
      auto mask = slot_t(L1_ENTRY_MASK) << shift;

      slot_t old_mask = (slot_val & mask) >> shift;
      switch(old_mask) {
      case L1_ENTRY_FREE:
        unalloc_l1_count--;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count--;
        break;
      }
      slot_val &= ~mask;
      slot_val |= slot_t(mask_to_apply) << shift;
      switch(mask_to_apply) {
      case L1_ENTRY_FREE:
        unalloc_l1_count++;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count++;
        break;
      }
      mask_to_apply = L1_ENTRY_NOT_USED;
      ++l1_pos;
    }
  }
}

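// Clear (mark as allocated) the l0 bits in [l0_pos_start, l0_pos_end): the
// partially covered head and tail slots bit by bit, the fully covered slots
// in whole-slot steps.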
void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
  int64_t l0_pos_end)
{
  auto d0 = CHILD_PER_SLOT_L0;

  int64_t pos = l0_pos_start;
  slot_t bits = (slot_t)1 << (l0_pos_start % d0);
  slot_t* val_s = &l0[pos / d0];
  int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0));
  while (pos < pos_e) {
    (*val_s) &= ~bits;
    bits <<= 1;
    pos++;
  }
  pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
  while (pos < pos_e) {
    *(++val_s) = all_slot_clear;
    pos += d0;
  }
  bits = 1;
  ++val_s;
  while (pos < l0_pos_end) {
    (*val_s) &= ~bits;
    bits <<= 1;
    pos++;
  }
}

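// Allocate a single contiguous extent of up to 'length' bytes (and no less
// than min_length after alignment) within l1 range [pos_start, pos_end).
// The search strategy depends on whether the request is below, equal to, or
// above l1_granularity; an empty interval is returned when nothing suitable
// is found.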
interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t pos_start, uint64_t pos_end)
{
  interval_t res = { 0, 0 };
  uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0;

  if (unlikely(length <= l0_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
      STOP_ON_PARTIAL, &ctx);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      // allocate as specified
      ceph_assert(ctx.affordable_len >= length);
      auto pos = ctx.affordable_offs / l0_granularity;
      _mark_alloc_l1_l0(pos, pos + 1);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }

    // allocate from free slot sets
    if (ctx.free_count) {
      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
      return res;
    }
  } else if (unlikely(length == l1_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);

    // allocate using contiguous extent found at l1 if any
    if (ctx.free_count) {

      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);

      return res;
    }

    // we can terminate earlier on free entry only
    ceph_assert(ctx.fully_processed);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  } else {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
    ceph_assert(ctx.fully_processed);
    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    // allocate using contiguous extent found at l1 if affordable
    // align allocated extent with min_length
    if (ctx.free_count) {
      auto o = ctx.free_l1_pos * l1_granularity;
      auto l = ctx.free_count * l1_granularity;
      interval_t aligned_extent = _align2units(o, l, min_length);
      if (aligned_extent.length > 0) {
        aligned_extent.length = std::min(length,
          uint64_t(aligned_extent.length));
        ceph_assert((aligned_extent.offset % l0_granularity) == 0);
        ceph_assert((aligned_extent.length % l0_granularity) == 0);

        auto pos_start = aligned_extent.offset / l0_granularity;
        auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;

        _mark_alloc_l1_l0(pos_start, pos_end);
        return aligned_extent;
      }
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  }
  return res;
}

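// Allocate up to 'length' bytes from l1 range [l1_pos_start, l1_pos_end),
// appending the resulting extents to *res and adding their total to
// *allocated. Falls back to repeated contiguous allocations when min_length
// exceeds l0_granularity, otherwise walks l1 slots directly. Returns
// _is_empty_l1() for the range, i.e. whether no free space remains in it.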
bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t l1_pos_start, uint64_t l1_pos_end,
  uint64_t* allocated,
  interval_vector_t* res)
{
  uint64_t d0 = CHILD_PER_SLOT_L0;
  uint64_t d1 = CHILD_PER_SLOT;

  ceph_assert(0 == (l1_pos_start % (slotset_width * d1)));
  ceph_assert(0 == (l1_pos_end % (slotset_width * d1)));
  if (min_length != l0_granularity) {
    // probably not the most efficient way but
    // don't care much about that at the moment
    bool has_space = true;
    while (length > *allocated && has_space) {
      interval_t i =
        _allocate_l1_contiguous(length - *allocated, min_length, max_length,
          l1_pos_start, l1_pos_end);
      if (i.length == 0) {
        has_space = false;
      } else {
        _fragment_and_emplace(max_length, i.offset, i.length, res);
        *allocated += i.length;
      }
    }
  } else {
    uint64_t l0_w = slotset_width * d0;

    for (auto idx = l1_pos_start / d1;
         idx < l1_pos_end / d1 && length > *allocated;
         ++idx) {
      slot_t& slot_val = l1[idx];
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(length - *allocated,
          l1_granularity * d1);
        *allocated += to_alloc;
        ++alloc_fragments_fast;
        _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
          res);
        _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
          idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
        continue;
      }
      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      do {
        ceph_assert(length > *allocated);

        bool empty;
        empty = _allocate_l0(length, max_length,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
          allocated,
          res);

        auto mask = slot_t(L1_ENTRY_MASK) << free_pos;

        slot_t old_mask = (slot_val & mask) >> free_pos;
        switch(old_mask) {
        case L1_ENTRY_FREE:
          unalloc_l1_count--;
          break;
        case L1_ENTRY_PARTIAL:
          partial_l1_count--;
          break;
        }
        slot_val &= ~mask;
        if (empty) {
          // the next line is a no-op with the current L1_ENTRY_FULL value but
          // is left as-is for the sake of uniformity and to avoid potential
          // errors in the future
          slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
        } else {
          slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
          partial_l1_count++;
        }
        if (length <= *allocated || slot_val == all_slot_clear) {
          break;
        }
        free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH);
      } while (free_pos < bits_per_slot);
    }
  }
  return _is_empty_l1(l1_pos_start, l1_pos_end);
}

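// Build a histogram of free extent sizes: every run of consecutive free l0
// bits contributes to bins_overall[cbits(run_length) - 1], i.e. bins are
// indexed by the position of the run length's highest set bit.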
void AllocatorLevel01Loose::collect_stats(
  std::map<size_t, size_t>& bins_overall)
{
  size_t free_seq_cnt = 0;
  for (auto slot : l0) {
    if (slot == all_slot_set) {
      free_seq_cnt += CHILD_PER_SLOT_L0;
    } else if (slot != all_slot_clear) {
      size_t pos = 0;
      do {
        auto pos1 = find_next_set_bit(slot, pos);
        if (pos1 == pos) {
          free_seq_cnt++;
          pos = pos1 + 1;
        } else {
          if (free_seq_cnt) {
            bins_overall[cbits(free_seq_cnt) - 1]++;
            free_seq_cnt = 0;
          }
          if (pos1 < bits_per_slot) {
            free_seq_cnt = 1;
          }
          pos = pos1 + 1;
        }
      } while (pos < bits_per_slot);
    } else if (free_seq_cnt) {
      bins_overall[cbits(free_seq_cnt) - 1]++;
      free_seq_cnt = 0;
    }
  }
  if (free_seq_cnt) {
    bins_overall[cbits(free_seq_cnt) - 1]++;
  }
}