ceph/src/os/bluestore/fastbmap_allocator_impl.cc (ceph Nautilus 14.2.1)

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Bitmap based in-memory allocator implementation.
 * Author: Igor Fedotov, ifedotov@suse.com
 *
 */

#include "fastbmap_allocator_impl.h"

uint64_t AllocatorLevel::l0_dives = 0;
uint64_t AllocatorLevel::l0_iterations = 0;
uint64_t AllocatorLevel::l0_inner_iterations = 0;
uint64_t AllocatorLevel::alloc_fragments = 0;
uint64_t AllocatorLevel::alloc_fragments_fast = 0;
uint64_t AllocatorLevel::l2_allocs = 0;

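// Trim a free interval so that both its start and its length are aligned to
// min_length units; an empty interval is returned if nothing aligned remains.
// Worked example (values chosen for illustration only): with offset = 5,
// len = 12 and min_length = 4, the start rounds up to 8 (consuming 3 units)
// and the remaining 9 units round down to 8, giving { offset = 8, length = 8 }.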
inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
{
  interval_t res;
  if (len >= min_length) {
    res.offset = p2roundup(offset, min_length);
    auto delta_off = res.offset - offset;
    if (len > delta_off) {
      res.length = len - delta_off;
      res.length = p2align<uint64_t>(res.length, min_length);
      if (res.length) {
        return res;
      }
    }
  }
  return interval_t();
}

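// Scan the L0 bitmap over [pos0, pos1) (positions in l0_granularity units)
// and return the longest free run that still has a non-zero length after
// being trimmed to min_length alignment via _align2units().
// *tail is both input and output: on entry it may carry a free run from the
// previously scanned range that ends at pos0 (so runs spanning slotset
// boundaries can be merged), and on return it holds the free run, if any,
// that touches pos1. The returned interval and *tail are converted to bytes.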
interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
  uint64_t pos1, uint64_t min_length, interval_t* tail) const
{
  interval_t res;
  if (pos0 >= pos1) {
    return res;
  }
  auto pos = pos0;

  interval_t res_candidate;
  if (tail->length != 0) {
    ceph_assert((tail->offset % l0_granularity) == 0);
    ceph_assert((tail->length % l0_granularity) == 0);
    res_candidate.offset = tail->offset / l0_granularity;
    res_candidate.length = tail->length / l0_granularity;
  }
  *tail = interval_t();

  auto d = bits_per_slot;
  slot_t bits = l0[pos / d];
  bits >>= pos % d;
  bool end_loop = false;
  auto min_granules = min_length / l0_granularity;

  do {
    if ((pos % d) == 0) {
      bits = l0[pos / d];
      if (pos1 - pos >= d) {
        switch(bits) {
        case all_slot_set:
          // slot is totally free
          if (!res_candidate.length) {
            res_candidate.offset = pos;
          }
          res_candidate.length += d;
          pos += d;
          end_loop = pos >= pos1;
          if (end_loop) {
            *tail = res_candidate;
            res_candidate = _align2units(res_candidate.offset,
              res_candidate.length, min_granules);
            if (res.length < res_candidate.length) {
              res = res_candidate;
            }
          }
          continue;
        case all_slot_clear:
          // slot is totally allocated
          res_candidate = _align2units(res_candidate.offset,
            res_candidate.length, min_granules);
          if (res.length < res_candidate.length) {
            res = res_candidate;
          }
          res_candidate = interval_t();
          pos += d;
          end_loop = pos >= pos1;
          continue;
        }
      }
    } //if ((pos % d) == 0)

    end_loop = ++pos >= pos1;
    if (bits & 1) {
      // item is free
      if (!res_candidate.length) {
        res_candidate.offset = pos - 1;
      }
      ++res_candidate.length;
      if (end_loop) {
        *tail = res_candidate;
        res_candidate = _align2units(res_candidate.offset,
          res_candidate.length, min_granules);
        if (res.length < res_candidate.length) {
          res = res_candidate;
        }
      }
    } else {
      res_candidate = _align2units(res_candidate.offset,
        res_candidate.length, min_granules);
      if (res.length < res_candidate.length) {
        res = res_candidate;
      }
      res_candidate = interval_t();
    }
    bits >>= 1;
  } while (!end_loop);
  res.offset *= l0_granularity;
  res.length *= l0_granularity;
  tail->offset *= l0_granularity;
  tail->length *= l0_granularity;
  return res;
}

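// Walk the 2-bit L1 entries in [pos_start, pos_end) and fill *ctx with
// allocation candidates:
//  - free_l1_pos/free_count: a run of completely free L1 entries; the first
//    run long enough to be usable after min_length alignment is kept;
//  - affordable_offs/affordable_len: the best-fit (smallest sufficient) free
//    extent inside a partial slotset that can hold the full 'length';
//  - min_affordable_offs/min_affordable_len: the best-fit extent that can
//    hold at least 'min_length', trimmed to a min_length multiple.
// Depending on 'mode' the scan may stop at the first free entry
// (STOP_ON_EMPTY) or the first partial one (STOP_ON_PARTIAL); otherwise
// ctx->fully_processed is set once the whole range has been examined.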
void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
  uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
  search_ctx_t* ctx)
{
  auto d = CHILD_PER_SLOT;
  ceph_assert((pos_start % d) == 0);
  ceph_assert((pos_end % d) == 0);

  uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0;

  uint64_t l1_pos = pos_start;
  const interval_t empty_tail;
  interval_t prev_tail;

  uint64_t next_free_l1_pos = 0;
  for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
    slot_t slot_val = l1[pos];
    // FIXME minor: code below can be optimized to check slot_val against
    // all_slot_set(_clear) value

    for (auto c = 0; c < d; c++) {
      switch (slot_val & L1_ENTRY_MASK) {
      case L1_ENTRY_FREE:
        prev_tail = empty_tail;
        if (!ctx->free_count) {
          ctx->free_l1_pos = l1_pos;
        } else if (l1_pos != next_free_l1_pos) {
          auto o = ctx->free_l1_pos * l1_granularity;
          auto l = ctx->free_count * l1_granularity;
          // check if already found extent fits min_length after alignment
          if (_align2units(o, l, min_length).length >= min_length) {
            break;
          }
          // if not - proceed with the next one
          ctx->free_l1_pos = l1_pos;
          ctx->free_count = 0;
        }
        next_free_l1_pos = l1_pos + 1;
        ++ctx->free_count;
        if (mode == STOP_ON_EMPTY) {
          return;
        }
        break;
      case L1_ENTRY_FULL:
        prev_tail = empty_tail;
        break;
      case L1_ENTRY_PARTIAL:
        interval_t longest;
        ++ctx->partial_count;

        longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);

        if (longest.length >= length) {
          if ((ctx->affordable_len == 0) ||
              ((ctx->affordable_len != 0) &&
               (longest.length < ctx->affordable_len))) {
            ctx->affordable_len = longest.length;
            ctx->affordable_offs = longest.offset;
          }
        }
        if (longest.length >= min_length &&
            (ctx->min_affordable_len == 0 ||
             (longest.length < ctx->min_affordable_len))) {

          ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
          ctx->min_affordable_offs = longest.offset;
        }
        if (mode == STOP_ON_PARTIAL) {
          return;
        }
        break;
      }
      slot_val >>= L1_ENTRY_WIDTH;
      ++l1_pos;
    }
  }
  ctx->fully_processed = true;
}

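// Rebuild the L1 entries covering the L0 range [l0_pos, l0_pos_end).
// For each slotset in the range the L0 slots are inspected and the matching
// 2-bit L1 entry is set to FREE, FULL or PARTIAL; unalloc_l1_count and
// partial_l1_count are adjusted for the old and new entry values.
// The range must be aligned to slotset boundaries.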
void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
{
  if (l0_pos == l0_pos_end) {
    return;
  }
  auto d0 = bits_per_slotset;
  uint64_t l1_w = CHILD_PER_SLOT;
  // this should be aligned with slotset boundaries
  ceph_assert(0 == (l0_pos % d0));
  ceph_assert(0 == (l0_pos_end % d0));

  int64_t idx = l0_pos / bits_per_slot;
  int64_t idx_end = l0_pos_end / bits_per_slot;
  slot_t mask_to_apply = L1_ENTRY_NOT_USED;

  auto l1_pos = l0_pos / d0;

  while (idx < idx_end) {
    if (l0[idx] == all_slot_clear) {
      // if not all prev slots are allocated then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FULL;
      } else if (mask_to_apply != L1_ENTRY_FULL) {
        idx = p2roundup(idx, int64_t(slotset_width));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else if (l0[idx] == all_slot_set) {
      // if not all prev slots are free then no need to check the
      // current slot set, it's partial
      ++idx;
      if (mask_to_apply == L1_ENTRY_NOT_USED) {
        mask_to_apply = L1_ENTRY_FREE;
      } else if (mask_to_apply != L1_ENTRY_FREE) {
        idx = p2roundup(idx, int64_t(slotset_width));
        mask_to_apply = L1_ENTRY_PARTIAL;
      }
    } else {
      // no need to check the current slot set, it's partial
      mask_to_apply = L1_ENTRY_PARTIAL;
      ++idx;
      idx = p2roundup(idx, int64_t(slotset_width));
    }
    if ((idx % slotset_width) == 0) {
      ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
      uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
      slot_t& slot_val = l1[l1_pos / l1_w];
      auto mask = slot_t(L1_ENTRY_MASK) << shift;

      slot_t old_mask = (slot_val & mask) >> shift;
      switch(old_mask) {
      case L1_ENTRY_FREE:
        unalloc_l1_count--;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count--;
        break;
      }
      slot_val &= ~mask;
      slot_val |= slot_t(mask_to_apply) << shift;
      switch(mask_to_apply) {
      case L1_ENTRY_FREE:
        unalloc_l1_count++;
        break;
      case L1_ENTRY_PARTIAL:
        partial_l1_count++;
        break;
      }
      mask_to_apply = L1_ENTRY_NOT_USED;
      ++l1_pos;
    }
  }
}

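// Mark the L0 range [l0_pos_start, l0_pos_end) as allocated by clearing its
// bits: a possibly partial head slot is handled bit by bit, fully covered
// slots are zeroed in a single assignment, and the partial tail slot is
// again handled bit by bit. Illustrative example, assuming 64-bit slots
// (slot_t defined as a 64-bit type in the header): clearing [3, 130) touches
// bits 3..63 of slot 0 individually, zeroes slot 1 wholesale, then clears
// bits 0..1 of slot 2.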
void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
  int64_t l0_pos_end)
{
  auto d0 = CHILD_PER_SLOT_L0;

  int64_t pos = l0_pos_start;
  slot_t bits = (slot_t)1 << (l0_pos_start % d0);

  while (pos < std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start, d0))) {
    l0[pos / d0] &= ~bits;
    bits <<= 1;
    pos++;
  }

  while (pos < std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0))) {
    l0[pos / d0] = all_slot_clear;
    pos += d0;
  }
  bits = 1;
  while (pos < l0_pos_end) {
    l0[pos / d0] &= ~bits;
    bits <<= 1;
    pos++;
  }
}

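// Try to carve a single contiguous extent of up to 'length' bytes out of the
// L1 range [pos_start, pos_end), choosing the search strategy by request
// size:
//  - length <= l0_granularity: best-fit into a partial slotset, otherwise
//    take the start of a free slotset;
//  - length == l1_granularity: prefer a run of free L1 entries, then a
//    best-fit extent from a partial slotset, then any extent of at least
//    min_length;
//  - larger requests: prefer a best-fit extent from a partial slotset, then
//    a min_length-aligned piece of a free L1 run, then any extent of at
//    least min_length.
// Returns an empty interval if nothing suitable was found.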
interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t pos_start, uint64_t pos_end)
{
  interval_t res = { 0, 0 };
  uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0;

  if (unlikely(length <= l0_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
      STOP_ON_PARTIAL, &ctx);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      // allocate as specified
      ceph_assert(ctx.affordable_len >= length);
      auto pos = ctx.affordable_offs / l0_granularity;
      _mark_alloc_l1_l0(pos, pos + 1);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }

    // allocate from free slot sets
    if (ctx.free_count) {
      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
      return res;
    }
  } else if (unlikely(length == l1_granularity)) {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);

    // allocate using contiguous extent found at l1 if any
    if (ctx.free_count) {

      auto l = std::min(length, ctx.free_count * l1_granularity);
      ceph_assert((l % l0_granularity) == 0);
      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;

      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
      res = interval_t(ctx.free_l1_pos * l1_granularity, l);

      return res;
    }

    // we can terminate earlier on free entry only
    ceph_assert(ctx.fully_processed);

    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  } else {
    search_ctx_t ctx;
    _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
    ceph_assert(ctx.fully_processed);
    // check partially free slot sets first (including neighboring),
    // full length match required.
    if (ctx.affordable_len) {
      ceph_assert(ctx.affordable_len >= length);
      ceph_assert((length % l0_granularity) == 0);
      auto pos_start = ctx.affordable_offs / l0_granularity;
      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      res = interval_t(ctx.affordable_offs, length);
      return res;
    }
    // allocate using contiguous extent found at l1 if affordable
    // align allocated extent with min_length
    if (ctx.free_count) {
      auto o = ctx.free_l1_pos * l1_granularity;
      auto l = ctx.free_count * l1_granularity;
      interval_t aligned_extent = _align2units(o, l, min_length);
      if (aligned_extent.length > 0) {
        aligned_extent.length = std::min(length,
          uint64_t(aligned_extent.length));
        ceph_assert((aligned_extent.offset % l0_granularity) == 0);
        ceph_assert((aligned_extent.length % l0_granularity) == 0);

        auto pos_start = aligned_extent.offset / l0_granularity;
        auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;

        _mark_alloc_l1_l0(pos_start, pos_end);
        return aligned_extent;
      }
    }
    if (ctx.min_affordable_len) {
      auto pos_start = ctx.min_affordable_offs / l0_granularity;
      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
      _mark_alloc_l1_l0(pos_start, pos_end);
      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
    }
  }
  return res;
}

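// Allocate up to 'length' bytes from the L1 range [l1_pos_start, l1_pos_end),
// appending the resulting extents to *res and adding their total to
// *allocated. When min_length differs from l0_granularity the allocation is
// built from contiguous chunks via _allocate_l1_contiguous(); otherwise the
// L1 slots are scanned directly and free/partial children are drained through
// _allocate_l0(). Returns true if the scanned range ends up with no free
// space left, which lets the caller update the level above.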
bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
  uint64_t min_length, uint64_t max_length,
  uint64_t l1_pos_start, uint64_t l1_pos_end,
  uint64_t* allocated,
  interval_vector_t* res)
{
  uint64_t d0 = CHILD_PER_SLOT_L0;
  uint64_t d1 = CHILD_PER_SLOT;

  ceph_assert(0 == (l1_pos_start % (slotset_width * d1)));
  ceph_assert(0 == (l1_pos_end % (slotset_width * d1)));
  if (min_length != l0_granularity) {
    // probably not the most efficient way but
    // don't care much about that at the moment
    bool has_space = true;
    while (length > *allocated && has_space) {
      interval_t i =
        _allocate_l1_contiguous(length - *allocated, min_length, max_length,
          l1_pos_start, l1_pos_end);
      if (i.length == 0) {
        has_space = false;
      } else {
        _fragment_and_emplace(max_length, i.offset, i.length, res);
        *allocated += i.length;
      }
    }
  } else {
    uint64_t l0_w = slotset_width * d0;

    for (auto idx = l1_pos_start / d1;
         idx < l1_pos_end / d1 && length > *allocated;
         ++idx) {
      slot_t& slot_val = l1[idx];
      if (slot_val == all_slot_clear) {
        continue;
      } else if (slot_val == all_slot_set) {
        uint64_t to_alloc = std::min(length - *allocated,
          l1_granularity * d1);
        *allocated += to_alloc;
        ++alloc_fragments_fast;
        _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
          res);
        _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
          idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
        continue;
      }
      auto free_pos = find_next_set_bit(slot_val, 0);
      ceph_assert(free_pos < bits_per_slot);
      do {
        ceph_assert(length > *allocated);

        bool empty;
        empty = _allocate_l0(length, max_length,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
          (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
          allocated,
          res);

        auto mask = slot_t(L1_ENTRY_MASK) << free_pos;

        slot_t old_mask = (slot_val & mask) >> free_pos;
        switch(old_mask) {
        case L1_ENTRY_FREE:
          unalloc_l1_count--;
          break;
        case L1_ENTRY_PARTIAL:
          partial_l1_count--;
          break;
        }
        slot_val &= ~mask;
        if (empty) {
          // the next line is a no-op with the current L1_ENTRY_FULL value,
          // but is kept for the sake of uniformity and to avoid potential
          // errors in the future
          slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
        } else {
          slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
          partial_l1_count++;
        }
        if (length <= *allocated || slot_val == all_slot_clear) {
          break;
        }
        free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH);
      } while (free_pos < bits_per_slot);
    }
  }
  return _is_empty_l1(l1_pos_start, l1_pos_end);
}

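// Build a histogram of free extent sizes: every maximal run of set (free) L0
// bits of length N increments bins_overall[cbits(N) - 1], i.e. runs are
// grouped into power-of-two bins. For example, a free run of 6 granules goes
// into bin cbits(6) - 1 = 2, the bucket covering runs of 4..7 granules.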
void AllocatorLevel01Loose::collect_stats(
  std::map<size_t, size_t>& bins_overall)
{
  size_t free_seq_cnt = 0;
  for (auto slot : l0) {
    if (slot == all_slot_set) {
      free_seq_cnt += CHILD_PER_SLOT_L0;
    } else if (slot != all_slot_clear) {
      size_t pos = 0;
      do {
        auto pos1 = find_next_set_bit(slot, pos);
        if (pos1 == pos) {
          free_seq_cnt++;
          pos = pos1 + 1;
        } else {
          if (free_seq_cnt) {
            bins_overall[cbits(free_seq_cnt) - 1]++;
            free_seq_cnt = 0;
          }
          if (pos1 < bits_per_slot) {
            free_seq_cnt = 1;
          }
          pos = pos1 + 1;
        }
      } while (pos < bits_per_slot);
    } else if (free_seq_cnt) {
      bins_overall[cbits(free_seq_cnt) - 1]++;
      free_seq_cnt = 0;
    }
  }
  if (free_seq_cnt) {
    bins_overall[cbits(free_seq_cnt) - 1]++;
  }
}